def model_function(features, labels, mode):
    """tf.estimator model_fn for a LeNet classifier trained with KungFu.

    Args:
        features: input batch fed to the LeNet model.
        labels: integer class labels for `features`.
        mode: one of tf.estimator.ModeKeys (TRAIN / EVAL / PREDICT).

    Returns:
        A tf.estimator.EstimatorSpec for the given mode.

    Raises:
        RuntimeError: if FLAGS.kf_optimizer is not 'sync_sgd' or 'async_sgd'.

    NOTE(review): PREDICT mode is not handled, so `estimator_spec` would be
    unbound and this function would raise UnboundLocalError — confirm the
    estimator is never asked to predict, or add a PREDICT branch.
    """
    # get the model
    model = lenet()
    if mode == tf.estimator.ModeKeys.TRAIN:
        # pass the input through the model
        # NOTE(review): unlike the EVAL branch below, `training` is not set
        # here — presumably the model defaults to training behavior; confirm.
        logits = model(features)
        # cross-entropy loss, named so hooks/logging can find it by tensor name
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        tf.identity(loss, 'train_loss')
        # record the accuracy; accuracy[1] is the update op of the metric pair
        accuracy = tf.metrics.accuracy(labels=labels,
                                       predictions=tf.argmax(logits, axis=1))
        tf.identity(accuracy[1], name='train_accuracy')
        # use Adam to optimize; expose the learning rate under a stable name
        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
        tf.identity(FLAGS.learning_rate, name='learning_rate')
        # KungFu: Wrap the tf.train.optimizer with KungFu optimizers
        if FLAGS.kf_optimizer == 'sync_sgd':
            from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
            optimizer = SynchronousSGDOptimizer(optimizer)
        elif FLAGS.kf_optimizer == 'async_sgd':
            from kungfu.tensorflow.optimizers import PairAveragingOptimizer
            optimizer = PairAveragingOptimizer(optimizer)
        else:
            raise RuntimeError('Unknown kungfu optimizer')
        # create an estimator spec to optimize the loss
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.TRAIN,
            loss=loss,
            train_op=optimizer.minimize(loss,
                                        tf.train.get_or_create_global_step()))
    elif mode == tf.estimator.ModeKeys.EVAL:
        # pass the input through the model in inference mode
        logits = model(features, training=False)
        # get the cross-entropy loss
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
        # use the accuracy as a metric
        accuracy = tf.metrics.accuracy(labels=labels,
                                       predictions=tf.argmax(logits, axis=1))
        # create an estimator spec with the loss and accuracy
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL,
            loss=loss,
            eval_metric_ops={'accuracy': accuracy})
    return estimator_spec
def test_sync_sgd():
    """Minimize y = x^2 for two steps with a KungFu-wrapped TF1 SGD optimizer."""
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    opt = SynchronousSGDOptimizer(tf.train.GradientDescentOptimizer(0.1))
    train_op = opt.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Ensure every worker starts from identical weights.
        sess.run(BroadcastGlobalVariablesOp())
        for _step in range(2):
            sess.run(train_op)
def model_fn(features, labels, mode):
    """Estimator model_fn: single-layer perceptron trained with KungFu sync SGD."""
    output, predictions = slp(features['x'], 10)
    int_labels = tf.cast(labels, tf.int32)
    loss = tf.losses.sparse_softmax_cross_entropy(int_labels, output)
    metrics = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=predictions),
    }
    # Wrap plain SGD with the KungFu synchronous all-reduce optimizer.
    opt = SynchronousSGDOptimizer(tf.train.GradientDescentOptimizer(0.1))
    train_op = opt.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      train_op=train_op,
                                      eval_metric_ops=metrics)
def build_optimizer(shards, use_kungfu=True):
    """Return a GradientDescentOptimizer (lr=0.1), optionally KungFu-wrapped.

    NOTE(review): `shards` is accepted but not read anywhere in this body.
    """
    base = tf.train.GradientDescentOptimizer(0.1)
    if not use_kungfu:
        return base
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
    return SynchronousSGDOptimizer(base)
def sgd_example():
    """Run five SGD steps on y = x^2 and check x against the closed form.

    With learning rate lr, gradient descent on x^2 yields
    x_{k+1} = (1 - 2*lr) * x_k, so starting from x_0 = 1 the value after
    k+1 steps is (1 - 2*lr)^(k+1).
    """
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    lr = 0.1
    opt = SynchronousSGDOptimizer(tf.train.GradientDescentOptimizer(learning_rate=lr))
    train_step = opt.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for step in range(5):
            v, _ = sess.run([x, train_step])
            print('step %d, result: %f' % (step, v))
            expected = (1 - 2 * lr)**(step + 1)
            if abs(expected - v) > 1e-6:
                print('unexpected result: %f, want: %f' % (v, expected))
def test_sync_sgd():
    """Drive five eager training steps through a KungFu-wrapped Keras SGD."""
    x = tf.Variable(tf.ones([], tf.float32))
    opt = SynchronousSGDOptimizer(tf.keras.optimizers.SGD(0.1))

    @tf.function
    def run_step(x, opt, first_batch):
        _training_step(x, opt, first_batch)

    for batch_idx in range(5):
        # The first batch triggers the initial variable broadcast.
        result = run_step(x, opt, batch_idx == 0)
def get_kungfu_opt(kungfu_option, opt):
    """Wrap `opt` in the KungFu distributed optimizer selected by `kungfu_option`.

    Raises:
        RuntimeError: if `kungfu_option` is not a known KUNGFU choice.
    """
    from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                              SynchronousAveragingOptimizer,
                                              SynchronousSGDOptimizer)
    if kungfu_option == KUNGFU.Sync_sgd:
        return SynchronousSGDOptimizer(opt)
    if kungfu_option == KUNGFU.Sync_avg:
        return SynchronousAveragingOptimizer(opt)
    if kungfu_option == KUNGFU.Pair_avg:
        return PairAveragingOptimizer(opt)
    raise RuntimeError('Unknown distributed training optimizer.')
def build_optimizer(name, n_workers=1):
    """Build a worker-scaled Keras SGD wrapped by the named KungFu optimizer.

    NOTE(review): `learning_rate` is read from enclosing/module scope here
    (sibling builders define it locally) — confirm it is defined at the call
    site.

    Raises:
        RuntimeError: for an unrecognized `name`.
    """
    # Linear scaling rule: scale the base learning rate by the worker count.
    sgd = tf.keras.optimizers.SGD(learning_rate=(learning_rate * n_workers))
    # KUNGFU: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        return SynchronousSGDOptimizer(sgd, use_locking=True)
    if name == 'async-sgd':
        return PairAveragingOptimizer(sgd)
    if name == 'sma':
        return SynchronousAveragingOptimizer(sgd)
    raise RuntimeError('unknown optimizer: %s' % name)
def _build_optimizer(self, name):
    """Create a TF1 optimizer by name ('adam', 'rmsp', anything else → SGD),
    wrapping it with KungFu synchronous SGD when self._use_kungfu is set."""
    learning_rate = 1e-3
    decay_rate = 0.99
    if name == 'adam':
        base = tf.train.AdamOptimizer(learning_rate)
    elif name == 'rmsp':
        base = tf.train.RMSPropOptimizer(learning_rate, decay_rate)
    else:
        base = tf.train.GradientDescentOptimizer(learning_rate)
    if not self._use_kungfu:
        return base
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
    return SynchronousSGDOptimizer(base)
def build_optimizer():
    """Build an Adam optimizer scaled by cluster size and wrap it per
    args.kf_optimizer ('sync-sgd' / 'async-sgd' / 'sma').

    Raises:
        RuntimeError: for an unrecognized args.kf_optimizer.
    """
    # KungFu: scale the base learning rate by the number of workers.
    opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())
    # KungFu: wrap tf.compat.v1.train.Optimizer.
    choice = args.kf_optimizer
    if choice == 'sync-sgd':
        return SynchronousSGDOptimizer(opt)
    if choice == 'async-sgd':
        return PairAveragingOptimizer(opt)
    if choice == 'sma':
        return SynchronousAveragingOptimizer(opt)
    raise RuntimeError('Unknown KungFu optimizer')
def build_ops(args):
    """Construct the CNN classifier, its loss, and a KungFu-wrapped SGD.

    Returns:
        (model, loss, opt) tuple ready for a custom training loop.
    """
    layers = [
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax'),
    ]
    model = tf.keras.Sequential(layers)
    loss = tf.losses.SparseCategoricalCrossentropy()
    opt = SynchronousSGDOptimizer(tf.keras.optimizers.SGD(args.learning_rate))
    return model, loss, opt
def build_optimizer(name, n_shards=1):
    """Build a shard-scaled TF1 SGD wrapped by the named KungFu optimizer.

    Raises:
        RuntimeError: for an unrecognized `name`.
    """
    base_lr = 0.1
    # Linear scaling rule: scale learning rate by the data-parallel degree.
    optimizer = tf.train.GradientDescentOptimizer(base_lr * n_shards)
    # KUNGFU: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    if name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer, fuse_requests=True)
    if name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    raise RuntimeError('unknown optimizer: %s' % name)
def build_model(n_shards):
    """Assemble a two-layer transfer-learning classifier on a frozen ResNet50
    and compile it with a shard-scaled, KungFu-wrapped SGD."""
    model = Sequential()
    # Headless ImageNet-pretrained ResNet50 backbone producing pooled features.
    backbone = ResNet50(include_top=False,
                        pooling=RESNET50_POOLING_AVERAGE,
                        weights='imagenet')
    model.add(backbone)
    # Dense classification head (e.g. dog vs cat) with softmax activation.
    model.add(Dense(NUM_CLASSES, activation=DENSE_LAYER_ACTIVATION))
    # The pretrained backbone stays frozen; only the head is trained.
    model.layers[0].trainable = False
    model.summary()
    base_sgd = optimizers.SGD(lr=LEARNING_RATE * n_shards,
                              decay=1e-6,
                              momentum=0.9,
                              nesterov=True)
    distributed_sgd = SynchronousSGDOptimizer(base_sgd, use_locking=True)
    model.compile(optimizer=distributed_sgd,
                  loss=OBJECTIVE_FUNCTION,
                  metrics=LOSS_METRICS)
    return model
def build_optimizer(name, batch_size):
    """Build a cluster-scaled TF1 SGD wrapped by the named KungFu optimizer.

    NOTE(review): `batch_size` is accepted but not read in this body.

    Raises:
        RuntimeError: for an unrecognized `name`.
    """
    base_lr = 0.1
    # Linear scaling rule: scale learning rate by the cluster size.
    optimizer = tf.train.GradientDescentOptimizer(base_lr * current_cluster_size())
    # KungFu: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    if name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    if name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    if name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    raise RuntimeError('unknown optimizer: %s' % name)
# Select the base (single-worker) TF1 optimizer from the command-line choice.
learning_rate = 0.01
if args.optimizer == 'sgd':
    opt = tf.train.GradientDescentOptimizer(learning_rate)
elif args.optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate)
else:
    raise Exception('Unknown optimizer option')

# KungFu: when a distributed optimizer is requested, create a barrier op
# (used to synchronise workers) and wrap `opt` accordingly.
# NOTE(review): an unrecognized args.kf_optimizer value falls through
# silently with `opt` unwrapped — confirm that is intended.
barrier_op = None
if args.kf_optimizer:
    from kungfu.tensorflow.ops import barrier
    barrier_op = barrier()
    if args.kf_optimizer == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        opt = SynchronousSGDOptimizer(opt)
    elif args.kf_optimizer == 'sync-sgd-nccl':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        # NCCL all-reduce; gradient fusion controlled by --fuse.
        opt = SynchronousSGDOptimizer(opt, nccl=True, nccl_fusion=args.fuse)
    elif args.kf_optimizer == 'sync-sgd-hierarchical-nccl':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        opt = SynchronousSGDOptimizer(opt,
                                      nccl=True,
                                      nccl_fusion=args.fuse,
                                      hierarchical_nccl=True)
    elif args.kf_optimizer == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        opt = PairAveragingOptimizer(opt, fuse_requests=args.fuse)
    elif args.kf_optimizer == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        opt = SynchronousAveragingOptimizer(opt)
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

    Args:
        flags_obj: An object containing parsed flag values.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    # Mixed-precision policy setup (fp16 needs a loss scale; fp16 requires TF2).
    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    # channels_first is the faster layout on CUDA builds.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    # Fixed seed so every KungFu worker shuffles/augments deterministically.
    preprocessing_seed = 12345

    # pylint: disable=protected-access
    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_preprocessing.NUM_CHANNELS,
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            dtype=dtype,
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = imagenet_preprocessing.input_fn

    # When `enable_xla` is True, we always drop the remainder of the batches
    # in the dataset, as XLA-GPU doesn't support dynamic shapes.
    drop_remainder = flags_obj.enable_xla

    # KungFu additions: seed plus worker count/rank so each worker reads its
    # own shard of the training data.
    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=imagenet_preprocessing.parse_record,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        drop_remainder=drop_remainder,
        random_seed=preprocessing_seed,              # addition
        num_workers=current_cluster_size(),          # addition
        worker_ID=current_rank(),                    # addition
        tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
        training_dataset_cache=flags_obj.training_dataset_cache,
    )

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            dtype=dtype,
            drop_remainder=drop_remainder)

    # Either a constant LR or the piecewise warmup/decay schedule.
    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    # Build KungFu optimizer
    opt = common.get_optimizer(lr_schedule)
    # logging.info(opt.__dict__)
    optimizer = SynchronousSGDOptimizer(opt, reshape=False, use_locking=True)
    # HACK: copy the inner optimizer's hyper-parameter table onto the wrapper,
    # presumably so Keras LR callbacks can see/set the learning rate — confirm.
    optimizer._hyper = opt._hyper
    # logging.info(optimizer.__dict__)

    if flags_obj.fp16_implementation == 'graph_rewrite':
        # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
        # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
        # which will ensure tf.compat.v2.keras.mixed_precision and
        # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
        # double up.
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer)

    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
    if flags_obj.use_trivial_model:
        model = trivial_model.trivial_model(imagenet_preprocessing.NUM_CLASSES)
    else:
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.
    metrics = (['sparse_categorical_accuracy'])
    metrics.append('sparse_top_k_categorical_accuracy')
    if flags_obj.force_v2_in_keras_compile is not None:
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=metrics,
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=metrics,
                      run_eagerly=flags_obj.run_eagerly)

    # Adjust number of steps: each of the `cluster_size` workers sees 1/N of
    # the data per epoch.
    cluster_size = current_cluster_size()
    steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    steps_per_epoch = steps_per_epoch // cluster_size
    train_epochs = flags_obj.train_epochs

    callbacks = common.get_callbacks(steps_per_epoch, current_rank(),
                                     cluster_size,
                                     common.learning_rate_schedule)
    # Broadcast variables for KungFu so all workers start identically.
    callbacks.append(BroadcastGlobalVariablesCallback())

    # Checkpoint callback only on worker 0
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        ckpt_full_path = os.path.join(flags_obj.model_dir,
                                      'model.ckpt-{epoch:04d}')
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                               save_weights_only=True))

    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)

    num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)
    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced
        # by control flow ops in layers that have different implementations
        # for training and inference (e.g., batch norm).
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase
            # when not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    history = model.fit(train_input_dataset,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)

    # Checkpoint only on 0th worker
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        if dtype == tf.bfloat16:
            logging.warning(
                "Keras model.save does not support bfloat16 dtype.")
        else:
            # Keras model.save assumes a float32 input designature.
            export_path = os.path.join(flags_obj.model_dir, 'saved_model')
            model.save(export_path, include_optimizer=False)

    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=2)
    stats = common.build_stats(history, eval_output, callbacks)
    return stats
def build_optimizer(learning_rate=0.01):
    """Return plain TF1 gradient descent wrapped in KungFu synchronous SGD."""
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
    base = tf.train.GradientDescentOptimizer(learning_rate)
    return SynchronousSGDOptimizer(base)
def run(flags_obj):
    """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

    Args:
        flags_obj: An object containing parsed flag values.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    # channels_first is the faster layout on CUDA builds.
    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    if strategy:
        # flags_obj.enable_get_next_as_optional controls whether enabling
        # get_next_as_optional behavior in DistributedIterator. If true, last
        # partial batch can be supported.
        strategy.extended.experimental_enable_get_next_as_optional = (
            flags_obj.enable_get_next_as_optional)

    # NOTE(review): the scope is created but never entered; kept for parity
    # with the original flow — confirm whether training should run inside it.
    strategy_scope = distribution_utils.get_strategy_scope(strategy)

    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=cifar_preprocessing.HEIGHT,
            width=cifar_preprocessing.WIDTH,
            num_channels=cifar_preprocessing.NUM_CHANNELS,
            num_classes=cifar_preprocessing.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj),
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = cifar_preprocessing.input_fn

    # Load CIFAR-10 as in-memory numpy arrays and normalize pixels to [0, 1].
    # NOTE(review): `num_classes` is read from enclosing/module scope here —
    # confirm it is defined (CIFAR-10 implies 10).
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes)

    # Build the KungFu-wrapped optimizer.
    opt = tf.keras.optimizers.SGD(learning_rate=0.1)
    logging.info(opt.__dict__)
    optimizer = SynchronousSGDOptimizer(opt, use_locking=True)
    # HACK: copy the inner optimizer's hyper-parameter table onto the wrapper,
    # presumably so Keras LR callbacks can see/set the learning rate — confirm.
    optimizer._hyper = opt._hyper
    logging.info(optimizer.__dict__)

    model = Conv4_model(x_train, num_classes)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.
    if flags_obj.force_v2_in_keras_compile is not None:
        model.compile(
            loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=(['accuracy']),
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=(['accuracy']),
                      run_eagerly=flags_obj.run_eagerly)

    # Each of the `cluster_size` workers trains on 1/N of the data per epoch.
    cluster_size = current_cluster_size()
    steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    steps_per_epoch = steps_per_epoch // cluster_size
    train_epochs = flags_obj.train_epochs

    callbacks = common.get_callbacks(steps_per_epoch, current_rank(),
                                     cluster_size, learning_rate_schedule)
    # Broadcast variables for KungFu so all workers start identically.
    callbacks.append(BroadcastGlobalVariablesCallback())

    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)

    num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)
    # FIX: honor --skip_eval. Previously validation_data was hard-coded to
    # (x_test, y_test) in model.fit, so the skip_eval branch had no effect.
    validation_data = (x_test, y_test)
    if flags_obj.skip_eval:
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase
            # when not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    tf.compat.v1.logging.info(x_train.shape)
    history = model.fit(x_train,
                        y_train,
                        batch_size=flags_obj.batch_size,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)

    eval_output = None
    if not flags_obj.skip_eval:
        # FIX: pass features and labels separately; the previous call passed
        # the tuple (x_test, y_test) as `x` with no labels.
        eval_output = model.evaluate(x_test,
                                     y_test,
                                     steps=num_eval_steps,
                                     verbose=2)
    stats = common.build_stats(history, eval_output, callbacks)
    return stats
def parallel_train(train_model,dataset,config):
    '''Parallel (KungFu) train pipeline of openpose-class models.

    Given a model and a dataset, the train pipeline will start automatically.
    The train pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory
       ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model()
    dataset : dataset
        a constructed dataset object, obtained by Dataset.get_dataset()
    config : config object
        training/log/model configuration.

    Returns
    -------
    None
    '''
    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = config.train.lr_decay_steps
    warm_up_step=8000
    warm_up_decay=0.01
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval=config.log.log_interval
    save_interval=config.train.save_interval
    vis_dir=config.train.vis_dir
    #model hyper params (input/output heatmap geometry)
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir=config.pretrain.pretrain_model_dir
    pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    #import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    log(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    #training dataset configure with shuffle,augmentation,and prefetch;
    #sharded across workers so each sees a disjoint slice.
    train_dataset=dataset.get_train_dataset()
    dataset_type=dataset.get_dataset_type()
    parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format
    paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn,num_parallel_calls=max(multiprocessing.cpu_count()//2,1))
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    #train configure: global step / lr live in tf.Variables so they are
    #checkpointed alongside the optimizer.
    step=tf.Variable(1, trainable=False)
    lr=tf.Variable(lr_init,trainable=False)
    lr_init=tf.Variable(lr_init,trainable=False)
    opt=tf.optimizers.Adam(learning_rate=lr)
    ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr)
    ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3)

    #load from ckpt
    #NOTE(review): bare `except:` swallows all errors (incl. KeyboardInterrupt)
    #in these three loaders — consider narrowing.
    log("loading ckpt...")
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log("ckpt loaded successfully!")
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    log("loading pretrained backbone...")
    try:
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True)
        log("pretrained backbone loaded successfully")
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")
    #load model weights
    log("loading saved training model weights...")
    try:
        train_model.load_weights(os.path.join(model_dir,"newest_model.npz"))
        log("saved training model weights loaded successfully")
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure: wrap the Adam optimizer per config choice.
    kungfu_option=config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt=PairAveragingOptimizer(opt)

    # KungFu adjust: shrink the step budget and decay boundaries since every
    # worker contributes one batch per global step.
    n_step = n_step // current_cluster_size() + 1 # KungFu
    # NOTE(review): this loop rebinds the local name `step`, clobbering the
    # tf.Variable created/restored above; the closure in one_step and the
    # `step.numpy()` calls in the loop below then see an int and would fail
    # with AttributeError (unless lr_decay_steps is empty) — likely a bug;
    # rename the loop variable.
    for step_idx,step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = step // current_cluster_size() + 1 # KungFu
    # Re-apply any decay already passed (after a ckpt restore).
    for lr_decay_step in lr_decay_steps:
        if(step>lr_decay_step):
            lr=lr*lr_decay_factor

    #optimize one step
    @tf.function
    def one_step(image,gt_label,mask,train_model,is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_pif_maps,gt_paf_maps=gt_label
            pd_pif_maps,pd_paf_maps=train_model.forward(image,is_train=True)
            loss_pif_maps,loss_paf_maps,total_loss=train_model.cal_loss(pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps)
            #L2 weight-decay regularization added to the task loss
            decay_loss=regulize_loss(train_model,weight_decay_factor)
            total_loss+=decay_loss
        gradients=tape.gradient(total_loss,train_model.trainable_weights)
        opt.apply_gradients(zip(gradients,train_model.trainable_weights))
        #Kung fu: broadcast weights and optimizer state once, on the first batch
        if(is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss

    #train each step
    train_model.train()
    tic=time.time()
    avg_time=AvgMetric(name="time_iter",metric_interval=log_interval)
    #total loss metrics
    avg_total_loss=AvgMetric(name="total_loss",metric_interval=log_interval)
    #decay loss metrics
    avg_decay_loss=AvgMetric(name="decay_loss",metric_interval=log_interval)
    #pif loss metrics
    avg_pif_conf_loss=AvgMetric(name="pif_conf_loss",metric_interval=log_interval)
    avg_pif_vec_loss=AvgMetric(name="pif_vec_loss",metric_interval=log_interval)
    avg_pif_scale_loss=AvgMetric(name="pif_scale_loss",metric_interval=log_interval)
    #paf loss metrics
    avg_paf_conf_loss=AvgMetric(name="paf_conf_loss",metric_interval=log_interval)
    avg_paf_src_vec_loss=AvgMetric(name="paf_src_vec_loss",metric_interval=log_interval)
    avg_paf_dst_vec_loss=AvgMetric(name="paf_dst_vec_loss",metric_interval=log_interval)
    avg_paf_src_scale_loss=AvgMetric(name="paf_src_scale_loss",metric_interval=log_interval)
    avg_paf_dst_scale_loss=AvgMetric(name="paf_dst_scale_loss",metric_interval=log_interval)
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {} weight_decay_factor: {}'.format(
            n_step, batch_size, lr_init.numpy(), lr_decay_steps, lr_decay_factor, weight_decay_factor))
    for image,gt_label,mask,labeled in train_dataset:
        #get losses
        pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss=one_step(image,gt_label,mask,train_model,step==0)
        loss_pif_conf,loss_pif_vec,loss_pif_scale=loss_pif_maps
        loss_paf_conf,loss_paf_src_vec,loss_paf_dst_vec,loss_paf_src_scale,loss_paf_dst_scale=loss_paf_maps
        #update metrics
        avg_time.update(time.time()-tic)
        tic=time.time()
        #update total losses
        avg_total_loss.update(total_loss)
        #update decay loss
        avg_decay_loss.update(decay_loss)
        #update pif_losses metrics
        avg_pif_conf_loss.update(loss_pif_conf)
        avg_pif_vec_loss.update(loss_pif_vec)
        avg_pif_scale_loss.update(loss_pif_scale)
        #update paf_losses metrics
        avg_paf_conf_loss.update(loss_paf_conf)
        avg_paf_src_vec_loss.update(loss_paf_src_vec)
        avg_paf_dst_vec_loss.update(loss_paf_dst_vec)
        avg_paf_src_scale_loss.update(loss_paf_src_scale)
        avg_paf_dst_scale_loss.update(loss_paf_dst_scale)
        #learning rate decay
        #NOTE(review): `step` here is whatever the decay-steps loop above left
        #bound (see note there) — these comparisons likely never behave as a
        #per-iteration step check; confirm against the single-worker pipeline.
        if(step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step)+1)
            lr=lr_init*new_lr_decay
        #warm_up learning rate decay
        if(step <= warm_up_step):
            lr=lr_init*warm_up_decay**(1.0-step/warm_up_step)
        #save log info periodly
        if((step.numpy()!=0) and (step.numpy()%log_interval)==0):
            log(f"Train iteration {n_step} / {step.numpy()}, Learning rate:{lr.numpy()} {avg_total_loss.get_metric()} "+\
                f"{avg_pif_conf_loss.get_metric()} {avg_pif_vec_loss.get_metric()} {avg_pif_scale_loss.get_metric()}"+\
                f"{avg_paf_conf_loss.get_metric()} {avg_paf_src_vec_loss.get_metric()} {avg_paf_dst_vec_loss.get_metric()}"+\
                f"{avg_paf_src_scale_loss.get_metric()} {avg_paf_dst_scale_loss.get_metric()} {avg_decay_loss.get_metric()} {avg_time.get_metric()}")
        #save result and ckpt periodly
        if((step.numpy()!=0) and (step.numpy()%save_interval)==0):
            #save ckpt
            log("saving model ckpt and result...")
            ckpt_save_path=ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            #save train model
            model_save_path=os.path.join(model_dir,"newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
            #draw result
            stride=train_model.stride
            gt_pif_maps,gt_paf_maps=gt_label
            draw_result(image,pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps,mask,parts,limbs,stride,save_dir=vis_dir,\
                name=f"train_{step.numpy()}")
        #training finished
        if(step==n_step):
            break
def parallel_train(train_model, dataset, config, augmentor: BasicAugmentor, \
        preprocessor: BasicPreProcessor, postprocessor: BasicPostProcessor, visualizer=BasicVisualizer):
    # NOTE(review): `visualizer=BasicVisualizer` makes the *class* the default value; the
    # neighbouring parameters use `:` annotations, so this was probably meant to be
    # `visualizer: BasicVisualizer` — confirm against callers.
    # NOTE(review): `postprocessor` is never used inside this function body.
    '''Parallel (KungFu) train pipeline of Openpose class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    # hard-coded global-step milestones at which the lr is multiplicatively decayed
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000, 800000,
        900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params (input/output resolution and skeleton definition)
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics aggregator used by the loss functions and the periodic log reports
    metric_manager = MetricManager()

    # initializing train dataset: shuffle -> repeat -> augment/preprocess -> batch -> prefetch
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    # train configure: step/lr are kept in tf.Variables so they are checkpointed
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    # NOTE(review): the optimizer reads `save_lr`, but the decay below only updates the
    # local python `lr`; `save_lr` is assigned only in the save-interval branch, so the
    # effective learning rate changes at save time, not at the decay step — confirm intent.
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    # domain adaptation params (adversarial feature alignment via a discriminator)
    if (not domainadapt_flag):
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptaion in training enabled!")
        # weight of the generator (adaptation) loss relative to the task loss
        lambda_adapt = 1e-4
        # construct discriminator model over the backbone feature map
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset (unlabeled target-domain images)
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn, num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    # load from ckpt (best-effort: every load step falls back to fresh initialization)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized"
            )
    # load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")
    if (domainadapt_flag):
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except:
            log("discriminator path doesn't exist, discriminator parameters are initialized"
                )

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}"
        )
    step = save_step.numpy()
    lr = save_lr.numpy()

    # import kungfu (done lazily so single-process runs don't require KungFu)
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # rescale schedule lengths by the cluster size: each worker performs ~1/N of the steps
    total_step = total_step // current_cluster_size() + 1  # KungFu
    total_epoch = total_epoch // current_cluster_size() + 1  # KungFu
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[
            step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step: forward, task loss, and a single optimizer update
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x, \
                mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    # one adversarial adaptation step: generator (backbone) and discriminator are
    # both updated from the same persistent tape
    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # persistent tape: gradients are taken twice (generator and discriminator)
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)
            # loss calculation
            # loss of g: make dst features look like src to the discriminator
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d: tell src (True) and dst (False) features apart
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2
        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape to free its held tensors
        del tape
        return predict_dst

    # formal training procedure
    # KungFu configure: wrap the optimizer with the selected distributed strategy
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "\
        +f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "\
        +f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}"
        )
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label: each dataset element carries a pickled per-image target
            # dict; merge the list of dicts into one dict of stacked arrays
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)

            # learning rate decay (see NOTE above: only updates the local `lr`)
            if (step in lr_decay_steps):
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) +
                                                 1)
                lr = lr_init * new_lr_decay

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)

            # optimize domain adaptation
            if (domainadapt_flag):
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)

            # NOTE(review): `step` starts at save_step (initialized to 1) and is
            # incremented before this check, so on a fresh run the first iteration sees
            # step==2 and this broadcast never executes — verify worker weight sync.
            if (step == 1):
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodly
            if ((step != 0) and (step % log_interval) == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"\
                    +f"{metric_manager.report_train()} ")

            # visualize periodly (worker 0 only)
            if ((step != 0) and (step % vis_interval) == 0
                    and current_rank() == 0):
                log(f"Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(), predict_x=predict_x, target_x=target_x,\
                    name=f"train_{step}")

            # save result and ckpt periodly (worker 0 only)
            if ((step != 0) and (step % save_interval) == 0
                    and current_rank() == 0):
                # save ckpt (persist current step/lr so restarts resume correctly)
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if (domainadapt_flag):
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
def test_gradient_tape():
    """Drive two eager training steps through a KungFu-wrapped Keras SGD optimizer.

    The third argument of ``training_step`` is True only for the first batch
    (presumably used to broadcast initial state — defined elsewhere in this file).
    """
    # a single scalar trainable variable, initialized to 1.0
    var = tf.Variable(tf.ones([], tf.float32))
    # wrap plain Keras SGD (lr=0.1) with the KungFu synchronous optimizer
    optimizer = SynchronousSGDOptimizer(tf.keras.optimizers.SGD(0.1))
    for step_idx in range(2):
        is_first = step_idx == 0
        result = training_step(var, optimizer, is_first)
learning_rate = 0.01 if args.optimizer == 'sgd': opt = tf.train.GradientDescentOptimizer(learning_rate) elif args.optimizer == 'adam': opt = tf.train.AdamOptimizer(learning_rate) else: raise Exception('Unknown optimizer option') barrier_op = None if args.kf_optimizer: from kungfu.tensorflow.ops import barrier barrier_op = barrier() if args.kf_optimizer == 'sync-sgd': from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer opt = SynchronousSGDOptimizer(opt) elif args.kf_optimizer == 'sync-sgd-nccl': from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer opt = SynchronousSGDOptimizer(opt, nccl=True, nccl_fusion=args.fuse) elif args.kf_optimizer == 'async-sgd': from kungfu.tensorflow.optimizers import PairAveragingOptimizer opt = PairAveragingOptimizer(opt, fuse_requests=args.fuse) elif args.kf_optimizer == 'sma': from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer opt = SynchronousAveragingOptimizer(opt) else: raise Exception('Unknown kungfu option') data = tf.random_uniform([args.batch_size, 224, 224, 3]) target = tf.random_uniform([args.batch_size, 1], minval=0,
help='turn on reshape strategy method') args = parser.parse_args() args.cuda = not args.no_cuda reshape = 1 if args.reshape_on else 0 # Set up standard model. model = getattr(applications, args.model)(weights=None) # opt = tf.optimizers.SGD(0.01) opt = tf.compat.v1.train.GradientDescentOptimizer(0.01) # KungFu: wrap tf.compat.v1.train.Optimizer. if args.kf_optimizer == 'sync-sgd': opt = SynchronousSGDOptimizer(opt,reshape=args.reshape_on, use_locking=True) elif args.kf_optimizer == 'async-sgd': opt = PairAveragingOptimizer(opt) elif args.kf_optimizer == 'sma': opt = SynchronousAveragingOptimizer(opt) #match this to resnet KF else: raise RuntimeError('Unknown KungFu optimizer') data = tf.random.uniform([args.batch_size, 224, 224, 3]) target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) @tf.function
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of PoseProposal class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)
    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params (input/output resolution and neighbour-window size)
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    hnei = train_model.hnei
    wnei = train_model.wnei
    model_dir = config.model.model_dir

    # import kungfu (lazy so non-distributed runs don't need it installed)
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(
        f"parallel training using learning rate:{lr_init} batch_size:{batch_size}"
    )

    # training dataset configure with shuffle,augmentation,and prefetch;
    # shard() splits the data across workers so each sees a disjoint subset
    train_dataset = dataset.get_train_dataset()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, hnei, wnei,
                                        parts, limbs, data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=2)

    # train model configure: step/lr live in tf.Variables so they checkpoint
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt (best-effort: missing files fall back to fresh init)
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # Kungfu configure: wrap the optimizer with the selected strategy
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    # each worker performs ~1/N of the configured steps
    n_step = n_step // current_cluster_size() + 1  # KungFu

    # optimize one step: forward, composite loss, L2 regularization, one update
    @tf.function
    def one_step(image, targets, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            delta, tx, ty, tw, th, te, te_mask = targets
            pc, pi, px, py, pw, ph, pe = train_model.forward(image,
                                                             is_train=True)
            loss_rsp,loss_iou,loss_coor,loss_size,loss_limb=\
                train_model.cal_loss(delta,tx,ty,tw,th,te,te_mask,pc,pi,px,py,pw,ph,pe)
            pd_loss = loss_rsp + loss_iou + loss_coor + loss_size + loss_limb
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # Kung fu: broadcast initial weights/optimizer state on the first batch
        if (is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        predicts = (pc, px, py, pw, ph, pe)
        return predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb

    # train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log(f'Start - n_step: {n_step} batch_size: {batch_size} lr_init: {lr_init} lr_decay_factor: {lr_decay_factor}'
        )
    avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.
    for image, targets in train_dataset:
        # learning rate decay (linear warm-down over the whole run)
        # NOTE(review): this rebinds the local `lr` from the tf.Variable to a plain
        # tensor, so the optimizer (built on the original variable) never sees the
        # decayed value — compare with the sess.run(tf.assign(...)) pattern used in
        # the TF1 variant of this trainer; verify intent.
        lr = lr_init * (1 - step / n_step * lr_decay_factor)
        # optimize one step
        # NOTE(review): `is_first_batch` is never passed (defaults to False), so the
        # KungFu broadcast inside one_step never runs — verify worker weight sync.
        predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = one_step(
            image, targets, train_model)
        # running averages over one log interval
        avg_loss_rsp += loss_rsp / log_interval
        avg_loss_iou += loss_iou / log_interval
        avg_loss_coor += loss_coor / log_interval
        avg_loss_size += loss_size / log_interval
        avg_loss_limb += loss_limb / log_interval
        avg_pd_loss += pd_loss / log_interval
        avg_re_loss += re_loss / log_interval

        # save log info periodly
        # NOTE(review): `tic` is reset immediately before `time.time()-tic` is
        # evaluated, so the printed time is always ~0 — the reset probably belongs
        # after the log call; confirm.
        if ((step != 0) and (step % log_interval) == 0):
            tic = time.time()
            log(f"worker:{current_rank()} Train iteration {step.numpy()}/{n_step}, learning rate:{lr.numpy()},"+\
                f"loss_rsp:{avg_loss_rsp},loss_iou:{avg_loss_iou},loss_coor:{avg_loss_coor},loss_size:{avg_loss_size},"+\
                f"loss_limb:{avg_loss_limb},loss_pd:{avg_pd_loss},loss_re:{avg_re_loss} ,time:{time.time()-tic}")
            avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.

        # save result and ckpt periodly
        if ((step != 0) and (step % save_interval) == 0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(),
                         predicts,
                         targets,
                         parts,
                         limbs,
                         save_dir=vis_dir,
                         name=f"ppn_step_{step.numpy()}")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
        # training finished
        if (step == n_step):
            break
tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), tf.keras.layers.Dropout(0.25), tf.keras.layers.Flatten(), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(10, activation='softmax') ]) loss = tf.losses.SparseCategoricalCrossentropy() # KungFu: adjust learning rate based on number of GPUs. # opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size()) opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size()) # KungFu: wrap tf.compat.v1.train.Optimizer. if args.kf_optimizer == 'sync-sgd': opt = SynchronousSGDOptimizer(opt) elif args.kf_optimizer == 'async-sgd': opt = PairAveragingOptimizer(opt) elif args.kf_optimizer == 'sma': opt = SynchronousAveragingOptimizer(opt) else: raise RuntimeError('Unknown KungFu optimizer') @tf.function def training_step(images, labels, first_batch): with tf.GradientTape() as tape: probs = mnist_model(images, training=True) loss_value = loss(labels, probs) grads = tape.gradient(loss_value, mnist_model.trainable_variables)
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of openpose class models

    input model and dataset, the train pipeline will start automatically
    the train pipeline will:
    1.store and restore ckpt in directory ./save_dir/model_name/model_dir
    2.log loss information in directory ./save_dir/model_name/log.txt
    3.visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)
    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    # global-step milestones at which the lr is multiplicatively decayed
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000, 800000,
        900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    n_pos = train_model.n_pos
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # import kungfu (lazy so non-distributed runs don't need it installed)
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(
        f"parallel training using learning rate:{lr_init} batch_size:{batch_size}"
    )

    # training dataset configure with shuffle,augmentation,and prefetch;
    # shard() splits the data so each worker sees a disjoint subset
    train_dataset = dataset.get_train_dataset()
    dataset_type = dataset.get_dataset_type()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    flip_list = get_flip_list(dataset_type)
    paramed_map_fn = get_paramed_map_fn(hin,
                                        win,
                                        hout,
                                        wout,
                                        parts,
                                        limbs,
                                        flip_list=flip_list,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    # train model configure: step/lr live in tf.Variables so they checkpoint
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    if (config.model.model_type == MODEL.Openpose):
        opt = tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt (best-effort: missing files fall back to fresh init)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")
    # load model weights
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure: wrap the optimizer with the selected strategy
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    # each worker performs ~1/N of the configured steps
    n_step = n_step // current_cluster_size() + 1  # KungFu
    # FIX: the loop variable used to be named `step`, shadowing the tf.Variable
    # above; after the loop, `step.assign_add(...)` / `step.numpy()` would fail
    # and the decay-step lookup compared against an int. Renamed to `decay_step`
    # (matching the corrected sibling trainer in this file).
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step: forward, staged loss, L2 regularization, one update
    @tf.function
    def one_step(image, gt_label, mask, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            # gt_label packs confidence maps and PAF maps along the channel axis
            gt_conf = gt_label[:, :n_pos, :, :]
            gt_paf = gt_label[:, n_pos:, :, :]
            pd_conf, pd_paf, stage_confs, stage_pafs = train_model.forward(
                image, is_train=True)
            pd_loss, loss_confs, loss_pafs = train_model.cal_loss(
                gt_conf, gt_paf, mask, stage_confs, stage_pafs)
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # Kung fu: broadcast initial weights/optimizer state on the first batch
        if (is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return gt_conf, gt_paf, pd_conf, pd_paf, total_loss, re_loss

    # train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {}'.format(
        n_step, batch_size, lr_init, lr_decay_steps, lr_decay_factor))
    for image, gt_label, mask in train_dataset:
        # learning rate decay
        if (step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(
                float(lr_decay_steps.index(step) + 1))
            lr = lr_init * new_lr_decay
        # optimize one step
        # NOTE(review): `step` is initialized to 1 and incremented inside one_step,
        # so `step==0` is never true and the KungFu broadcast never fires — verify
        # worker weight synchronization before relying on this trainer.
        gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\
            train_model,step==0)
        # save log info periodly
        if ((step.numpy() != 0) and (step.numpy() % log_interval) == 0):
            # FIX: compute the elapsed time since the previous log *before*
            # resetting tic; the original reset tic first, so the reported
            # time was always ~0.
            log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'.format(
                step.numpy(), n_step, total_loss, lr.numpy(), re_loss, time.time() - tic))
            tic = time.time()
        # save result and ckpt periodly (worker 0 only)
        if ((step != 0) and (step % save_interval) == 0
                and current_rank() == 0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\
                vis_dir,'train_%d_' % step)
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
        # training finished
        if (step == n_step):
            break
def parallel_train(training_dataset, kungfu_option):
    """KungFu-distributed TF1 (graph/session) training loop for the openpose model.

    Builds the input pipeline, model graph and wrapped optimizer, then runs a
    tf.Session until the (cluster-rescaled) global step reaches n_step.

    NOTE(review): relies on module-level globals visible elsewhere in this file:
    n_epoch, batch_size, _map_fn, make_model, lr_init, lr_decay_every_step,
    lr_decay_factor, n_step, model_path, save_interval.
    """
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # input pipeline: shuffle -> shard per worker -> repeat -> augment -> batch -> prefetch
    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)
    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    # build the model graph directly on the dataset tensors
    net, total_loss, log_tensors = make_model(*one_element,
                                              is_train=True,
                                              reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2  # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu: wrap the base optimizer with the selected distributed strategy
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu: op that broadcasts worker-0 variables to all workers
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    # rescale the schedule: each worker performs ~1/N of the configured steps
    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size(
    ) + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu: sync initial weights across workers
        print('Worker{}: Initialized'.format(current_rank()))
        print(
            'Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
            .format(current_rank(), n_step, batch_size, lr_init,
                    lr_decay_every_step))

        # restore pre-trained weights (best-effort)
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(
                                                  model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            # stepwise exponential lr decay, assigned into the graph variable
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            # one optimization step; fetch losses and current outputs for logging
            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
                .format(current_rank(), step, n_step, _loss, lr, _l2,
                        time.time() - tic))
            # NOTE(review): the 'Worker{}:' placeholder below is never formatted —
            # the literal braces are printed; likely meant .format(current_rank()).
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:', current_rank(), 'Network#', ix, 'For Branch',
                      ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if current_rank() == 0:  # KungFu: only worker 0 writes artifacts
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [
                        img_out, confs_ground, pafs_ground, conf_result,
                        paf_result, mask_out
                    ] = sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result,
                                 pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #     net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path,
                                               'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path, 'pose.npz'),
                                           sess=sess)
model.add( Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape)) model.add(Conv2D(64, (3, 3), activation='relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(128, activation='relu')) model.add(Dropout(0.5)) model.add(Dense(num_classes, activation='softmax')) # KungFu: adjust learning rate based on number of GPUs. opt = keras.optimizers.Adadelta(1.0 * current_cluster_size()) # KungFu: wrap distributed optimizers. if args.kf_optimizer == 'sync-sgd': opt = SynchronousSGDOptimizer(opt, with_keras=True) elif args.kf_optimizer == 'async-sgd': opt = PairAveragingOptimizer(opt, with_keras=True) elif args.kf_optimizer == 'sma': opt = SynchronousAveragingOptimizer(opt, with_keras=True) else: raise RuntimeError('unknown optimizer: %s' % name) model.compile(loss=keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy']) callbacks = [BroadcastGlobalVariablesCallback(with_keras=True)] # KungFu: save checkpoints only on worker 0 to prevent other workers from corrupting them. if current_rank() == 0: