def begin(self):
    self._sync_op = BroadcastGlobalVariablesOp()
    self._step = 0
    # Placeholder for this worker's local step; the max all-reduce agrees on
    # the latest step across all workers.
    self._step_place = tf.placeholder(dtype=tf.int32, shape=())
    self._sync_step_op = all_reduce(self._step_place, op='max')
    self._resize_op, self._new_size_op = self._build_resize_op(
        self._schedule, self._step_place)
def begin(self):
    self._step = 0
    self._trained_samples = 0
    # Placeholder for the number of samples this worker has consumed; the max
    # all-reduce lets a (re)joining worker catch up to the global data offset.
    self._trained_samples_place = tf.placeholder(dtype=tf.int32, shape=())
    self._sync_offset_op = all_reduce(self._trained_samples_place, op='max')
    self._sync_state_op = BroadcastGlobalVariablesOp()
    self._resize_op = resize_cluster_from_url()
def begin(self):
    self._kungfu_step = tf.Variable(0, trainable=False, dtype=tf.int64)
    self._advance = tf.assign_add(self._kungfu_step, 1)
    self._sync_op = BroadcastGlobalVariablesOp()
    # Resume from the step recorded in the checkpoint, keeping the KungFu
    # step, the resize schedule, and TensorFlow's global step consistent.
    ckpt = _get_init_step()
    self._init_kungfu_step = tf.assign(self._kungfu_step, int(ckpt))
    self._resize_op = self._build_resize_op(self._schedule, int(ckpt))
    self._reset_global_step = tf.assign(tf.train.get_global_step(), int(ckpt))
import tensorflow as tf

from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer


def test_sync_sgd():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    optimizer = SynchronousSGDOptimizer(optimizer)
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
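# A minimal launch sketch (assumption, not part of the original test): KungFu
# programs are started with the kungfu-run launcher, one process per worker,
# e.g. `kungfu-run -np 4 python3 test_optimizers.py`; the file name here is
# hypothetical.
if __name__ == '__main__':
    test_sync_sgd()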
def train_mnist(sess, x, y_, train_op, test_op, optimizer, dataset,
                n_epochs=1, batch_size=5000):
    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate the number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variables
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {x: xs, y_: y_s})
        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = {'x': xs, 'y': y_s}
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
            print('validation accuracy: %f' % result)
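# Worked example of the sharding arithmetic above (illustrative numbers):
# with training_set_size=60000, n_shards=4 and batch_size=5000, each shard
# holds 15000 samples and step_per_epoch=3. Worker 1 starts at offset 5000
# and advances by batch_size * n_shards = 20000 per step, reading
# [5000:10000], [25000:30000], [45000:50000], then wrapping around.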
from kungfu.tensorflow.optimizers import PairAveragingOptimizer


def test_pair_averaging():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    optimizer = PairAveragingOptimizer(optimizer)
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
    # FIXME: check values
    run_barrier()
# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)  # half-width of a ~95% confidence interval
log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
log_final_result(img_sec_mean, img_sec_conf)

loss = loss_function()
train_opt = opt.minimize(loss)

if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
else:
    init = tf.global_variables_initializer()
    bcast_op = None
    if args.kf_optimizer:
        from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
        bcast_op = BroadcastGlobalVariablesOp()
    with tf.Session(config=config) as session:
        from kungfu._utils import measure
        duration, _ = measure(lambda: session.run(init))
        log('init took %.3fs' % (duration))
        if bcast_op:
            duration, _ = measure(lambda: session.run(bcast_op))
            log('bcast_op took %.3fs' % (duration))
        run(lambda: session.run(train_opt))
        if barrier_op is not None:  # barrier_op is created elsewhere in this script
            session.run(barrier_op)
log('Running benchmark...')
img_secs = []
for x in range(args.num_iters):
    time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
    img_sec = args.batch_size * args.num_batches_per_iter / time
    log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
    img_secs.append(img_sec)

# Results
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))

loss = loss_function()
train_opt = opt.minimize(loss)

if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables))
else:
    init = tf.global_variables_initializer()
    with tf.Session(config=config) as session:
        session.run(init)
        if args.kf_optimizer:
            from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
            session.run(BroadcastGlobalVariablesOp())
        run(lambda: session.run(train_opt))
def begin(self):
    # Agree on the latest global step across workers, then broadcast the
    # variable state so every replica resumes from an identical point.
    global_step = tf.train.get_or_create_global_step()
    new_global_step = all_reduce(global_step, op='max')
    self._sync_step_op = tf.assign(global_step, new_global_step)
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    self._sync_state_op = BroadcastGlobalVariablesOp()
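# A minimal sketch (assumed wiring, not KungFu's actual implementation) of how
# the two ops built in begin() above would typically be driven from the
# tf.train.SessionRunHook lifecycle; the class name SyncOnStartHook and the
# all_reduce import path are assumptions for illustration.
import tensorflow as tf
from kungfu.tensorflow.ops import all_reduce


class SyncOnStartHook(tf.train.SessionRunHook):
    def begin(self):
        global_step = tf.train.get_or_create_global_step()
        self._sync_step_op = tf.assign(global_step,
                                       all_reduce(global_step, op='max'))
        from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
        self._sync_state_op = BroadcastGlobalVariablesOp()

    def after_create_session(self, session, coord):
        # First agree on the newest global step, then broadcast all variables
        # from rank 0 so every replica starts from identical state.
        session.run(self._sync_step_op)
        session.run(self._sync_state_op)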
def before_train(self):
    # Placeholder for the desired cluster size; running self._resize_op
    # rescales the cluster, after which variables are re-broadcast.
    self._size_place = tf.placeholder(dtype=tf.uint32, shape=[])
    self._resize_op = resize(self._size_place)
    self._sync_op = BroadcastGlobalVariablesOp()
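# A usage sketch (assumed, for illustration; maybe_set_size and its session
# argument are hypothetical): feed the new size into the placeholder, run the
# resize, then re-synchronise state so newly added workers join consistently.
def maybe_set_size(self, sess, new_size):
    sess.run(self._resize_op, feed_dict={self._size_place: new_size})
    sess.run(self._sync_op)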
def parallel_train(training_dataset, kungfu_option):
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                              SynchronousAveragingOptimizer,
                                              SynchronousSGDOptimizer)

    # KungFu: shard the dataset so that each worker reads a distinct subset.
    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)
    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element, is_train=True, reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2  # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # the linear scaling rule does not work for OpenPose training
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu: wrap the optimizer with the chosen distributed training strategy.
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu: broadcast the initial variables from rank 0 to all other workers.
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size() + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu
        print('Worker{}: Initialized'.format(current_rank()))
        print('Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'.format(
            current_rank(), n_step, batch_size, lr_init, lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess, name=os.path.join(model_path, 'pose.npz'))
        except Exception:
            print('no pre-trained model')

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = sess.run(
                [train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print('Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                current_rank(), step, n_step, _loss, lr, _l2, time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}: Network#{} For Branch {} Loss: {}'.format(
                    current_rank(), ix, ix % 2 + 1, ll))

            # save intermediate results and model
            if current_rank() == 0:  # KungFu: only the chief worker saves results
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result, mask_out] = sess.run(
                        [x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground,
                                 paf_result, mask_out, 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(model_path, 'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(model_path, 'pose.npz'),
                                           sess=sess)
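# A launch sketch (assumed; the file name is hypothetical): parallel_train is
# typically started on multiple workers with the KungFu launcher, e.g.
# `kungfu-run -np 4 python3 train.py`, with kungfu_option set to one of
# 'sync-sgd', 'async-sgd', or 'sma' as handled above.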