def train(args):
    """Run the elastic MNIST training loop.

    Trains until the configured number of samples has been processed,
    resizing the cluster at scheduled global steps and re-synchronising
    worker state after each membership change.
    """
    # global step -> new cluster size
    resize_schedule = {
        100: 2,
        200: 3,
        300: 4,
        400: 2,
        500: 3,
        600: 1,
    }

    ds = build_dataset(args)
    model, loss, opt = build_ops(args)

    total_samples = int(MNIST_DATA_SIZE * args.num_epochs)
    trained_samples = tf.Variable(0)
    global_step = tf.Variable(0)

    # Start with a sync so a (re)joined worker adopts the cluster state.
    need_sync = True
    for local_step, (images, labels) in enumerate(ds):
        global_step.assign_add(1)
        trained_samples.assign_add(current_cluster_size() * args.batch_size)
        loss_value = training_step(model, loss, opt, images, labels)

        if need_sync:
            # Align progress counters and model/optimizer state with peers.
            sync_offsets([global_step, trained_samples])
            sync_model(model, opt)
            need_sync = False

        step = int(global_step)
        print('step: %d loss: %f' % (step, loss_value))

        if step in resize_schedule:
            need_sync = resize_cluster(resize_schedule[step])
            if detached():
                # This worker was removed by the resize; leave the loop.
                break

        if trained_samples >= total_samples:
            break
def get_peer_latencies():
    """Return the vector V of round-trip times to all other peers.

    For the peer of rank i, V[j] is the RTT from i to j (j != i),
    and V[i] = 0.
    """
    size = current_cluster_size()
    return _op_lib.kungfu_get_peer_latencies(cluster_size=size)
def get_neighbour_mask(edges):
    """Compute a bool vector marking this peer's MST neighbours.

    For the peer of rank i, v[j] is True iff (i, j) is an edge of the
    MST described by `edges`, otherwise False.
    """
    rank = current_rank()
    size = current_cluster_size()
    return _op_lib.kungfu_get_neighbour_mask(edges,
                                             self_rank=rank,
                                             cluster_size=size)
def test_all_reduce(device='cpu'):
    """All-reduce a tensor of ones and check it equals x * cluster_size.

    Parameters
    ----------
    device : str
        Torch device to run the check on (default: 'cpu').
    """
    x = torch.ones([2, 3])
    # FIX: Tensor.to() is not in-place — it returns a new tensor. The
    # original discarded the result, so the tensor never moved off CPU.
    x = x.to(device)
    y = kf.ops.collective.all_reduce_fn(x)
    assert (x.shape == y.shape)
    np = current_cluster_size()
    z = x * np
    assert z.equal(y)
def worker(rank):
    """Print this worker's local and KungFu ranks, then all-reduce a
    scalar tensor holding the KungFu rank."""
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank

    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))

    value = torch.ones([]) * int(current_rank())
    print(value)
    reduced = kf.ops.collective.all_reduce_fn(value)
    print(reduced)
def _cluster_size():
    """Best-effort detection of the data-parallel cluster size.

    Prefers KungFu (when KUNGFU_SELF_SPEC is set), then Horovod, and
    falls back to 1 for single-process runs.
    """
    if os.getenv('KUNGFU_SELF_SPEC'):
        from kungfu.python import current_cluster_size
        return current_cluster_size()
    else:
        try:
            import horovod.tensorflow as hvd
            return hvd.size()
        # FIX: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` keeps the best-effort fallback
        # (missing or uninitialised Horovod) without masking interrupts.
        except Exception:
            return 1
def test_all_gather(device='cpu'):
    """All-gather a rank-tagged tensor and check peer i contributed ones * i.

    Parameters
    ----------
    device : str
        Torch device to run the check on (default: 'cpu').
    """
    rank = current_rank()
    x = (torch.ones([2, 3]) * rank)
    # FIX: Tensor.to() returns a new tensor; keep the result instead of
    # discarding it, otherwise the tensor never moves to `device`.
    x = x.to(device)
    y = kf.ops.collective.all_gather(x)
    np = current_cluster_size()
    # Expected layout: one [2, 3] slab of (ones * i) per peer i.
    z = torch.stack([torch.ones([2, 3]) * i for i in range(np)])
    assert (z.equal(y))
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):
    """Train the MNIST model, sharding the training data across the cluster.

    Each worker reads batches starting at a rank-dependent offset and
    strides over the dataset by `batch_size * n_shards`, so peers consume
    interleaved batches. Accuracy is logged every `log_period` steps.
    """
    log_period = 100

    # One data shard per peer.
    n_shards = current_cluster_size()
    shard_id = current_rank()

    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: start every worker from identical weights.
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        # Advance past everyone's batch for this step, wrapping around.
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {x: xs, y_: y_s})

        if step % log_period == 0:
            batch_set = {'x': xs, 'y': y_s}
            result = test_mnist(sess, x, y_, test_op, batch_set)
            print('training accuracy: %f' % result)

    result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
    print('validation accuracy: %f' % result)
def end(self):
    """Record the end of a resize interval and log its duration."""
    if self._new:
        # Nothing to measure for the very first interval.
        return
    assert (self._begin is not None)
    elapsed = time.time() - self._begin
    size_now = current_cluster_size()
    print('resize %d -> %d took %s' %
          (self._old_size, size_now, show_duration(elapsed)))
    self._records.append((elapsed, self._old_size, size_now))
    self._begin = None
def worker(rank):
    """Print rank info and all-reduce (rank * ones) across the cluster."""
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce

    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))

    x = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    summed = all_reduce(x * rank)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(summed)
        print('v=%s' % (v))
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    """Fit the Keras model on this worker's slice of the training data,
    broadcasting initial weights via the KungFu callback."""
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # Each worker takes one contiguous slice of the training data.
    shard_size = train_data_size // n_shards
    # NOTE(review): the offset is scaled by batch_size rather than
    # shard_size, so slices overlap unless batch_size == shard_size —
    # confirm this is the intended sharding before relying on it.
    offset = batch_size * shard_id

    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[BroadcastGlobalVariablesCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
def after_run(self, run_context, run_values):
    """Session hook: advance progress counters, run the resize op, and
    request a stop when this worker leaves the cluster or training is done."""
    self._step += 1
    self._trained_samples += self._local_batch_size * current_cluster_size()

    self._profiler.begin()
    changed, keep = run_context.session.run(self._resize_op)
    if not keep:
        # This worker is not part of the new cluster.
        run_context.request_stop()
        self._exit_reason = 'change cluster'
        self._profiler.end()
        return
    if changed:
        # Membership changed: state must be re-synchronised next step.
        self._need_sync = True
    else:
        self._profiler.cancel()

    if self._trained_samples >= self._total_samples:
        self._exit_reason = 'finished'
        run_context.request_stop()
def after_run(self, run_context, run_values):
    """Session hook: update sample/epoch counters, invoke step and epoch
    policy callbacks, and stop when training completes or this worker is
    detached from the cluster."""
    sess = run_context.session
    bs = self.get_batch_size(sess)

    done = sess.run(self._trained_samples) + bs * current_cluster_size()
    self._set_trained_samples(sess, done)
    self._trained_epochs = int(done / self._epoch_size)

    for policy in reversed(self._policies):
        policy.after_step(sess)
    if self._trained_epochs > self._last_trained_epochs:
        # Crossed an epoch boundary since the last check.
        for policy in reversed(self._policies):
            policy.after_epoch(sess)

    if done >= self._total_samples:
        run_context.request_stop()
    if detached():
        run_context.request_stop()
def test_set_tree(steps, warmup_steps=10):
    """Repeatedly install a random broadcast tree and time set_tree.

    Runs `warmup_steps` untimed rounds followed by `steps` timed ones,
    validating an all-reduce result each round, then prints the
    mean/min/max latency in milliseconds.
    """
    from kungfu.python import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree

    n = current_cluster_size()
    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))

    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)
    init = tf.global_variables_initializer()

    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            # Each peer contributes 0 + 1 + ... + (magic - 1).
            assert (v.sum() == n * magic * (magic - 1) / 2)
            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={tree_place: tree})
            dur = time.time() - t0
            if step > warmup_steps:
                durations.append(dur)

    ds = np.array([d * 1000 for d in durations])
    from kungfu._utils import show_duration
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))
def build_optimizer(name, batch_size):
    """Build a KungFu-wrapped GradientDescent optimizer selected by name.

    The base learning rate is scaled by the cluster size to account for
    the level of data parallelism.

    Raises
    ------
    RuntimeError
        If `name` is not a known optimizer.
    """
    learning_rate = 0.1
    # Scale learning rate according to the level of data parallelism.
    base = tf.train.GradientDescentOptimizer(learning_rate *
                                             current_cluster_size())

    # KungFu: wrap the TensorFlow optimizer with a distributed optimizer.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(base)
    if name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(base)
    if name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(base)
    if name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(base, change_step=300)
    raise RuntimeError('unknown optimizer: %s' % name)
def parallel_train(train_model, dataset, config, augmentor:BasicAugmentor, \
    preprocessor:BasicPreProcessor,postprocessor:BasicPostProcessor,visualizer=BasicVisualizer):
    '''Parallel (KungFu) train pipeline for Openpose-class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory
       ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function
    dataset : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function
    config : object
        training configuration (step counts, learning-rate schedule, logging
        and checkpoint intervals, KungFu option, domain-adaptation flag)
    augmentor, preprocessor, postprocessor : project processing components
    visualizer : visualizer used for periodic prediction dumps
        NOTE(review): the default is the BasicVisualizer *class*, not an
        instance — confirm callers always pass an instantiated visualizer.

    Returns
    -------
    None
    '''
    # --- train hyper params ---
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000,
        800000, 900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params (input/output spatial sizes and keypoint topology)
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    # train configure: step/lr are tf.Variables so they checkpoint with opt
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    # domain adaptation params
    if (not domainadapt_flag):
        ckpt = tf.train.Checkpoint(save_step=save_step, save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptaion in training enabled!")
        # weight param: scales the generator's adversarial loss term
        lambda_adapt = 1e-4
        # construct discriminator model over backbone feature maps
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn, num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    # load from ckpt (best-effort: a missing checkpoint is not fatal)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    # NOTE(review): bare except also hides genuine restore errors —
    # consider narrowing to the specific exceptions restore can raise.
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    # load pretrained backbone (best-effort)
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized"
            )
    # load model weights (best-effort)
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")
    if (domainadapt_flag):
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except:
            log("discriminator path doesn't exist, discriminator parameters are initialized"
                )

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}"
        )
    step = save_step.numpy()
    lr = save_lr.numpy()

    # import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # Work is split across peers, so per-worker step budgets and the lr
    # schedule are divided by the cluster size.
    total_step = total_step // current_cluster_size() + 1  # KungFu
    total_epoch = total_epoch // current_cluster_size() + 1  # KungFu
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[
            step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x, \
                mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    # One adversarial domain-adaptation step: update the generator
    # (backbone) towards fooling the discriminator, then the discriminator.
    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # tape (persistent: gradients are taken twice from the same tape)
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)
            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2
        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape to release its held resources
        del tape
        return predict_dst

    # formal training procedure
    # KungFu configure: wrap the optimizer per the configured strategy
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "\
        +f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "\
        +f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}"
        )
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label (targets arrive pickled per sample)
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            # Merge the per-sample target dicts into one dict of batched arrays.
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)
            # learning rate decay
            if (step in lr_decay_steps):
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) +
                                                 1)
                lr = lr_init * new_lr_decay
            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)
            # optimize domain adaptation
            if (domainadapt_flag):
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)
            # KungFu: broadcast weights once so all peers start identical.
            if (step == 1):
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())
            # log info periodically
            if ((step != 0) and (step % log_interval) == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"\
                    +f"{metric_manager.report_train()} ")
            # visualize periodically (rank 0 only)
            if ((step != 0) and (step % vis_interval) == 0
                    and current_rank() == 0):
                log(f"Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(), predict_x=predict_x, target_x=target_x,\
                    name=f"train_{step}")
            # save result and ckpt periodically (rank 0 only)
            if ((step != 0) and (step % save_interval) == 0
                    and current_rank() == 0):
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if (domainadapt_flag):
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
def begin(self):
    """Mark the start of a resize interval and remember the current size."""
    assert (self._begin is None)
    self._new = False
    self._begin = time.time()
    self._old_size = current_cluster_size()
def all_gather(x):
    """Gather `x` from every peer into a new leading dimension.

    Returns a tensor of shape [cluster_size, *x.shape]; slot i holds
    peer i's contribution.
    """
    size = current_cluster_size()
    out = x.new(torch.Size([size] + list(x.shape)))
    # Dispatch to the dtype-specific native op, which fills `out` in place.
    all_gather_op_map[x.type()](x, out, x.type())
    return out
mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), tf.keras.layers.Dropout(0.25), tf.keras.layers.Flatten(), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(10, activation='softmax') ]) loss = tf.losses.SparseCategoricalCrossentropy() # KungFu: adjust learning rate based on number of GPUs. # opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size()) opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size()) # KungFu: wrap tf.compat.v1.train.Optimizer. if args.kf_optimizer == 'sync-sgd': opt = SynchronousSGDOptimizer(opt) elif args.kf_optimizer == 'async-sgd': opt = PairAveragingOptimizer(opt) elif args.kf_optimizer == 'sma': opt = SynchronousAveragingOptimizer(opt) else: raise RuntimeError('Unknown KungFu optimizer') @tf.function def training_step(images, labels, first_batch): with tf.GradientTape() as tape:
tf.float32), tf.cast(y_train, tf.int64))) train_dataset = train_dataset.repeat().shuffle(10000).batch(128) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), tf.keras.layers.Dropout(0.25), tf.keras.layers.Flatten(), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(10, activation='softmax') ]) # KungFu: adjust learning rate based on number of GPUs. opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size()) # opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size()) if args.kf_optimizer == 'sync-sgd': opt = SynchronousSGDOptimizer(opt) elif args.kf_optimizer == 'async-sgd': opt = PairAveragingOptimizer(opt) elif args.kf_optimizer == 'sma': opt = SynchronousAveragingOptimizer(opt) else: raise RuntimeError('Unknown KungFu optimizer') mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), optimizer=opt, metrics=['accuracy'])
# Script setup: CLI args, TF session, and MNIST data shaped for Keras.
parser = argparse.ArgumentParser(description='Keras MNIST example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='kungfu optimizer')
args = parser.parse_args()

# Grow GPU memory on demand instead of reserving it all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# KungFu: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(4.0 / current_cluster_size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Reshape to the backend's expected image layout (NCHW vs NHWC).
if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
def local_next(self, bs):
    """Return this peer's partition of the next global batch of size `bs`."""
    batch = self.global_next(bs)
    rank = kf.current_rank()
    size = kf.current_cluster_size()
    return batch.partition(rank, size)
def test_peer_info():
    """Print this peer's rank and the cluster size."""
    print('rank=%d, np=%d' % (current_rank(), current_cluster_size()))
def worker(rank):
    """Print the local rank alongside the KungFu rank and cluster size."""
    from kungfu.python import current_cluster_size, current_rank

    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))