def process_data(data_dir, sharding=True):
    ds = tf.data.Dataset.list_files(str(data_dir + "/*/*"))
    if sharding:
        ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.map(process_path, num_parallel_calls=AUTOTUNE)
    ds = ds.batch(BATCH_SIZE_TRAINING)
    return ds
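A minimal usage sketch for process_data, mirroring the Keras image-classification example later in this section (TRAIN_DIR, VAL_DIR and the KungFu imports are assumed to be defined there): the training files are sharded so each worker sees a disjoint subset, while the validation set is left unsharded so every worker evaluates on the full data.

train_dataset = process_data(TRAIN_DIR)               # sharded: one shard per KungFu worker
val_dataset = process_data(VAL_DIR, sharding=False)   # unsharded: full validation set on every worker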
def train_mnist(sess, x, y_, train_op, test_op, optimizer, dataset,
                n_epochs=1, batch_size=5000):
    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: the KungFu initializer defines how model weights are initialized on distributed devices
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })

        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
            print('validation accuracy: %f' % result)
def train_mnist(sess, x, y_, train_op, test_op, optimizer, dataset,
                n_epochs=1, batch_size=5000):
    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variables
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })

        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
            print('validation accuracy: %f' % result)
def test_group_all_gather():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather
    rank = current_rank()
    np = current_cluster_size()
    sizes = [i + 1 for i in range(5)]
    xs = [(rank + 1) * tf.Variable(tf.ones([n], tf.int32)) for n in sizes]
    ys = [all_gather(x) for x in xs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, y in enumerate(ys):
            v = sess.run(y)
            assert (v.sum() == (np + 1) * np / 2 * (i + 1))
def test_consensus():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus
    np = current_cluster_size()
    rank = current_rank()
    x = tf.Variable(rank, dtype=tf.int32)
    consensus_check = consensus(x)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(consensus_check)
        assert v == (np == 1)
def build_optimizer():
    # KungFu: adjust learning rate based on number of GPUs.
    # opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
    opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

    # KungFu: wrap tf.compat.v1.train.Optimizer.
    if args.kf_optimizer == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif args.kf_optimizer == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif args.kf_optimizer == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown KungFu optimizer')
    return opt
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset for the data of the KungFu node
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
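For concreteness, a small worked example of the sharding arithmetic used above, with hypothetical numbers not taken from any of these scripts: 60,000 training samples split across 4 workers gives a shard of 15,000 samples, so worker 2 trains on the contiguous slice [30000:45000].

# hypothetical values, for illustration only
train_data_size = 60000
n_shards = 4          # current_cluster_size()
shard_id = 2          # current_rank()
shard_size = train_data_size // n_shards   # 15000
offset = shard_size * shard_id             # 30000 -> this worker trains on [30000:45000]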
def load_data_per_node(x_train, y_train, dataset_size, backup_worker_id, backup_frac):
    # custom size of the training data
    x_train, y_train = x_train[:DATASET_SIZE], y_train[:DATASET_SIZE]

    # shard the dataset for the KungFu node
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(x_train)
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu primary nodes
    x_node = x_train[offset:offset + shard_size]
    y_node = y_train[offset:offset + shard_size]

    num_images_backup = int(train_data_size * backup_frac)

    # extract the data for learning of the KungFu backup nodes
    frac_data_per_worker = 1 / n_shards
    # the repeat count must be an integer for numpy's repeat
    repeat_nums = int(frac_data_per_worker // backup_frac)
    remainder = int(
        round(train_data_size *
              (frac_data_per_worker - backup_frac * repeat_nums)))
    print("info : ", frac_data_per_worker, repeat_nums,
          backup_frac * repeat_nums, remainder, train_data_size)

    if shard_id == backup_worker_id:
        x_distinct = x_train[offset:offset + shard_size][0:num_images_backup]
        y_distinct = y_train[offset:offset + shard_size][0:num_images_backup]
        x_repeat = x_distinct.repeat(repeat_nums, axis=0)
        y_repeat = y_distinct.repeat(repeat_nums, axis=0)
        x_node = np.concatenate((x_repeat, x_distinct[0:remainder]), axis=0)
        y_node = np.concatenate((y_repeat, y_distinct[0:remainder]), axis=0)

    print("Worker ID {} | start idx {} | end idx {} ".format(
        shard_id, offset, offset + shard_size))
    print("Training set size:", x_node.shape, y_node.shape)
    return x_node, y_node
def test_set_tree(steps, warmup_steps=10):
    from kungfu import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree
    n = current_cluster_size()
    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))
    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)
    init = tf.global_variables_initializer()
    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert (v.sum() == n * magic * (magic - 1) / 2)
            # print(v)
            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0
            if step > warmup_steps:
                durations.append(dur)
    ds = np.array([d * 1000 for d in durations])
    from kungfu._utils import show_duration
    print('test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)' %
          (len(ds), n, ds.mean(), ds.min(), ds.max()))
def build_optimizer(name, batch_size):
    learning_rate = 0.1
    # Scale learning rate according to the level of data parallelism
    optimizer = tf.train.GradientDescentOptimizer(learning_rate *
                                                  current_cluster_size())

    # KungFu: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    elif name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    elif name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

# KungFu: adjust learning rate based on number of GPUs.
# opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
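The snippet above is cut off inside training_step. A minimal sketch of how such a step typically continues, assuming the mnist_model, loss and wrapped opt defined above; the first-batch broadcast mirrors the benchmark snippet later in this section and makes all workers start from identical model and optimizer state.

@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    # KungFu: the wrapped optimizer synchronizes the update across workers
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
    if first_batch:
        # KungFu: broadcast variables after the first step so every worker
        # starts from the same weights and optimizer state
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(mnist_model.variables)
        broadcast_variables(opt.variables())
    return loss_value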
def test_peer_info():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
parser = argparse.ArgumentParser(description='Keras MNIST example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='kungfu optimizer')
args = parser.parse_args()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# KungFu: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(4.0 / current_cluster_size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
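The epoch count above is scaled down as workers are added so that the total amount of training work stays roughly constant; for illustration, with hypothetical cluster sizes:

# epochs = int(math.ceil(4.0 / current_cluster_size()))
# 1 worker  -> 4 epochs per worker
# 2 workers -> 2 epochs per worker
# 4 workers -> 1 epoch per worker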
    # cb_checkpointer = ModelCheckpoint(
    #     filepath='../working/best.hdf5', monitor='val_loss', save_best_only=True, mode='auto')
    # Accumulate history of all permutations (may be for viewing trend) and keep watching for lowest val_loss as final model
    model.fit(train_dataset,
              epochs=epochs,
              validation_data=val_dataset,
              callbacks=[BroadcastGlobalVariablesCallback()])
    # model.load_weights("../working/best.hdf5")


if __name__ == "__main__":
    n_shards = current_cluster_size()
    model = build_model(n_shards)
    train_dataset = process_data(TRAIN_DIR)
    val_dataset = process_data(VAL_DIR, sharding=False)
    train_model(model, train_dataset, val_dataset, epochs=NUM_EPOCHS)

"""### Keras Limitations
* [10/02/2018] *validation_split* is not supported in *fit_generator*, hence it expects an ImageDataGenerator with pre-split train & validation sets.
* [10/02/2018] Model learning through *fit_generator* is not compatible with Sklearn *GridSearchCV*, again *mostly* due to the missing support for *validation_split*.

### Followup Plan
1. Scale and pad the original image (avoiding aspect-ratio changes) through the Keras ImageDataGenerator pre-processing infrastructure
2. Image augmentation
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
parser.add_argument('--name',
                    type=str,
                    required=True,
                    help='name this experiment run for Tensorboard logging')
args = parser.parse_args()

DATASET_SIZE = 300
TRAIN_VAL_SPLIT = 0.8
NUM_EPOCHS = 15
BATCH_SIZE = 8
# adjust number of steps based on number of workers
NUM_STEPS = (DATASET_SIZE // BATCH_SIZE) // current_cluster_size()


def load_data():
    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())
    print(len(mnist_images))
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(mnist_labels, tf.int64)))
    # smaller dataset for quick testing
    smaller_dataset = dataset.take(DATASET_SIZE)
    split = int(DATASET_SIZE * TRAIN_VAL_SPLIT)
def parallel_train(train_model,dataset,config): '''Parallel train pipeline of openpose class models input model and dataset, the train pipeline will start automaticly the train pipeline will: 1.store and restore ckpt in directory ./save_dir/model_name/model_dir 2.log loss information in directory ./save_dir/model_name/log.txt 3.visualize model output periodly during training in directory ./save_dir/model_name/train_vis_dir the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz Parameters ---------- arg1 : tensorlayer.models.MODEL a preset or user defined model object, obtained by Model.get_model() function arg2 : dataset a constructed dataset object, obtained by Dataset.get_dataset() function Returns ------- None ''' init_log(config) #train hyper params #dataset params n_step = config.train.n_step batch_size = config.train.batch_size #learning rate params lr_init = config.train.lr_init lr_decay_factor = config.train.lr_decay_factor lr_decay_steps = config.train.lr_decay_steps warm_up_step=8000 warm_up_decay=0.01 weight_decay_factor = config.train.weight_decay_factor #log and checkpoint params log_interval=config.log.log_interval save_interval=config.train.save_interval vis_dir=config.train.vis_dir #model hyper params hin = train_model.hin win = train_model.win hout = train_model.hout wout = train_model.wout model_dir = config.model.model_dir pretrain_model_dir=config.pretrain.pretrain_model_dir pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz" #import kungfu from kungfu import current_cluster_size, current_rank from kungfu.tensorflow.initializer import broadcast_variables from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer log(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}") #training dataset configure with shuffle,augmentation,and prefetch train_dataset=dataset.get_train_dataset() dataset_type=dataset.get_dataset_type() parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,data_format=data_format) train_dataset = train_dataset.shuffle(buffer_size=4096) train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank()) train_dataset = train_dataset.repeat() train_dataset = train_dataset.map(paramed_map_fn,num_parallel_calls=max(multiprocessing.cpu_count()//2,1)) train_dataset = train_dataset.batch(batch_size) train_dataset = train_dataset.prefetch(64) #train configure step=tf.Variable(1, trainable=False) lr=tf.Variable(lr_init,trainable=False) lr_init=tf.Variable(lr_init,trainable=False) opt=tf.optimizers.Adam(learning_rate=lr) ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr) ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3) #load from ckpt log("loading ckpt...") try: ckpt.restore(ckpt_manager.latest_checkpoint) log("ckpt loaded successfully!") except: log("ckpt_path doesn't exist, step and optimizer are initialized") #load pretrained backbone log("loading pretrained backbone...") try: tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True) log("pretrained backbone loaded successfully") except: log("pretrained backbone doesn't exist, model backbone are initialized") #load model weights log("loading saved training model weights...") try: train_model.load_weights(os.path.join(model_dir,"newest_model.npz")) log("saved training model weights loaded 
successfully") except: log("model_path doesn't exist, model parameters are initialized") # KungFu configure kungfu_option=config.train.kungfu_option if kungfu_option == KUNGFU.Sync_sgd: print("using Kungfu.SynchronousSGDOptimizer!") opt = SynchronousSGDOptimizer(opt) elif kungfu_option == KUNGFU.Sync_avg: print("using Kungfu.SynchronousAveragingOptimize!") opt = SynchronousAveragingOptimizer(opt) elif kungfu_option == KUNGFU.Pair_avg: print("using Kungfu.PairAveragingOptimizer!") opt=PairAveragingOptimizer(opt) # KungFu adjust n_step = n_step // current_cluster_size() + 1 # KungFu for step_idx,step in enumerate(lr_decay_steps): lr_decay_steps[step_idx] = step // current_cluster_size() + 1 # KungFu for lr_decay_step in lr_decay_steps: if(step>lr_decay_step): lr=lr*lr_decay_factor #optimize one step @tf.function def one_step(image,gt_label,mask,train_model,is_first_batch=False): step.assign_add(1) with tf.GradientTape() as tape: gt_pif_maps,gt_paf_maps=gt_label pd_pif_maps,pd_paf_maps=train_model.forward(image,is_train=True) loss_pif_maps,loss_paf_maps,total_loss=train_model.cal_loss(pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps) decay_loss=regulize_loss(train_model,weight_decay_factor) total_loss+=decay_loss gradients=tape.gradient(total_loss,train_model.trainable_weights) opt.apply_gradients(zip(gradients,train_model.trainable_weights)) #Kung fu if(is_first_batch): broadcast_variables(train_model.all_weights) broadcast_variables(opt.variables()) return pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss #train each step train_model.train() tic=time.time() avg_time=AvgMetric(name="time_iter",metric_interval=log_interval) #total loss metrics avg_total_loss=AvgMetric(name="total_loss",metric_interval=log_interval) #decay loss metrics avg_decay_loss=AvgMetric(name="decay_loss",metric_interval=log_interval) #pif loss metrics avg_pif_conf_loss=AvgMetric(name="pif_conf_loss",metric_interval=log_interval) avg_pif_vec_loss=AvgMetric(name="pif_vec_loss",metric_interval=log_interval) avg_pif_scale_loss=AvgMetric(name="pif_scale_loss",metric_interval=log_interval) #paf loss metrics avg_paf_conf_loss=AvgMetric(name="paf_conf_loss",metric_interval=log_interval) avg_paf_src_vec_loss=AvgMetric(name="paf_src_vec_loss",metric_interval=log_interval) avg_paf_dst_vec_loss=AvgMetric(name="paf_dst_vec_loss",metric_interval=log_interval) avg_paf_src_scale_loss=AvgMetric(name="paf_src_scale_loss",metric_interval=log_interval) avg_paf_dst_scale_loss=AvgMetric(name="paf_dst_scale_loss",metric_interval=log_interval) log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {} weight_decay_factor: {}'.format( n_step, batch_size, lr_init.numpy(), lr_decay_steps, lr_decay_factor, weight_decay_factor)) for image,gt_label,mask,labeled in train_dataset: #get losses pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss=one_step(image,gt_label,mask,train_model,step==0) loss_pif_conf,loss_pif_vec,loss_pif_scale=loss_pif_maps loss_paf_conf,loss_paf_src_vec,loss_paf_dst_vec,loss_paf_src_scale,loss_paf_dst_scale=loss_paf_maps #update metrics avg_time.update(time.time()-tic) tic=time.time() #update total losses avg_total_loss.update(total_loss) #update decay loss avg_decay_loss.update(decay_loss) #update pif_losses metrics avg_pif_conf_loss.update(loss_pif_conf) avg_pif_vec_loss.update(loss_pif_vec) avg_pif_scale_loss.update(loss_pif_scale) #update paf_losses metrics avg_paf_conf_loss.update(loss_paf_conf) avg_paf_src_vec_loss.update(loss_paf_src_vec) 
avg_paf_dst_vec_loss.update(loss_paf_dst_vec) avg_paf_src_scale_loss.update(loss_paf_src_scale) avg_paf_dst_scale_loss.update(loss_paf_dst_scale) #learning rate decay if(step in lr_decay_steps): new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step)+1) lr=lr_init*new_lr_decay #warm_up learning rate decay if(step <= warm_up_step): lr=lr_init*warm_up_decay**(1.0-step/warm_up_step) #save log info periodly if((step.numpy()!=0) and (step.numpy()%log_interval)==0): log(f"Train iteration {n_step} / {step.numpy()}, Learning rate:{lr.numpy()} {avg_total_loss.get_metric()} "+\ f"{avg_pif_conf_loss.get_metric()} {avg_pif_vec_loss.get_metric()} {avg_pif_scale_loss.get_metric()}"+\ f"{avg_paf_conf_loss.get_metric()} {avg_paf_src_vec_loss.get_metric()} {avg_paf_dst_vec_loss.get_metric()}"+\ f"{avg_paf_src_scale_loss.get_metric()} {avg_paf_dst_scale_loss.get_metric()} {avg_decay_loss.get_metric()} {avg_time.get_metric()}") #save result and ckpt periodly if((step.numpy()!=0) and (step.numpy()%save_interval)==0): #save ckpt log("saving model ckpt and result...") ckpt_save_path=ckpt_manager.save() log(f"ckpt save_path:{ckpt_save_path} saved!\n") #save train model model_save_path=os.path.join(model_dir,"newest_model.npz") train_model.save_weights(model_save_path) log(f"model save_path:{model_save_path} saved!\n") #draw result stride=train_model.stride gt_pif_maps,gt_paf_maps=gt_label draw_result(image,pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps,mask,parts,limbs,stride,save_dir=vis_dir,\ name=f"train_{step.numpy()}") #training finished if(step==n_step): break
def parallel_train(train_model, dataset, config): '''Parallel train pipeline of openpose class models input model and dataset, the train pipeline will start automaticly the train pipeline will: 1.store and restore ckpt in directory ./save_dir/model_name/model_dir 2.log loss information in directory ./save_dir/model_name/log.txt 3.visualize model output periodly during training in directory ./save_dir/model_name/train_vis_dir the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz Parameters ---------- arg1 : tensorlayer.models.MODEL a preset or user defined model object, obtained by Model.get_model() function arg2 : dataset a constructed dataset object, obtained by Dataset.get_dataset() function Returns ------- None ''' init_log(config) #train hyper params #dataset params n_step = config.train.n_step batch_size = config.train.batch_size #learning rate params lr_init = config.train.lr_init lr_decay_factor = config.train.lr_decay_factor lr_decay_every_step = config.train.lr_decay_every_step weight_decay_factor = config.train.weight_decay_factor #log and checkpoint params log_interval = config.log.log_interval save_interval = config.train.save_interval vis_dir = config.train.vis_dir #model hyper params n_pos = train_model.n_pos hin = train_model.hin win = train_model.win hout = train_model.hout wout = train_model.wout data_format = train_model.data_format model_dir = config.model.model_dir #import kungfu from kungfu import current_cluster_size, current_rank from kungfu.tensorflow.initializer import broadcast_variables #training dataset configure with shuffle,augmentation,and prefetch train_dataset = dataset.get_train_dataset() dataset_type = dataset.get_dataset_type() parts, limbs, kpt_cvter = get_parts(dataset_type), get_limbs( dataset_type), get_input_kptcvter(dataset_type) flip_list = get_flip_list(dataset_type) paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, parts, limbs, kpt_cvter, flip_list=flip_list, data_format=dataset_format) train_dataset = train_dataset.shuffle(buffer_size=4096) train_dataset = train_dataset.shard(num_shards=current_cluster_size(), index=current_rank()) train_dataset = train_dataset.repeat() train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4) train_dataset = train_dataset.batch(batch_size) train_dataset = train_dataset.prefetch(64) #train model configure step = tf.Variable(1, trainable=False) lr = tf.Variable(lr_init, trainable=False) opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9) ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr) ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3) #load from ckpt try: ckpt.restore(ckpt_manager.latest_checkpoint) except: log("ckpt_path doesn't exist, step and optimizer are initialized") try: train_model.load_weights(os.path.join(model_dir, "newest_model.npz")) except: log("model_path doesn't exist, model parameters are initialized") # KungFu configure opt = get_kungfu_opt(config.train.kungfu_option, opt) n_step = n_step // current_cluster_size() + 1 # KungFu lr_decay_every_step = lr_decay_every_step // current_cluster_size( ) + 1 # KungFu #optimize one step @tf.function def one_step(image, gt_label, mask, train_model, is_first_batch=False): step.assign_add(1) with tf.GradientTape() as tape: gt_conf = gt_label[:, :n_pos, :, :, ] gt_paf = gt_label[:, n_pos:, :, :] pd_conf, pd_paf, stage_confs, stage_pafs = train_model.forward( image, is_train=True) pd_loss = train_model.cal_loss(gt_conf, gt_paf, mask, stage_confs, stage_pafs) re_loss = 
regulize_loss(train_model, weight_decay_factor) total_loss = pd_loss + re_loss gradients = tape.gradient(total_loss, train_model.trainable_weights) opt.apply_gradients(zip(gradients, train_model.trainable_weights)) #Kung fu if (is_first_batch): broadcast_variables(train_model.all_weights) broadcast_variables(opt.variables()) return gt_conf, gt_paf, pd_conf, pd_paf, total_loss, re_loss #train each step tic = time.time() train_model.train() log(f"Worker {current_rank()}: Initialized") log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}' .format(n_step, batch_size, lr_init, lr_decay_every_step)) for image, gt_label, mask in train_dataset: #learning rate decay if (step % lr_decay_every_step == 0): new_lr_decay = lr_decay_factor**(step // lr_decay_every_step) lr = lr_init * new_lr_decay #optimize one step gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\ train_model,step==0) #save log info periodly if ((step != 0) and (step % log_interval) == 0): tic = time.time() log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}' .format(step.numpy(), n_step, total_loss, lr.numpy(), re_loss, time.time() - tic)) #save result and ckpt periodly if ((step != 0) and (step % save_interval) == 0 and current_rank() == 0): log("saving model ckpt and result...") draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\ vis_dir,'train_%d_' % step) ckpt_save_path = ckpt_manager.save() log(f"ckpt save_path:{ckpt_save_path} saved!\n") model_save_path = os.path.join(model_dir, "newest_model.npz") train_model.save_weights(model_save_path) log(f"model save_path:{model_save_path} saved!\n") #training finished if (step == n_step): break
     tf.float32), tf.cast(y_train, tf.int64)))
train_dataset = train_dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])

# KungFu: adjust learning rate based on number of GPUs.
opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
# opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')

mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                    optimizer=opt,
                    metrics=['accuracy'])
def run(flags_obj): """Run ResNet Cifar-10 training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == 'fp16': raise ValueError( 'dtype fp16 is not supported in Keras. Use the default ' 'value(fp32).') data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, num_workers=distribution_utils.configure_cluster(), all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=cifar_preprocessing.HEIGHT, width=cifar_preprocessing.WIDTH, num_channels=cifar_preprocessing.NUM_CHANNELS, num_classes=cifar_preprocessing.NUM_CLASSES, dtype=flags_core.get_tf_dtype(flags_obj), drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = cifar_preprocessing.input_fn #train_input_dataset = input_fn( # is_training=True, # data_dir=flags_obj.data_dir, # batch_size=flags_obj.batch_size, # num_epochs=flags_obj.train_epochs, # parse_record_fn=cifar_preprocessing.parse_record, # datasets_num_private_threads=flags_obj.datasets_num_private_threads, # dtype=dtype, # # Setting drop_remainder to avoid the partial batch logic in normalization # # layer, which triggers tf.where and leads to extra memory copy of input # # sizes between host and GPU. 
# drop_remainder=(not flags_obj.enable_get_next_as_optional)) # eval_input_dataset = None # if not flags_obj.skip_eval: # eval_input_dataset = input_fn( # is_training=False, # data_dir=flags_obj.data_dir, # batch_size=flags_obj.batch_size, # num_epochs=flags_obj.train_epochs, # parse_record_fn=cifar_preprocessing.parse_record) (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data() x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 y_train = tf.keras.utils.to_categorical(y_train, num_classes) y_test = tf.keras.utils.to_categorical(y_test, num_classes) # optimizer = common.get_optimizer() opt = tf.keras.optimizers.SGD(learning_rate=0.1) logging.info(opt.__dict__) optimizer = SynchronousSGDOptimizer(opt, use_locking=True) optimizer._hyper = opt._hyper logging.info(optimizer.__dict__) model = Conv4_model(x_train, num_classes) # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer # a valid arg for this model. Also remove as a valid flag. if flags_obj.force_v2_in_keras_compile is not None: model.compile( loss='categorical_crossentropy', optimizer=optimizer, metrics=(['accuracy']), run_eagerly=flags_obj.run_eagerly, experimental_run_tf_function=flags_obj.force_v2_in_keras_compile) else: model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=(['accuracy']), run_eagerly=flags_obj.run_eagerly) cluster_size = current_cluster_size() steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) steps_per_epoch = steps_per_epoch // cluster_size train_epochs = flags_obj.train_epochs callbacks = common.get_callbacks(steps_per_epoch, current_rank(), cluster_size, learning_rate_schedule) callbacks.append(BroadcastGlobalVariablesCallback()) if flags_obj.train_steps: steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) # validation_data = eval_input_dataset if flags_obj.skip_eval: if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None tf.compat.v1.logging.info(x_train.shape) history = model.fit(x_train, y_train, batch_size=flags_obj.batch_size, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=(x_test, y_test), validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate((x_test, y_test), steps=num_eval_steps, verbose=2) stats = common.build_stats(history, eval_output, callbacks) return stats
    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())


def log(s, nl=True):
    if current_rank() != 0:
        return
    print(s, end='\n' if nl else '')


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, current_cluster_size()))

with tf.device(device):
    # Warm-up
    log('Running warmup...')
    benchmark_step(first_batch=True)
    timeit.timeit(lambda: benchmark_step(first_batch=False),
                  number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        # log('Unix epoch time before iter #{}: {}'.format(x, pytime.time()))
        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
                             number=args.num_batches_per_iter)
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) if dtype == tf.float16: loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128) policy = tf.compat.v2.keras.mixed_precision.experimental.Policy( 'mixed_float16', loss_scale=loss_scale) tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy) if not keras_utils.is_v2_0(): raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.') elif dtype == tf.bfloat16: policy = tf.compat.v2.keras.mixed_precision.experimental.Policy( 'mixed_bfloat16') tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) preprocessing_seed = 12345 # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, num_channels=imagenet_preprocessing.NUM_CHANNELS, num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype, drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_preprocessing.input_fn # When `enable_xla` is True, we always drop the remainder of the batches # in the dataset, as XLA-GPU doesn't support dynamic shapes. 
drop_remainder = flags_obj.enable_xla train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=imagenet_preprocessing.parse_record, datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, drop_remainder=drop_remainder, random_seed=preprocessing_seed, #addition num_workers=current_cluster_size(), #addition worker_ID=current_rank(), #addition tf_data_experimental_slack=flags_obj.tf_data_experimental_slack, training_dataset_cache=flags_obj.training_dataset_cache, ) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, num_epochs=flags_obj.train_epochs, parse_record_fn=imagenet_preprocessing.parse_record, dtype=dtype, drop_remainder=drop_remainder) lr_schedule = 0.1 if flags_obj.use_tensor_lr: lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) # Build KungFu optimizer opt = common.get_optimizer(lr_schedule) # logging.info(opt.__dict__) optimizer = SynchronousSGDOptimizer(opt, reshape=False, use_locking=True) optimizer._hyper = opt._hyper # logging.info(optimizer.__dict__) if flags_obj.fp16_implementation == 'graph_rewrite': # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32' # which will ensure tf.compat.v2.keras.mixed_precision and # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double # up. optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) # TODO(hongkuny): Remove trivial model usage and move it to benchmark. if flags_obj.use_trivial_model: model = trivial_model.trivial_model(imagenet_preprocessing.NUM_CLASSES) else: model = resnet_model.resnet50( num_classes=imagenet_preprocessing.NUM_CLASSES) # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer # a valid arg for this model. Also remove as a valid flag. 
metrics = (['sparse_categorical_accuracy']) metrics.append('sparse_top_k_categorical_accuracy') if flags_obj.force_v2_in_keras_compile is not None: model.compile( loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=metrics, run_eagerly=flags_obj.run_eagerly, experimental_run_tf_function=flags_obj.force_v2_in_keras_compile) else: model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=metrics, run_eagerly=flags_obj.run_eagerly) # adjust number of steps cluster_size = current_cluster_size() steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) steps_per_epoch = steps_per_epoch // cluster_size train_epochs = flags_obj.train_epochs callbacks = common.get_callbacks(steps_per_epoch, current_rank(), cluster_size, common.learning_rate_schedule) # Broadcast variables for KungFu callbacks.append(BroadcastGlobalVariablesCallback()) # Checkpoint callback only on worker 0 if flags_obj.enable_checkpoint_and_export and current_rank() == 0: ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}') callbacks.append( tf.keras.callbacks.ModelCheckpoint(ckpt_full_path, save_weights_only=True)) if flags_obj.train_steps: steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) # Checkpoint only on 0th worker if flags_obj.enable_checkpoint_and_export and current_rank() == 0: if dtype == tf.bfloat16: logging.warning( "Keras model.save does not support bfloat16 dtype.") else: # Keras model.save assumes a float32 input designature. export_path = os.path.join(flags_obj.model_dir, 'saved_model') model.save(export_path, include_optimizer=False) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = common.build_stats(history, eval_output, callbacks) return stats
def parallel_train(train_model,dataset,config): '''Parallel train pipeline of openpose class models input model and dataset, the train pipeline will start automaticly the train pipeline will: 1.store and restore ckpt in directory ./save_dir/model_name/model_dir 2.log loss information in directory ./save_dir/model_name/log.txt 3.visualize model output periodly during training in directory ./save_dir/model_name/train_vis_dir the newest model is at path ./save_dir/model_name/model_dir/newest_model.npz Parameters ---------- arg1 : tensorlayer.models.MODEL a preset or user defined model object, obtained by Model.get_model() function arg2 : dataset a constructed dataset object, obtained by Dataset.get_dataset() function Returns ------- None ''' init_log(config) #train hyper params #dataset params n_step = config.train.n_step batch_size = config.train.batch_size #learning rate params lr_init = config.train.lr_init lr_decay_factor = config.train.lr_decay_factor lr_decay_steps = [200000,300000,360000,420000,480000,540000,600000,700000,800000,900000] weight_decay_factor = config.train.weight_decay_factor #log and checkpoint params log_interval=config.log.log_interval save_interval=config.train.save_interval vis_dir=config.train.vis_dir #model hyper params n_pos = train_model.n_pos hin = train_model.hin win = train_model.win hout = train_model.hout wout = train_model.wout model_dir = config.model.model_dir pretrain_model_dir=config.pretrain.pretrain_model_dir pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz" #import kungfu from kungfu import current_cluster_size, current_rank from kungfu.tensorflow.initializer import broadcast_variables from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}") #training dataset configure with shuffle,augmentation,and prefetch train_dataset=dataset.get_train_dataset() dataset_type=dataset.get_dataset_type() parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format flip_list=get_flip_list(dataset_type) paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,flip_list=flip_list,data_format=data_format) train_dataset = train_dataset.shuffle(buffer_size=4096) train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank()) train_dataset = train_dataset.repeat() train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4) train_dataset = train_dataset.batch(batch_size) train_dataset = train_dataset.prefetch(64) #train model configure step=tf.Variable(1, trainable=False) lr=tf.Variable(lr_init,trainable=False) if(config.model.model_type==MODEL.Openpose): opt=tf.keras.optimizers.RMSprop(learning_rate=lr) else: opt=tf.keras.optimizers.Adam(learning_rate=lr) ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr) ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3) #load from ckpt try: log("loading ckpt...") ckpt.restore(ckpt_manager.latest_checkpoint) except: log("ckpt_path doesn't exist, step and optimizer are initialized") #load pretrained backbone try: log("loading pretrained backbone...") tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True) except: log("pretrained backbone doesn't exist, model backbone are initialized") #load model weights try: train_model.load_weights(os.path.join(model_dir,"newest_model.npz")) except: log("model_path doesn't exist, model 
parameters are initialized") # KungFu configure kungfu_option=config.train.kungfu_option if kungfu_option == KUNGFU.Sync_sgd: print("using Kungfu.SynchronousSGDOptimizer!") opt = SynchronousSGDOptimizer(opt) elif kungfu_option == KUNGFU.Sync_avg: print("using Kungfu.SynchronousAveragingOptimize!") opt = SynchronousAveragingOptimizer(opt) elif kungfu_option == KUNGFU.Pair_avg: print("using Kungfu.PairAveragingOptimizer!") opt=PairAveragingOptimizer(opt) n_step = n_step // current_cluster_size() + 1 # KungFu for step_idx,step in enumerate(lr_decay_steps): lr_decay_steps[step_idx] = step // current_cluster_size() + 1 # KungFu #optimize one step @tf.function def one_step(image,gt_label,mask,train_model,is_first_batch=False): step.assign_add(1) with tf.GradientTape() as tape: gt_conf=gt_label[:,:n_pos,:,:] gt_paf=gt_label[:,n_pos:,:,:] pd_conf,pd_paf,stage_confs,stage_pafs=train_model.forward(image,is_train=True) pd_loss,loss_confs,loss_pafs=train_model.cal_loss(gt_conf,gt_paf,mask,stage_confs,stage_pafs) re_loss=regulize_loss(train_model,weight_decay_factor) total_loss=pd_loss+re_loss gradients=tape.gradient(total_loss,train_model.trainable_weights) opt.apply_gradients(zip(gradients,train_model.trainable_weights)) #Kung fu if(is_first_batch): broadcast_variables(train_model.all_weights) broadcast_variables(opt.variables()) return gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss #train each step tic=time.time() train_model.train() log(f"Worker {current_rank()}: Initialized") log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {}'.format( n_step, batch_size, lr_init, lr_decay_steps, lr_decay_factor)) for image,gt_label,mask in train_dataset: #learning rate decay if(step in lr_decay_steps): new_lr_decay = lr_decay_factor**(float(lr_decay_steps.index(step)+1)) lr=lr_init*new_lr_decay #optimize one step gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\ train_model,step==0) #save log info periodly if((step.numpy()!=0) and (step.numpy()%log_interval)==0): tic=time.time() log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'.format( step.numpy(), n_step, total_loss, lr.numpy(), re_loss,time.time()-tic)) #save result and ckpt periodly if((step!=0) and (step%save_interval)==0 and current_rank()==0): log("saving model ckpt and result...") draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\ vis_dir,'train_%d_' % step) ckpt_save_path=ckpt_manager.save() log(f"ckpt save_path:{ckpt_save_path} saved!\n") model_save_path=os.path.join(model_dir,"newest_model.npz") train_model.save_weights(model_save_path) log(f"model save_path:{model_save_path} saved!\n") #training finished if(step==n_step): break
    y_test = keras.utils.to_categorical(y_test, num_classes)
    scores = model.evaluate(x_test, y_test, verbose=1)
    return scores


def f_data(x_train, y_train):
    x_train = x_train.astype('float32')
    x_train /= 255
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)
    return x_train, y_train


if __name__ == "__main__":
    logging.basicConfig(filename="tf2_Conv4_CIFAR10_exp_0.log",
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")

    # Load data
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    class_names = [
        "airplane", "automobile", "bird", "cat", "deer", "dog", "frog",
        "horse", "ship", "truck"
    ]

    # Pre process data
    x_train, y_train = preprocess_data.process(f_data, x_train, y_train)

    optimizer = build_optimizer('sync-sgd', n_workers=current_cluster_size())
    model = build_model(optimizer, x_train, num_classes)
    train_model(model, args.name, x_train, x_test, y_train, y_test)
def train(parallel, kungfu_option):
    Gab = models.get_G(name='Gab')
    Gba = models.get_G(name='Gba')
    Da = models.get_D(name='Da')
    Db = models.get_D(name='Db')
    Gab.train()
    Gba.train()
    Da.train()
    Db.train()

    lr_v = tf.Variable(flags.lr_init)
    # optimizer_Gab_Db = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_Gba_Da = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_G = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_D = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    optimizer = tf.optimizers.Adam(
        lr_v, beta_1=flags.beta_1
    )  # use only one optimizer, if your GPU memory is large

    use_ident = False

    # KungFu: wrap the optimizer
    if parallel:
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer
        if kungfu_option == 'sync-sgd':
            opt_fn = SynchronousSGDOptimizer
        elif kungfu_option == 'async-sgd':
            opt_fn = PairAveragingOptimizer
        elif kungfu_option == 'sma':
            opt_fn = SynchronousAveragingOptimizer
        else:
            raise RuntimeError('Unknown distributed training optimizer.')
        # wrap the single shared optimizer defined above
        # (the per-model optimizers are commented out)
        optimizer = opt_fn(optimizer)

    # Gab.load_weights(flags.model_dir + '/Gab.h5')  # restore params?
    # Gba.load_weights(flags.model_dir + '/Gba.h5')
    # Da.load_weights(flags.model_dir + '/Da.h5')
    # Db.load_weights(flags.model_dir + '/Db.h5')

    # KungFu: shard the data
    if parallel:
        from kungfu import current_cluster_size, current_rank
        data_A_shard = []
        data_B_shard = []
        for step, (image_A, image_B) in enumerate(zip(data_A, data_B)):
            if step % current_cluster_size() == current_rank():
                data_A_shard.append(image_A)
                data_B_shard.append(image_B)
    else:
        data_A_shard = data_A
        data_B_shard = data_B

    @tf.function
    def train_step(image_A, image_B):
        fake_B = Gab(image_A)
        fake_A = Gba(image_B)
        cycle_A = Gba(fake_B)
        cycle_B = Gab(fake_A)
        if use_ident:
            iden_A = Gba(image_A)
            iden_B = Gab(image_B)
        logits_fake_B = Db(fake_B)  # TODO: missing image buffer (pool)
        logits_real_B = Db(image_B)
        logits_fake_A = Da(fake_A)
        logits_real_A = Da(image_A)

        # loss_Da = (tl.cost.mean_squared_error(logits_real_A, tf.ones_like(logits_real_A), is_mean=True) + \  # LSGAN
        #            tl.cost.mean_squared_error(logits_fake_A, tf.ones_like(logits_fake_A), is_mean=True)) / 2.
        loss_Da = tf.reduce_mean(tf.math.squared_difference(logits_fake_A, tf.zeros_like(logits_fake_A))) + \
                  tf.reduce_mean(tf.math.squared_difference(logits_real_A, tf.ones_like(logits_real_A)))
        # loss_Da = tl.cost.sigmoid_cross_entropy(logits_fake_A, tf.zeros_like(logits_fake_A)) + \
        #           tl.cost.sigmoid_cross_entropy(logits_real_A, tf.ones_like(logits_real_A))
        # loss_Db = (tl.cost.mean_squared_error(logits_real_B, tf.ones_like(logits_real_B), is_mean=True) + \  # LSGAN
        #            tl.cost.mean_squared_error(logits_fake_B, tf.ones_like(logits_fake_B), is_mean=True)) / 2.
loss_Db = tf.reduce_mean(tf.math.squared_difference(logits_fake_B, tf.zeros_like(logits_fake_B))) + \ tf.reduce_mean(tf.math.squared_difference(logits_real_B, tf.ones_like(logits_real_B))) # loss_Db = tl.cost.sigmoid_cross_entropy(logits_fake_B, tf.zeros_like(logits_fake_B)) + \ # tl.cost.sigmoid_cross_entropy(logits_real_B, tf.ones_like(logits_real_B)) # loss_Gab = tl.cost.mean_squared_error(logits_fake_B, tf.ones_like(logits_fake_B), is_mean=True) # LSGAN loss_Gab = tf.reduce_mean( tf.math.squared_difference(logits_fake_B, tf.ones_like(logits_fake_B))) # loss_Gab = tl.cost.sigmoid_cross_entropy(logits_fake_B, tf.ones_like(logits_fake_B)) # loss_Gba = tl.cost.mean_squared_error(logits_fake_A, tf.ones_like(logits_fake_A), is_mean=True) # LSGAN loss_Gba = tf.reduce_mean( tf.math.squared_difference(logits_fake_A, tf.ones_like(logits_fake_A))) # loss_Gba = tl.cost.sigmoid_cross_entropy(logits_fake_A, tf.ones_like(logits_fake_A)) # loss_cyc = 10 * (tl.cost.absolute_difference_error(image_A, cycle_A, is_mean=True) + \ # tl.cost.absolute_difference_error(image_B, cycle_B, is_mean=True)) loss_cyc = 10. * (tf.reduce_mean(tf.abs(image_A - cycle_A)) + tf.reduce_mean(tf.abs(image_B - cycle_B))) if use_ident: loss_iden = 5. * (tf.reduce_mean(tf.abs(image_A - iden_A)) + tf.reduce_mean(tf.abs(image_B - iden_B))) else: loss_iden = 0. loss_G = loss_Gab + loss_Gba + loss_cyc + loss_iden loss_D = loss_Da + loss_Db return loss_G, loss_D, loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db, loss_D + loss_G for epoch in range(0, flags.n_epoch): # reduce lr linearly after 100 epochs, from lr_init to 0 if epoch >= 100: new_lr = flags.lr_init - flags.lr_init * (epoch - 100) / 100 lr_v.assign(lr_v, new_lr) print("New learning rate %f" % new_lr) # train 1 epoch for step, (image_A, image_B) in enumerate(zip(data_A_shard, data_B_shard)): if image_A.shape[0] != flags.batch_size or image_B.shape[ 0] != flags.batch_size: # if the remaining data in this epoch < batch_size break step_time = time.time() with tf.GradientTape(persistent=True) as tape: # print(image_A.numpy().max()) loss_G, loss_D, loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db, loss_DG = train_step( image_A, image_B) grad = tape.gradient( loss_DG, Gba.trainable_weights + Gab.trainable_weights + Da.trainable_weights + Db.trainable_weights) optimizer.apply_gradients( zip( grad, Gba.trainable_weights + Gab.trainable_weights + Da.trainable_weights + Db.trainable_weights)) # grad = tape.gradient(loss_G, Gba.trainable_weights+Gab.trainable_weights) # optimizer_G.apply_gradients(zip(grad, Gba.trainable_weights+Gab.trainable_weights)) # grad = tape.gradient(loss_D, Da.trainable_weights+Db.trainable_weights) # optimizer_D.apply_gradients(zip(grad, Da.trainable_weights+Db.trainable_weights)) # del tape print("Epoch[{}/{}] step[{}/{}] time:{:.3f} Gab:{:.3f} Gba:{:.3f} cyc:{:.3f} iden:{:.3f} Da:{:.3f} Db:{:.3f}".format(\ epoch, flags.n_epoch, step, n_step_per_epoch, time.time()-step_time, \ loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db)) if parallel and step == 0: # KungFu: broadcast is done after the first gradient step to ensure optimizer initialization. 
                from kungfu.tensorflow.initializer import broadcast_variables
                # Broadcast model variables
                broadcast_variables(Gab.trainable_weights)
                broadcast_variables(Gba.trainable_weights)
                broadcast_variables(Da.trainable_weights)
                broadcast_variables(Db.trainable_weights)
                # Broadcast optimizer variables (only the single shared optimizer is used)
                broadcast_variables(optimizer.variables())

        if parallel:
            from kungfu import current_rank
            is_chief = current_rank() == 0
        else:
            is_chief = True

        # Let the chief worker do visualization and checkpoints.
        if is_chief:
            # visualization
            # outb = Gab(sample_A)
            # outa = Gba(sample_B)
            # tl.vis.save_images(outb.numpy(), [1, 5], flags.sample_dir+'/{}_a2b.png'.format(epoch))
            # tl.vis.save_images(outa.numpy(), [1, 5], flags.sample_dir+'/{}_b2a.png'.format(epoch))
            outb_list = []  # do it one by one in case your GPU memory is low
            for i in range(len(sample_A)):
                outb = Gab(sample_A[i][np.newaxis, :, :, :])
                outb_list.append(outb.numpy()[0])
            outa_list = []
            for i in range(len(sample_B)):
                outa = Gba(sample_B[i][np.newaxis, :, :, :])
                outa_list.append(outa.numpy()[0])
            tl.vis.save_images(np.asarray(outb_list), [1, 5],
                               flags.sample_dir + '/{}_a2b.png'.format(epoch))
            tl.vis.save_images(np.asarray(outa_list), [1, 5],
                               flags.sample_dir + '/{}_b2a.png'.format(epoch))

            # save models
            if epoch % 5:
                Gab.save_weights(flags.model_dir + '/Gab.h5')
                Gba.save_weights(flags.model_dir + '/Gba.h5')
                Da.save_weights(flags.model_dir + '/Da.h5')
                Db.save_weights(flags.model_dir + '/Db.h5')
def parallel_train(training_dataset, kungfu_option):
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)
    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()

    net, total_loss, log_tensors = make_model(*one_element, is_train=True, reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2  # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # The linear scaling rule does not work well for openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu: wrap the optimizer according to the chosen strategy
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu: broadcast rank 0's initial weights to all other workers
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size() + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu
        print('Worker{}: Initialized'.format(current_rank()))
        print('Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'.format(
            current_rank(), n_step, batch_size, lr_init, lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess, name=os.path.join(model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print('Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'.format(
                current_rank(), step, n_step, _loss, lr, _l2, time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:'.format(current_rank()), 'Network#', ix, 'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if current_rank() == 0:  # KungFu: only the chief worker saves
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [img_out, confs_ground, pafs_ground, conf_result, paf_result, mask_out] = \
                        sess.run([x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result, pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
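# Stripped of the OpenPose specifics, the session-based recipe above reduces to:
# wrap the base optimizer, initialise variables, then run a broadcast op once
# before training. A minimal sketch on a toy least-squares graph follows; the
# graph, learning rate and step count are made up purely for illustration.
import tensorflow as tf
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp

x = tf.random.uniform([32, 4])
w = tf.Variable(tf.zeros([4, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))

opt = SynchronousSGDOptimizer(tf.train.MomentumOptimizer(0.01, 0.9))
train_op = opt.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Broadcast rank 0's initial weights so every worker starts identically.
    sess.run(BroadcastGlobalVariablesOp())
    for _ in range(100):
        sess.run(train_op)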
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of PoseProposal class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The pipeline will:
    1. store and restore checkpoints in the directory ./save_dir/model_name/model_dir
    2. log loss information in the file ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in the directory ./save_dir/model_name/train_vis_dir
    The newest model is saved at ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user-defined model object, obtained by the Model.get_model() function

    arg2 : dataset
        a constructed dataset object, obtained by the Dataset.get_dataset() function

    Returns
    -------
    None
    '''
    init_log(config)

    # train hyper params
    # dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    hnei = train_model.hnei
    wnei = train_model.wnei
    model_dir = config.model.model_dir

    # import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")

    # training dataset configured with sharding, shuffle, augmentation and prefetch
    train_dataset = dataset.get_train_dataset()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, hnei, wnei, parts, limbs, data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(), index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=2)

    # train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    # load from ckpt
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure: wrap the optimizer according to the chosen strategy
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    n_step = n_step // current_cluster_size() + 1  # KungFu: divide the step budget across workers

    # optimize one step
    @tf.function
    def one_step(image, targets, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            delta, tx, ty, tw, th, te, te_mask = targets
            pc, pi, px, py, pw, ph, pe = train_model.forward(image, is_train=True)
            loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = \
                train_model.cal_loss(delta, tx, ty, tw, th, te, te_mask, pc, pi, px, py, pw, ph, pe)
            pd_loss = loss_rsp + loss_iou + loss_coor + loss_size + loss_limb
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss

        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # KungFu: broadcast weights and optimizer state once the slot variables exist
        if is_first_batch:
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        predicts = (pc, px, py, pw, ph, pe)
        return predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb

    # train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log(f'Start - n_step: {n_step} batch_size: {batch_size} lr_init: {lr_init} lr_decay_factor: {lr_decay_factor}')
    avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.
    is_first_batch = True
    for image, targets in train_dataset:
        # learning rate decay (assign to the variable so the optimizer sees the new value)
        lr.assign(lr_init * (1 - float(step.numpy()) / n_step * lr_decay_factor))

        # optimize one step; the first batch also broadcasts the initial state
        predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = \
            one_step(image, targets, train_model, is_first_batch)
        is_first_batch = False

        avg_loss_rsp += loss_rsp / log_interval
        avg_loss_iou += loss_iou / log_interval
        avg_loss_coor += loss_coor / log_interval
        avg_loss_size += loss_size / log_interval
        avg_loss_limb += loss_limb / log_interval
        avg_pd_loss += pd_loss / log_interval
        avg_re_loss += re_loss / log_interval

        # save log info periodically
        if (step != 0) and (step % log_interval) == 0:
            log(f"worker:{current_rank()} Train iteration {step.numpy()}/{n_step}, learning rate:{lr.numpy()}," +
                f"loss_rsp:{avg_loss_rsp},loss_iou:{avg_loss_iou},loss_coor:{avg_loss_coor},loss_size:{avg_loss_size}," +
                f"loss_limb:{avg_loss_limb},loss_pd:{avg_pd_loss},loss_re:{avg_re_loss} ,time:{time.time()-tic}")
            avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.
            tic = time.time()

        # save result and ckpt periodically
        if (step != 0) and (step % save_interval) == 0:
            log("saving model ckpt and result...")
            draw_results(image.numpy(), predicts, targets, parts, limbs, save_dir=vis_dir,
                         name=f"ppn_step_{step.numpy()}")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        # training finished
        if step == n_step:
            break
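# Both parallel_train functions shrink the per-worker step budget with
# n_step // current_cluster_size() + 1, so the total work across the cluster
# stays roughly the same as single-GPU training. The numbers below are made up,
# purely to spell out that arithmetic.
n_step = 1000        # step budget configured for single-GPU training
cluster_size = 4     # number of KungFu workers
per_worker_steps = n_step // cluster_size + 1
print(per_worker_steps)  # 251: each worker runs about a quarter of the steps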