Example #1
def process_data(data_dir, sharding=True):
    ds = tf.data.Dataset.list_files(str(data_dir + "/*/*"))

    if sharding:
        ds = ds.shard(num_shards=current_cluster_size(),
                      index=current_rank())

    ds = ds.map(process_path, num_parallel_calls=AUTOTUNE)

    ds = ds.batch(BATCH_SIZE_TRAINING)
    return ds
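process_data assumes that process_path, AUTOTUNE, and BATCH_SIZE_TRAINING are defined at module scope. A minimal sketch of those assumed definitions and a per-worker call (the decode logic and the constants are illustrative, not part of the original example):

import tensorflow as tf
from kungfu import current_cluster_size, current_rank

AUTOTUNE = tf.data.experimental.AUTOTUNE  # assumed constant
BATCH_SIZE_TRAINING = 32                  # assumed constant

def process_path(path):
    # Illustrative decode step: read a JPEG file and pair it with a dummy label.
    image = tf.io.decode_jpeg(tf.io.read_file(path), channels=3)
    return tf.image.resize(image, [224, 224]), 0

# With sharding=True, each worker keeps every k-th file (k = cluster size),
# so the workers train on disjoint subsets of the input files.
train_ds = process_data("/data/train")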
Example #2
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):

    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KUNGFU: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KUNGFU: The KungFu initializer defines how model weights are initialised on distributed devices
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
Example #3
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):

    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variable
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
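Both variants delegate evaluation to a test_mnist helper that this listing does not include. A minimal sketch of what such a helper might look like, assuming test_op computes the accuracy of the fed batch:

def test_mnist(sess, x, y_, test_op, dataset):
    # Evaluate the accuracy op on the whole (small) evaluation set in one feed.
    return sess.run(test_op, {x: dataset['x'], y_: dataset['y']})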
Example #4
def test_group_all_gather():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather
    rank = current_rank()
    np = current_cluster_size()
    sizes = [i + 1 for i in range(5)]
    xs = [(rank + 1) * tf.Variable(tf.ones([n], tf.int32)) for n in sizes]
    ys = [all_gather(x) for x in xs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, y in enumerate(ys):
            v = sess.run(y)
            assert (v.sum() == (np + 1) * np / 2 * (i + 1))
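Why the assertion holds: each of the np peers contributes a length-(i + 1) vector filled with rank + 1, so the gathered sum is (i + 1) * (1 + 2 + ... + np) = (i + 1) * np * (np + 1) / 2. For example, with np = 3 and i = 1 the gathered tensor concatenates [1, 1], [2, 2] and [3, 3], which sums to 12 = 2 * 3 * 4 / 2.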
Example #5
def test_consensus():
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus

    np = current_cluster_size()
    rank = current_rank()

    x = tf.Variable(rank, dtype=tf.int32)
    consensus_check = consensus(x)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(consensus_check)

        assert v == (np == 1)
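The consensus op reports whether all peers hold an identical value. Since each peer sets x to its own rank, the values can only agree in a single-peer cluster, hence the expected result v == (np == 1).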
Example #6
def build_optimizer():
    # KungFu: adjust learning rate based on number of GPUs.
    # opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
    opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

    # KungFu: wrap tf.compat.v1.train.Optimizer.
    if args.kf_optimizer == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif args.kf_optimizer == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif args.kf_optimizer == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown KungFu optimizer')

    return opt
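This snippet assumes that args and the KungFu optimizer wrappers are already in scope. A sketch of the assumed flag parsing and imports, following the patterns used elsewhere in this listing:

import argparse

from kungfu import current_cluster_size
from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                          SynchronousAveragingOptimizer,
                                          SynchronousSGDOptimizer)

parser = argparse.ArgumentParser()
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
args = parser.parse_args()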
Example #7
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset of this KungFu node's data shard
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]
    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
Example #8
def load_data_per_node(x_train, y_train, dataset_size, backup_worker_id,
                       backup_frac):

    # truncate the training data to the requested size
    x_train, y_train = x_train[:dataset_size], y_train[:dataset_size]

    # shard the dataset for KungFu node
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(x_train)

    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu primary nodes
    x_node = x_train[offset:offset + shard_size]
    y_node = y_train[offset:offset + shard_size]
    num_images_backup = int(train_data_size * backup_frac)

    # extract the data for learning of the KungFu backup nodes
    frac_data_per_worker = 1 / n_shards
    repeat_nums = int(frac_data_per_worker // backup_frac)
    remainder = int(
        round(train_data_size *
              (frac_data_per_worker - backup_frac * repeat_nums)))
    print("info : ", frac_data_per_worker, repeat_nums,
          backup_frac * repeat_nums, remainder, train_data_size)

    if shard_id == backup_worker_id:
        x_distinct = x_train[offset:offset + shard_size][0:num_images_backup]
        y_distinct = y_train[offset:offset + shard_size][0:num_images_backup]
        x_repeat = x_distinct.repeat(repeat_nums, axis=0)
        y_repeat = y_distinct.repeat(repeat_nums, axis=0)
        x_node = np.concatenate((x_repeat, x_distinct[0:remainder]), axis=0)
        y_node = np.concatenate((y_repeat, y_distinct[0:remainder]), axis=0)

    print("Worker ID {} | start idx {} | end idx {} ".format(
        shard_id, offset, offset + shard_size))
    print("Training set size:", x_node.shape, y_node.shape)

    return x_node, y_node
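A worked example of the backup arithmetic: with train_data_size = 300, n_shards = 3 and backup_frac = 0.1, each worker owns shard_size = 100 samples and frac_data_per_worker = 1/3. The backup worker keeps num_images_backup = 30 distinct images, repeats them repeat_nums = 3 times (90 samples) and tops up with remainder = 10 of them again, so its training set still holds the full 100 samples while drawing on only 10% of the original data.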
Example #9
def test_set_tree(steps, warmup_steps=10):
    from kungfu import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree

    n = current_cluster_size()

    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))

    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)

    init = tf.global_variables_initializer()

    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert (v.sum() == n * magic * (magic - 1) / 2)
            # print(v)

            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0

            if step > warmup_steps:
                durations.append(dur)

    ds = np.array([d * 1000 for d in durations])
    from kungfu._utils import show_duration
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))
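gen_tree(n) is not shown in this listing; it is assumed to return a length-n integer vector encoding a new spanning tree over the n peers (matching the shape of the tree_place placeholder), which is broadcast so that every peer installs the same tree via set_tree.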
Example #10
def build_optimizer(name, batch_size):
    learning_rate = 0.1

    # Scale learning rate according to the level of data parallelism
    optimizer = tf.train.GradientDescentOptimizer(learning_rate *
                                                  current_cluster_size())

    # KungFu: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    elif name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    elif name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
Example #11
mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

# KungFu: adjust learning rate based on number of GPUs.
# opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
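    # (The listing truncates here.) A hedged sketch of the usual continuation,
    # following the first_batch broadcast pattern shown in Example #20:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)
    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(mnist_model.variables)
        broadcast_variables(opt.variables())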
Example #12
def test_peer_info():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
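Rank and cluster size are assigned by the KungFu launcher; running this test under, for example, kungfu-run -np 4 python3 test.py should print np=4 on every peer, with ranks 0 through 3.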
Example #13
parser = argparse.ArgumentParser(description='Keras MNIST example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='kungfu optimizer')
args = parser.parse_args()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# KungFu: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(4.0 / current_cluster_size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
Example #14
#    cb_checkpointer = ModelCheckpoint(
#        filepath='../working/best.hdf5', monitor='val_loss', save_best_only=True, mode='auto')

    # Accumulate history across all permutations (useful for viewing the trend) and keep watching for the lowest val_loss as the final model

    model.fit(
        train_dataset,
        epochs=epochs,
        validation_data=val_dataset,
        callbacks=[BroadcastGlobalVariablesCallback()]
    )
#    model.load_weights("../working/best.hdf5")


if __name__ == "__main__":
    n_shards = current_cluster_size()
    model = build_model(n_shards)
    train_dataset = process_data(TRAIN_DIR)
    val_dataset = process_data(VAL_DIR, sharding=False)
    train_model(model, train_dataset, val_dataset, epochs=NUM_EPOCHS)


"""### Keras Limitations

* [10/02/2018] *validation_split* is not supported in *fit_generator*, hence it expects an ImageDataGenerator with pre-split train & validation sets.
* [10/02/2018] Model training through *fit_generator* is not compatible with Sklearn *GridSearchCV*, again *mostly* due to the missing support for *validation_split*.

### Followup Plan

1. Scale and pad, avoiding aspect-ratio change of the original image, through the Keras ImageDataGenerator pre-processing infrastructure
2. Image augmentation
Example #15
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
parser.add_argument('--name',
                    type=str,
                    required=True,
                    help='name this experiment run for Tensorboard logging')
args = parser.parse_args()

DATASET_SIZE = 300
TRAIN_VAL_SPLIT = 0.8
NUM_EPOCHS = 15
BATCH_SIZE = 8
# adjust number of steps based on number of workers
NUM_STEPS = (DATASET_SIZE // BATCH_SIZE) // current_cluster_size()


def load_data():

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())
    print(len(mnist_images))

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))

    # smaller dataset for quick testing
    smaller_dataset = dataset.take(DATASET_SIZE)
    split = int(DATASET_SIZE * TRAIN_VAL_SPLIT)
Example #16
def parallel_train(train_model,dataset,config):
    '''Parallel train pipeline of openpose class models

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpts in the directory ./save_dir/model_name/model_dir
    2. log loss information to ./save_dir/model_name/log.txt
    3. periodically visualize model output during training in ./save_dir/model_name/train_vis_dir
    The newest model is kept at ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function
    
    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function
    
    
    Returns
    -------
    None
    '''

    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = config.train.lr_decay_steps
    warm_up_step=8000
    warm_up_decay=0.01
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval=config.log.log_interval
    save_interval=config.train.save_interval
    vis_dir=config.train.vis_dir
    
    #model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir=config.pretrain.pretrain_model_dir
    pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    #import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    log(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset=dataset.get_train_dataset()
    dataset_type=dataset.get_dataset_type()
    parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format
    paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn,num_parallel_calls=max(multiprocessing.cpu_count()//2,1))
    train_dataset = train_dataset.batch(batch_size)  
    train_dataset = train_dataset.prefetch(64)
    
    #train configure
    step=tf.Variable(1, trainable=False)
    lr=tf.Variable(lr_init,trainable=False)
    lr_init=tf.Variable(lr_init,trainable=False)
    opt=tf.optimizers.Adam(learning_rate=lr)
    ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr)
    ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3)
    
    #load from ckpt
    log("loading ckpt...")
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        log("ckpt loaded successfully!")
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")

    #load pretrained backbone
    log("loading pretrained backbone...")
    try:
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True)
        log("pretrained backbone loaded successfully")
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")

    #load model weights
    log("loading saved training model weights...")
    try:
        train_model.load_weights(os.path.join(model_dir,"newest_model.npz"))
        log("saved training model weights loaded successfully")
    except:
        log("model_path doesn't exist, model parameters are initialized")
    
    # KungFu configure
    kungfu_option=config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt=PairAveragingOptimizer(opt)
    
    # KungFu adjust
    n_step = n_step // current_cluster_size() + 1  # KungFu
    for step_idx,step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = step // current_cluster_size() + 1  # KungFu

    for lr_decay_step in lr_decay_steps:
        if(step>lr_decay_step):
            lr=lr*lr_decay_factor
    
    #optimize one step
    @tf.function
    def one_step(image,gt_label,mask,train_model,is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_pif_maps,gt_paf_maps=gt_label
            pd_pif_maps,pd_paf_maps=train_model.forward(image,is_train=True)
            loss_pif_maps,loss_paf_maps,total_loss=train_model.cal_loss(pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps)
            decay_loss=regulize_loss(train_model,weight_decay_factor)
            total_loss+=decay_loss
        
        gradients=tape.gradient(total_loss,train_model.trainable_weights)
        opt.apply_gradients(zip(gradients,train_model.trainable_weights))
        #Kung fu
        if(is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss

    #train each step
    train_model.train()
    tic=time.time()
    avg_time=AvgMetric(name="time_iter",metric_interval=log_interval)
    #total loss metrics
    avg_total_loss=AvgMetric(name="total_loss",metric_interval=log_interval)
    #decay loss metrics
    avg_decay_loss=AvgMetric(name="decay_loss",metric_interval=log_interval)
    #pif loss metrics
    avg_pif_conf_loss=AvgMetric(name="pif_conf_loss",metric_interval=log_interval)
    avg_pif_vec_loss=AvgMetric(name="pif_vec_loss",metric_interval=log_interval)
    avg_pif_scale_loss=AvgMetric(name="pif_scale_loss",metric_interval=log_interval)
    #paf loss metrics
    avg_paf_conf_loss=AvgMetric(name="paf_conf_loss",metric_interval=log_interval)
    avg_paf_src_vec_loss=AvgMetric(name="paf_src_vec_loss",metric_interval=log_interval)
    avg_paf_dst_vec_loss=AvgMetric(name="paf_dst_vec_loss",metric_interval=log_interval)
    avg_paf_src_scale_loss=AvgMetric(name="paf_src_scale_loss",metric_interval=log_interval)
    avg_paf_dst_scale_loss=AvgMetric(name="paf_dst_scale_loss",metric_interval=log_interval)
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {} weight_decay_factor: {}'.format(
            n_step, batch_size, lr_init.numpy(), lr_decay_steps, lr_decay_factor, weight_decay_factor))
    for image,gt_label,mask,labeled in train_dataset:
        #get losses
        pd_pif_maps,pd_paf_maps,loss_pif_maps,loss_paf_maps,decay_loss,total_loss=one_step(image,gt_label,mask,train_model,step==0)
        loss_pif_conf,loss_pif_vec,loss_pif_scale=loss_pif_maps
        loss_paf_conf,loss_paf_src_vec,loss_paf_dst_vec,loss_paf_src_scale,loss_paf_dst_scale=loss_paf_maps
        #update metrics
        avg_time.update(time.time()-tic)
        tic=time.time()
        #update total losses
        avg_total_loss.update(total_loss)
        #update decay loss
        avg_decay_loss.update(decay_loss)
        #update pif_losses metrics
        avg_pif_conf_loss.update(loss_pif_conf)
        avg_pif_vec_loss.update(loss_pif_vec)
        avg_pif_scale_loss.update(loss_pif_scale)
        #update paf_losses metrics
        avg_paf_conf_loss.update(loss_paf_conf)
        avg_paf_src_vec_loss.update(loss_paf_src_vec)
        avg_paf_dst_vec_loss.update(loss_paf_dst_vec)
        avg_paf_src_scale_loss.update(loss_paf_src_scale)
        avg_paf_dst_scale_loss.update(loss_paf_dst_scale)

        #learning rate decay
        if(step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step)+1) 
            lr=lr_init*new_lr_decay
        #warm_up learning rate decay
        if(step <= warm_up_step):
            lr=lr_init*warm_up_decay**(1.0-step/warm_up_step)

        #save log info periodly
        if((step.numpy()!=0) and (step.numpy()%log_interval)==0):
            log(f"Train iteration {n_step} / {step.numpy()}, Learning rate:{lr.numpy()} {avg_total_loss.get_metric()} "+\
                f"{avg_pif_conf_loss.get_metric()} {avg_pif_vec_loss.get_metric()} {avg_pif_scale_loss.get_metric()}"+\
                f"{avg_paf_conf_loss.get_metric()} {avg_paf_src_vec_loss.get_metric()} {avg_paf_dst_vec_loss.get_metric()}"+\
                f"{avg_paf_src_scale_loss.get_metric()} {avg_paf_dst_scale_loss.get_metric()} {avg_decay_loss.get_metric()} {avg_time.get_metric()}")

        #save result and ckpt periodly
        if((step.numpy()!=0) and (step.numpy()%save_interval)==0):
            #save ckpt
            log("saving model ckpt and result...")
            ckpt_save_path=ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            #save train model
            model_save_path=os.path.join(model_dir,"newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")
            #draw result
            stride=train_model.stride
            gt_pif_maps,gt_paf_maps=gt_label
            draw_result(image,pd_pif_maps,pd_paf_maps,gt_pif_maps,gt_paf_maps,mask,parts,limbs,stride,save_dir=vis_dir,\
                name=f"train_{step.numpy()}")

        #training finished
        if(step==n_step):
            break
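Note how the cluster size folds into the schedule above: n_step and every entry of lr_decay_steps are divided by current_cluster_size(), because with synchronous training each local step consumes one batch on every worker, so for example 4 workers reach the same number of processed samples in roughly a quarter of the local steps.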
Example #17
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of openpose class models

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpts in the directory ./save_dir/model_name/model_dir
    2. log loss information to ./save_dir/model_name/log.txt
    3. periodically visualize model output during training in ./save_dir/model_name/train_vis_dir
    The newest model is kept at ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function
    
    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function
    
    
    Returns
    -------
    None
    '''
    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_every_step = config.train.lr_decay_every_step
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    #model hyper params
    n_pos = train_model.n_pos
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    data_format = train_model.data_format
    model_dir = config.model.model_dir

    #import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables

    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset = dataset.get_train_dataset()
    dataset_type = dataset.get_dataset_type()
    parts, limbs, kpt_cvter = get_parts(dataset_type), get_limbs(
        dataset_type), get_input_kptcvter(dataset_type)
    flip_list = get_flip_list(dataset_type)
    paramed_map_fn = get_paramed_map_fn(hin,
                                        win,
                                        hout,
                                        wout,
                                        parts,
                                        limbs,
                                        kpt_cvter,
                                        flip_list=flip_list,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(64)

    #train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    #load from ckpt
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure
    opt = get_kungfu_opt(config.train.kungfu_option, opt)
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size(
    ) + 1  # KungFu

    #optimize one step
    @tf.function
    def one_step(image, gt_label, mask, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_conf = gt_label[:, :n_pos, :, :, ]
            gt_paf = gt_label[:, n_pos:, :, :]
            pd_conf, pd_paf, stage_confs, stage_pafs = train_model.forward(
                image, is_train=True)

            pd_loss = train_model.cal_loss(gt_conf, gt_paf, mask, stage_confs,
                                           stage_pafs)
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss

        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        #Kung fu
        if (is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return gt_conf, gt_paf, pd_conf, pd_paf, total_loss, re_loss

    #train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
        .format(n_step, batch_size, lr_init, lr_decay_every_step))
    for image, gt_label, mask in train_dataset:
        #learning rate decay
        if (step % lr_decay_every_step == 0):
            new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
            lr = lr_init * new_lr_decay
        #optimize one step
        gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\
            train_model,step==0)
        #save log info periodly
        if ((step != 0) and (step % log_interval) == 0):
            log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'
                .format(step.numpy(), n_step, total_loss, lr.numpy(), re_loss,
                        time.time() - tic))
            tic = time.time()

        #save result and ckpt periodly
        if ((step != 0) and (step % save_interval) == 0
                and current_rank() == 0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\
                 vis_dir,'train_%d_' % step)
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        #training finished
        if (step == n_step):
            break
Example #18
train_dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(x_train[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(y_train, tf.int64)))
train_dataset = train_dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])

# KungFu: adjust learning rate based on number of GPUs.
opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
# opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')

mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                    optimizer=opt,
                    metrics=['accuracy'])
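A hedged sketch of the fit call that typically follows this compile step, assuming BroadcastGlobalVariablesCallback is importable from kungfu.tensorflow.initializer (as the Op variant is in Example #3) and that train_dataset is the dataset built above (the step count is an assumption):

from kungfu.tensorflow.initializer import BroadcastGlobalVariablesCallback

mnist_model.fit(train_dataset,
                steps_per_epoch=500 // current_cluster_size(),
                epochs=1,
                callbacks=[BroadcastGlobalVariablesCallback()],
                verbose=1 if current_rank() == 0 else 0)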
Example #19
def run(flags_obj):
    """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == 'fp16':
        raise ValueError(
            'dtype fp16 is not supported in Keras. Use the default '
            'value(fp32).')

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=flags_obj.distribution_strategy,
        num_gpus=flags_obj.num_gpus,
        num_workers=distribution_utils.configure_cluster(),
        all_reduce_alg=flags_obj.all_reduce_alg,
        num_packs=flags_obj.num_packs)

    if strategy:
        # flags_obj.enable_get_next_as_optional controls whether enabling
        # get_next_as_optional behavior in DistributedIterator. If true, last
        # partial batch can be supported.
        strategy.extended.experimental_enable_get_next_as_optional = (
            flags_obj.enable_get_next_as_optional)

    strategy_scope = distribution_utils.get_strategy_scope(strategy)

    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=cifar_preprocessing.HEIGHT,
            width=cifar_preprocessing.WIDTH,
            num_channels=cifar_preprocessing.NUM_CHANNELS,
            num_classes=cifar_preprocessing.NUM_CLASSES,
            dtype=flags_core.get_tf_dtype(flags_obj),
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = cifar_preprocessing.input_fn

    #train_input_dataset = input_fn(
    #    is_training=True,
    #    data_dir=flags_obj.data_dir,
    #    batch_size=flags_obj.batch_size,
    #    num_epochs=flags_obj.train_epochs,
    #    parse_record_fn=cifar_preprocessing.parse_record,
    #    datasets_num_private_threads=flags_obj.datasets_num_private_threads,
    #    dtype=dtype,
    #    # Setting drop_remainder to avoid the partial batch logic in normalization
    #    # layer, which triggers tf.where and leads to extra memory copy of input
    #    # sizes between host and GPU.
    #    drop_remainder=(not flags_obj.enable_get_next_as_optional))

    # eval_input_dataset = None
    # if not flags_obj.skip_eval:
    #   eval_input_dataset = input_fn(
    #       is_training=False,
    #       data_dir=flags_obj.data_dir,
    #       batch_size=flags_obj.batch_size,
    #       num_epochs=flags_obj.train_epochs,
    #       parse_record_fn=cifar_preprocessing.parse_record)

    (x_train, y_train), (x_test,
                         y_test) = tf.keras.datasets.cifar10.load_data()
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)
    y_test = tf.keras.utils.to_categorical(y_test, num_classes)

    # optimizer = common.get_optimizer()

    opt = tf.keras.optimizers.SGD(learning_rate=0.1)

    logging.info(opt.__dict__)
    optimizer = SynchronousSGDOptimizer(opt, use_locking=True)
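    # Copy the wrapped optimizer's hyper-parameter table onto the KungFu
    # wrapper so Keras can still look up values such as the learning rate
    # (this relies on a private Keras optimizer attribute).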
    optimizer._hyper = opt._hyper

    logging.info(optimizer.__dict__)

    model = Conv4_model(x_train, num_classes)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.
    if flags_obj.force_v2_in_keras_compile is not None:
        model.compile(
            loss='categorical_crossentropy',
            optimizer=optimizer,
            metrics=(['accuracy']),
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=(['accuracy']),
                      run_eagerly=flags_obj.run_eagerly)

    cluster_size = current_cluster_size()
    steps_per_epoch = (cifar_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    steps_per_epoch = steps_per_epoch // cluster_size
    train_epochs = flags_obj.train_epochs

    callbacks = common.get_callbacks(steps_per_epoch, current_rank(),
                                     cluster_size, learning_rate_schedule)
    callbacks.append(BroadcastGlobalVariablesCallback())

    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)

    num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    # validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase when
            # not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    tf.compat.v1.logging.info(x_train.shape)
    history = model.fit(x_train,
                        y_train,
                        batch_size=flags_obj.batch_size,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=(x_test, y_test),
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)
    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(x_test,
                                     y_test,
                                     steps=num_eval_steps,
                                     verbose=2)
    stats = common.build_stats(history, eval_output, callbacks)
    return stats
Example #20
    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())


def log(s, nl=True):
    if current_rank() != 0:
        return
    print(s, end='\n' if nl else '')

log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, current_cluster_size()))

with tf.device(device):
    # Warm-up
    log('Running warmup...')
    benchmark_step(first_batch=True)
    timeit.timeit(lambda: benchmark_step(first_batch=False),
                  number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        # log('Unix epoch time before iter #{}: {}'.format(x, pytime.time()))
        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
                             number=args.num_batches_per_iter)
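        # The listing cuts off after the timing call; in the Horovod-style
        # benchmark this example follows, the loop body would typically
        # continue by converting the measured time into an images/sec figure
        # (a sketch; args.batch_size is assumed to exist):
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)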
Example #21
def run(flags_obj):
    """Run ResNet ImageNet training and eval loop using native Keras APIs.

    Args:
        flags_obj: An object containing parsed flag values.

    Raises:
        ValueError: If fp16 is passed as it is not currently supported.

    Returns:
        Dictionary of training and eval stats.
    """
    keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                   enable_xla=flags_obj.enable_xla)

    # Execute flag override logic for better model performance
    if flags_obj.tf_gpu_thread_mode:
        keras_utils.set_gpu_thread_mode_and_count(
            per_gpu_thread_count=flags_obj.per_gpu_thread_count,
            gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
            num_gpus=flags_obj.num_gpus,
            datasets_num_private_threads=flags_obj.datasets_num_private_threads
        )
    common.set_cudnn_batchnorm_mode()

    dtype = flags_core.get_tf_dtype(flags_obj)
    if dtype == tf.float16:
        loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale=loss_scale)
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)
        if not keras_utils.is_v2_0():
            raise ValueError('--dtype=fp16 is not supported in TensorFlow 1.')
    elif dtype == tf.bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    data_format = flags_obj.data_format
    if data_format is None:
        data_format = ('channels_first'
                       if tf.test.is_built_with_cuda() else 'channels_last')
    tf.keras.backend.set_image_data_format(data_format)

    preprocessing_seed = 12345

    # pylint: disable=protected-access
    if flags_obj.use_synthetic_data:
        distribution_utils.set_up_synthetic_data()
        input_fn = common.get_synth_input_fn(
            height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
            num_channels=imagenet_preprocessing.NUM_CHANNELS,
            num_classes=imagenet_preprocessing.NUM_CLASSES,
            dtype=dtype,
            drop_remainder=True)
    else:
        distribution_utils.undo_set_up_synthetic_data()
        input_fn = imagenet_preprocessing.input_fn

    # When `enable_xla` is True, we always drop the remainder of the batches
    # in the dataset, as XLA-GPU doesn't support dynamic shapes.
    drop_remainder = flags_obj.enable_xla

    train_input_dataset = input_fn(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=flags_obj.train_epochs,
        parse_record_fn=imagenet_preprocessing.parse_record,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        dtype=dtype,
        drop_remainder=drop_remainder,
        random_seed=preprocessing_seed,  #addition
        num_workers=current_cluster_size(),  #addition
        worker_ID=current_rank(),  #addition
        tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
        training_dataset_cache=flags_obj.training_dataset_cache,
    )

    eval_input_dataset = None
    if not flags_obj.skip_eval:
        eval_input_dataset = input_fn(
            is_training=False,
            data_dir=flags_obj.data_dir,
            batch_size=flags_obj.batch_size,
            num_epochs=flags_obj.train_epochs,
            parse_record_fn=imagenet_preprocessing.parse_record,
            dtype=dtype,
            drop_remainder=drop_remainder)

    lr_schedule = 0.1
    if flags_obj.use_tensor_lr:
        lr_schedule = common.PiecewiseConstantDecayWithWarmup(
            batch_size=flags_obj.batch_size,
            epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
            warmup_epochs=common.LR_SCHEDULE[0][1],
            boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
            multipliers=list(p[0] for p in common.LR_SCHEDULE),
            compute_lr_on_cpu=True)

    # Build KungFu optimizer
    opt = common.get_optimizer(lr_schedule)
    # logging.info(opt.__dict__)
    optimizer = SynchronousSGDOptimizer(opt, reshape=False, use_locking=True)
    optimizer._hyper = opt._hyper
    # logging.info(optimizer.__dict__)

    if flags_obj.fp16_implementation == 'graph_rewrite':
        # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
        # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
        # which will ensure tf.compat.v2.keras.mixed_precision and
        # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
        # up.
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer)

    # TODO(hongkuny): Remove trivial model usage and move it to benchmark.
    if flags_obj.use_trivial_model:
        model = trivial_model.trivial_model(imagenet_preprocessing.NUM_CLASSES)
    else:
        model = resnet_model.resnet50(
            num_classes=imagenet_preprocessing.NUM_CLASSES)

    # TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
    # a valid arg for this model. Also remove as a valid flag.

    metrics = (['sparse_categorical_accuracy'])
    metrics.append('sparse_top_k_categorical_accuracy')

    if flags_obj.force_v2_in_keras_compile is not None:
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=metrics,
            run_eagerly=flags_obj.run_eagerly,
            experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
    else:
        model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=metrics,
                      run_eagerly=flags_obj.run_eagerly)

    # adjust number of steps
    cluster_size = current_cluster_size()
    steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
                       flags_obj.batch_size)
    steps_per_epoch = steps_per_epoch // cluster_size

    train_epochs = flags_obj.train_epochs
    callbacks = common.get_callbacks(steps_per_epoch, current_rank(),
                                     cluster_size,
                                     common.learning_rate_schedule)

    # Broadcast variables for KungFu
    callbacks.append(BroadcastGlobalVariablesCallback())

    # Checkpoint callback only on worker 0
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        ckpt_full_path = os.path.join(flags_obj.model_dir,
                                      'model.ckpt-{epoch:04d}')
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
                                               save_weights_only=True))

    if flags_obj.train_steps:
        steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)

    num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
                      flags_obj.batch_size)

    validation_data = eval_input_dataset
    if flags_obj.skip_eval:
        # Only build the training graph. This reduces memory usage introduced by
        # control flow ops in layers that have different implementations for
        # training and inference (e.g., batch norm).
        if flags_obj.set_learning_phase_to_train:
            # TODO(haoyuzhang): Understand slowdown of setting learning phase when
            # not using distribution strategy.
            tf.keras.backend.set_learning_phase(1)
        num_eval_steps = None
        validation_data = None

    history = model.fit(train_input_dataset,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)

    # Checkpoint only on 0th worker
    if flags_obj.enable_checkpoint_and_export and current_rank() == 0:
        if dtype == tf.bfloat16:
            logging.warning(
                "Keras model.save does not support bfloat16 dtype.")
        else:
            # Keras model.save assumes a float32 input signature.
            export_path = os.path.join(flags_obj.model_dir, 'saved_model')
            model.save(export_path, include_optimizer=False)

    eval_output = None
    if not flags_obj.skip_eval:
        eval_output = model.evaluate(eval_input_dataset,
                                     steps=num_eval_steps,
                                     verbose=2)

    stats = common.build_stats(history, eval_output, callbacks)
    return stats
Example #22
def parallel_train(train_model,dataset,config):
    '''Parallel train pipeline of openpose class models

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpts in the directory ./save_dir/model_name/model_dir
    2. log loss information to ./save_dir/model_name/log.txt
    3. periodically visualize model output during training in ./save_dir/model_name/train_vis_dir
    The newest model is kept at ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    arg1 : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by Model.get_model() function
    
    arg2 : dataset
        a constructed dataset object, obtained by Dataset.get_dataset() function
    
    
    Returns
    -------
    None
    '''

    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [200000,300000,360000,420000,480000,540000,600000,700000,800000,900000]
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval=config.log.log_interval
    save_interval=config.train.save_interval
    vis_dir=config.train.vis_dir
    
    #model hyper params
    n_pos = train_model.n_pos
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    model_dir = config.model.model_dir
    pretrain_model_dir=config.pretrain.pretrain_model_dir
    pretrain_model_path=f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    #import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer
    

    print(f"parallel training using learning rate:{lr_init} batch_size:{batch_size}")
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset=dataset.get_train_dataset()
    dataset_type=dataset.get_dataset_type()
    parts,limbs,data_format=train_model.parts,train_model.limbs,train_model.data_format
    flip_list=get_flip_list(dataset_type)
    paramed_map_fn=get_paramed_map_fn(hin,win,hout,wout,parts,limbs,flip_list=flip_list,data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)  
    train_dataset = train_dataset.prefetch(64)

    #train model configure 
    step=tf.Variable(1, trainable=False)
    lr=tf.Variable(lr_init,trainable=False)
    if(config.model.model_type==MODEL.Openpose):
        opt=tf.keras.optimizers.RMSprop(learning_rate=lr)
    else:
        opt=tf.keras.optimizers.Adam(learning_rate=lr)
    ckpt=tf.train.Checkpoint(step=step,optimizer=opt,lr=lr)
    ckpt_manager=tf.train.CheckpointManager(ckpt,model_dir,max_to_keep=3)

    #load from ckpt
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,network=train_model.backbone,skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized")
    #load model weights
    try:
        train_model.load_weights(os.path.join(model_dir,"newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    # KungFu configure
    kungfu_option=config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt=PairAveragingOptimizer(opt)
    

    n_step = n_step // current_cluster_size() + 1  # KungFu
    for step_idx,step in enumerate(lr_decay_steps):
        lr_decay_steps[step_idx] = step // current_cluster_size() + 1  # KungFu
    
    #optimize one step
    @tf.function
    def one_step(image,gt_label,mask,train_model,is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            gt_conf=gt_label[:,:n_pos,:,:]
            gt_paf=gt_label[:,n_pos:,:,:]
            pd_conf,pd_paf,stage_confs,stage_pafs=train_model.forward(image,is_train=True)

            pd_loss,loss_confs,loss_pafs=train_model.cal_loss(gt_conf,gt_paf,mask,stage_confs,stage_pafs)
            re_loss=regulize_loss(train_model,weight_decay_factor)
            total_loss=pd_loss+re_loss
        
        gradients=tape.gradient(total_loss,train_model.trainable_weights)
        opt.apply_gradients(zip(gradients,train_model.trainable_weights))
        #Kung fu
        if(is_first_batch):
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        return gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss

    #train each step
    tic=time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log('Start - n_step: {} batch_size: {} lr_init: {} lr_decay_steps: {} lr_decay_factor: {}'.format(
            n_step, batch_size, lr_init, lr_decay_steps, lr_decay_factor))
    for image,gt_label,mask in train_dataset:
        #learning rate decay
        if(step in lr_decay_steps):
            new_lr_decay = lr_decay_factor**(float(lr_decay_steps.index(step)+1)) 
            lr=lr_init*new_lr_decay
        #optimize one step
        gt_conf,gt_paf,pd_conf,pd_paf,total_loss,re_loss=one_step(image.numpy(),gt_label.numpy(),mask.numpy(),\
            train_model,step==0)
        #save log info periodly
        if((step.numpy()!=0) and (step.numpy()%log_interval)==0):
            log('Total Loss at iteration {} / {} is: {} Learning rate {} l2_loss {} time:{}'.format(
                step.numpy(), n_step, total_loss, lr.numpy(), re_loss, time.time()-tic))
            tic=time.time()

        #save result and ckpt periodly
        if((step!=0) and (step%save_interval)==0 and current_rank()==0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(), gt_conf.numpy(), pd_conf.numpy(), gt_paf.numpy(), pd_paf.numpy(), mask.numpy(),\
                 vis_dir,'train_%d_' % step)
            ckpt_save_path=ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path=os.path.join(model_dir,"newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        #training finished
        if(step==n_step):
            break
    y_test = keras.utils.to_categorical(y_test, num_classes)
    scores = model.evaluate(x_test, y_test, verbose=1)
    return scores


def f_data(x_train, y_train):
    x_train = x_train.astype('float32')
    x_train /= 255
    y_train = tf.keras.utils.to_categorical(y_train, num_classes)

    return x_train, y_train


if __name__ == "__main__":
    logging.basicConfig(filename="tf2_Conv4_CIFAR10_exp_0.log",
                        level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")
    # Load data
    (x_train, y_train), (x_test,
                         y_test) = tf.keras.datasets.cifar10.load_data()
    class_names = [
        "airplane", "automobile", "bird", "cat", "deer", "dog", "frog",
        "horse", "ship", "truck"
    ]
    # Pre process data
    x_train, y_train = preprocess_data.process(f_data, x_train, y_train)

    optimizer = build_optimizer('sync-sgd', n_workers=current_cluster_size())
    model = build_model(optimizer, x_train, num_classes)
    train_model(model, args.name, x_train, x_test, y_train, y_test)
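# A typical single-machine launch for these KungFu examples uses KungFu's
# launcher; the script name below is only a placeholder:
#   kungfu-run -np 4 python3 train.py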
Example #24
def train(parallel, kungfu_option):
    Gab = models.get_G(name='Gab')
    Gba = models.get_G(name='Gba')
    Da = models.get_D(name='Da')
    Db = models.get_D(name='Db')

    Gab.train()
    Gba.train()
    Da.train()
    Db.train()

    lr_v = tf.Variable(flags.lr_init)
    # optimizer_Gab_Db = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_Gba_Da = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_G = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    # optimizer_D = tf.optimizers.Adam(lr_v, beta_1=flags.beta_1)
    optimizer = tf.optimizers.Adam(
        lr_v, beta_1=flags.beta_1
    )  # use only one optimizer, if your GPU memory is large

    use_ident = False

    # KungFu: wrap the optimizers
    if parallel:
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer
        if kungfu_option == 'sync-sgd':
            opt_fn = SynchronousSGDOptimizer
        elif kungfu_option == 'async-sgd':
            opt_fn = PairAveragingOptimizer
        elif kungfu_option == 'sma':
            opt_fn = SynchronousAveragingOptimizer
        else:
            raise RuntimeError('Unknown distributed training optimizer.')
        # only the single shared optimizer defined above exists, so wrap it
        optimizer = opt_fn(optimizer)

    # Gab.load_weights(flags.model_dir + '/Gab.h5') # restore params?
    # Gba.load_weights(flags.model_dir + '/Gba.h5')
    # Da.load_weights(flags.model_dir + '/Da.h5')
    # Db.load_weights(flags.model_dir + '/Db.h5')

    # KungFu: shard the data
    if parallel:
        from kungfu import current_cluster_size, current_rank
        data_A_shard = []
        data_B_shard = []
        for step, (image_A, image_B) in enumerate(zip(data_A, data_B)):
            if step % current_cluster_size() == current_rank():
                data_A_shard.append(image_A)
                data_B_shard.append(image_B)
    else:
        data_A_shard = data_A
        data_B_shard = data_B
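    # The round-robin loop above is the in-memory equivalent of tf.data
    # sharding, i.e. ds.shard(num_shards=current_cluster_size(),
    # index=current_rank()): worker r keeps the samples whose index % N == r.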

    @tf.function
    def train_step(image_A, image_B):
        fake_B = Gab(image_A)
        fake_A = Gba(image_B)
        cycle_A = Gba(fake_B)
        cycle_B = Gab(fake_A)
        if use_ident:
            iden_A = Gba(image_A)
            iden_B = Gab(image_B)
        logits_fake_B = Db(fake_B)  # TODO: missing image buffer (pool)
        logits_real_B = Db(image_B)
        logits_fake_A = Da(fake_A)
        logits_real_A = Da(image_A)
        # loss_Da = (tl.cost.mean_squared_error(logits_real_A, tf.ones_like(logits_real_A), is_mean=True) + \  # LSGAN
        #     tl.cost.mean_squared_error(logits_fake_A, tf.ones_like(logits_fake_A), is_mean=True)) / 2.
        loss_Da = tf.reduce_mean(tf.math.squared_difference(logits_fake_A, tf.zeros_like(logits_fake_A))) + \
            tf.reduce_mean(tf.math.squared_difference(logits_real_A, tf.ones_like(logits_real_A)))
        # loss_Da = tl.cost.sigmoid_cross_entropy(logits_fake_A, tf.zeros_like(logits_fake_A)) + \
        # tl.cost.sigmoid_cross_entropy(logits_real_A, tf.ones_like(logits_real_A))
        # loss_Db = (tl.cost.mean_squared_error(logits_real_B, tf.ones_like(logits_real_B), is_mean=True) + \ # LSGAN
        #     tl.cost.mean_squared_error(logits_fake_B, tf.ones_like(logits_fake_B), is_mean=True)) / 2.
        loss_Db = tf.reduce_mean(tf.math.squared_difference(logits_fake_B, tf.zeros_like(logits_fake_B))) + \
            tf.reduce_mean(tf.math.squared_difference(logits_real_B, tf.ones_like(logits_real_B)))
        # loss_Db = tl.cost.sigmoid_cross_entropy(logits_fake_B, tf.zeros_like(logits_fake_B)) + \
        #     tl.cost.sigmoid_cross_entropy(logits_real_B, tf.ones_like(logits_real_B))
        # loss_Gab = tl.cost.mean_squared_error(logits_fake_B, tf.ones_like(logits_fake_B), is_mean=True) # LSGAN
        loss_Gab = tf.reduce_mean(
            tf.math.squared_difference(logits_fake_B,
                                       tf.ones_like(logits_fake_B)))
        # loss_Gab = tl.cost.sigmoid_cross_entropy(logits_fake_B, tf.ones_like(logits_fake_B))
        # loss_Gba = tl.cost.mean_squared_error(logits_fake_A, tf.ones_like(logits_fake_A), is_mean=True) # LSGAN
        loss_Gba = tf.reduce_mean(
            tf.math.squared_difference(logits_fake_A,
                                       tf.ones_like(logits_fake_A)))
        # loss_Gba = tl.cost.sigmoid_cross_entropy(logits_fake_A, tf.ones_like(logits_fake_A))
        # loss_cyc = 10 * (tl.cost.absolute_difference_error(image_A, cycle_A, is_mean=True) + \
        #     tl.cost.absolute_difference_error(image_B, cycle_B, is_mean=True))
        loss_cyc = 10. * (tf.reduce_mean(tf.abs(image_A - cycle_A)) +
                          tf.reduce_mean(tf.abs(image_B - cycle_B)))

        if use_ident:
            loss_iden = 5. * (tf.reduce_mean(tf.abs(image_A - iden_A)) +
                              tf.reduce_mean(tf.abs(image_B - iden_B)))
        else:
            loss_iden = 0.

        loss_G = loss_Gab + loss_Gba + loss_cyc + loss_iden
        loss_D = loss_Da + loss_Db
        return loss_G, loss_D, loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db, loss_D + loss_G
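    # Note: train_step only computes the losses; the gradient is taken outside,
    # under a persistent GradientTape, over the summed loss_D + loss_G, so a
    # single optimizer can update all four networks in one apply_gradients call.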

    for epoch in range(0, flags.n_epoch):
        # reduce lr linearly after 100 epochs, from lr_init to 0
        if epoch >= 100:
            new_lr = flags.lr_init - flags.lr_init * (epoch - 100) / 100
            lr_v.assign(new_lr)
            print("New learning rate %f" % new_lr)

        # train 1 epoch
        for step, (image_A,
                   image_B) in enumerate(zip(data_A_shard, data_B_shard)):
            if image_A.shape[0] != flags.batch_size or image_B.shape[
                    0] != flags.batch_size:  # if the remaining data in this epoch < batch_size
                break
            step_time = time.time()
            with tf.GradientTape(persistent=True) as tape:
                # print(image_A.numpy().max())
                loss_G, loss_D, loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db, loss_DG = train_step(
                    image_A, image_B)

            grad = tape.gradient(
                loss_DG, Gba.trainable_weights + Gab.trainable_weights +
                Da.trainable_weights + Db.trainable_weights)
            optimizer.apply_gradients(
                zip(
                    grad, Gba.trainable_weights + Gab.trainable_weights +
                    Da.trainable_weights + Db.trainable_weights))
            # grad = tape.gradient(loss_G, Gba.trainable_weights+Gab.trainable_weights)
            # optimizer_G.apply_gradients(zip(grad, Gba.trainable_weights+Gab.trainable_weights))
            # grad = tape.gradient(loss_D, Da.trainable_weights+Db.trainable_weights)
            # optimizer_D.apply_gradients(zip(grad, Da.trainable_weights+Db.trainable_weights))

            # del tape
            print("Epoch[{}/{}] step[{}/{}] time:{:.3f} Gab:{:.3f} Gba:{:.3f} cyc:{:.3f} iden:{:.3f} Da:{:.3f} Db:{:.3f}".format(\
                epoch, flags.n_epoch, step, n_step_per_epoch, time.time()-step_time, \
                loss_Gab, loss_Gba, loss_cyc, loss_iden, loss_Da, loss_Db))

            if parallel and step == 0:
                # KungFu: broadcast is done after the first gradient step to ensure optimizer initialization.
                from kungfu.tensorflow.initializer import broadcast_variables

                # Broadcast model variables
                broadcast_variables(Gab.trainable_weights)
                broadcast_variables(Gba.trainable_weights)
                broadcast_variables(Da.trainable_weights)
                broadcast_variables(Db.trainable_weights)

                # Broadcast the state of the single shared optimizer
                broadcast_variables(optimizer.variables())

        if parallel:
            from kungfu import current_rank
            is_chief = current_rank() == 0
        else:
            is_chief = True

        # Let the chief worker do visualization and checkpointing.
        if is_chief:
            # visualization

            # outb = Gab(sample_A)
            # outa = Gba(sample_B)
            # tl.vis.save_images(outb.numpy(), [1, 5], flags.sample_dir+'/{}_a2b.png'.format(epoch))
            # tl.vis.save_images(outa.numpy(), [1, 5], flags.sample_dir+'/{}_b2a.png'.format(epoch))

            outb_list = []  # do it one by one in case your GPU memory is low
            for i in range(len(sample_A)):
                outb = Gab(sample_A[i][np.newaxis, :, :, :])
                outb_list.append(outb.numpy()[0])

            outa_list = []
            for i in range(len(sample_B)):
                outa = Gba(sample_B[i][np.newaxis, :, :, :])
                outa_list.append(outa.numpy()[0])
            tl.vis.save_images(np.asarray(outb_list), [1, 5],
                               flags.sample_dir + '/{}_a2b.png'.format(epoch))
            tl.vis.save_images(np.asarray(outa_list), [1, 5],
                               flags.sample_dir + '/{}_b2a.png'.format(epoch))

            # save models
            if epoch % 5 == 0:  # save every 5 epochs
                Gab.save_weights(flags.model_dir + '/Gab.h5')
                Gba.save_weights(flags.model_dir + '/Gba.h5')
                Da.save_weights(flags.model_dir + '/Da.h5')
                Db.save_weights(flags.model_dir + '/Db.h5')
Example #25
def parallel_train(training_dataset, kungfu_option):
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)
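    # Note: this pipeline shards after shuffle; tf.data's guidance is to shard
    # as early as possible so each worker reads only its own slice of the input.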

    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element,
                                              is_train=True,
                                              reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2                 # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size() + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu
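        # KungFu: running the broadcast op right after initialization makes
        # every worker start from the same (rank 0) initial weights.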
        print('Worker{}: Initialized'.format(current_rank()))
        print(
            'Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
            .format(current_rank(), n_step, batch_size, lr_init,
                    lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(
                                                  model_path, 'pose.npz'))
        except:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
                .format(current_rank(), step, n_step, _loss, lr, _l2,
                        time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}:'.format(current_rank()), 'Network#', ix,
                      'For Branch', ix % 2 + 1, 'Loss:', ll)

            # save intermediate results and model
            if current_rank() == 0:  # KungFu
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [
                        img_out, confs_ground, pafs_ground, conf_result,
                        paf_result, mask_out
                    ] = sess.run(
                        [x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result,
                                 pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path,
                                               'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path, 'pose.npz'),
                                           sess=sess)
Example #26
def parallel_train(train_model, dataset, config):
    '''Parallel train pipeline of PoseProposal class models

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user defined model object, obtained by the Model.get_model() function

    dataset : dataset
        a constructed dataset object, obtained by the Dataset.get_dataset() function

    config : config object
        the training configuration, including hyperparameters and save paths

    Returns
    -------
    None
    '''
    init_log(config)
    #train hyper params
    #dataset params
    n_step = config.train.n_step
    batch_size = config.train.batch_size
    #learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    weight_decay_factor = config.train.weight_decay_factor
    #log and checkpoint params
    log_interval = config.log.log_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    #model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    hnei = train_model.hnei
    wnei = train_model.wnei
    model_dir = config.model.model_dir

    #import kungfu
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    print(
        f"parallel training using learning rate:{lr_init} batch_size:{batch_size}"
    )
    #training dataset configure with shuffle,augmentation,and prefetch
    train_dataset = dataset.get_train_dataset()
    parts, limbs, data_format = train_model.parts, train_model.limbs, train_model.data_format
    paramed_map_fn = get_paramed_map_fn(hin, win, hout, wout, hnei, wnei,
                                        parts, limbs, data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096)
    train_dataset = train_dataset.shard(num_shards=current_cluster_size(),
                                        index=current_rank())
    train_dataset = train_dataset.repeat()
    train_dataset = train_dataset.map(paramed_map_fn, num_parallel_calls=4)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=2)

    #train model configure
    step = tf.Variable(1, trainable=False)
    lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    ckpt = tf.train.Checkpoint(step=step, optimizer=opt, lr=lr)
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)

    #load from ckpt
    try:
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    try:
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")

    #KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using KungFu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using KungFu.SynchronousAveragingOptimizer!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using KungFu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    n_step = n_step // current_cluster_size() + 1  # KungFu

    #optimize one step
    @tf.function
    def one_step(image, targets, train_model, is_first_batch=False):
        step.assign_add(1)
        with tf.GradientTape() as tape:
            delta, tx, ty, tw, th, te, te_mask = targets
            pc, pi, px, py, pw, ph, pe = train_model.forward(image,
                                                             is_train=True)
            loss_rsp,loss_iou,loss_coor,loss_size,loss_limb=\
                train_model.cal_loss(delta,tx,ty,tw,th,te,te_mask,pc,pi,px,py,pw,ph,pe)
            pd_loss = loss_rsp + loss_iou + loss_coor + loss_size + loss_limb
            re_loss = regulize_loss(train_model, weight_decay_factor)
            total_loss = pd_loss + re_loss

        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        # KungFu: broadcast on the first batch, after the first
        # apply_gradients call, so that lazily created optimizer state
        # is also synchronized across workers
        if is_first_batch:
            broadcast_variables(train_model.all_weights)
            broadcast_variables(opt.variables())
        predicts = (pc, px, py, pw, ph, pe)
        return predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb

    #train each step
    tic = time.time()
    train_model.train()
    log(f"Worker {current_rank()}: Initialized")
    log(f'Start - n_step: {n_step} batch_size: {batch_size} lr_init: {lr_init} lr_decay_factor: {lr_decay_factor}')
    avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.
    for image, targets in train_dataset:
        #learning rate decay
        # assign to the tf.Variable so the optimizer sees the decayed rate
        lr.assign(lr_init * (1 - float(step.numpy()) / n_step * lr_decay_factor))
        #optimize one step (pass is_first_batch so the KungFu broadcast runs once;
        #step starts at 1, so the flag is true only on the first batch)
        predicts, targets, pd_loss, re_loss, loss_rsp, loss_iou, loss_coor, loss_size, loss_limb = one_step(
            image, targets, train_model, step.numpy() == 1)

        avg_loss_rsp += loss_rsp / log_interval
        avg_loss_iou += loss_iou / log_interval
        avg_loss_coor += loss_coor / log_interval
        avg_loss_size += loss_size / log_interval
        avg_loss_limb += loss_limb / log_interval
        avg_pd_loss += pd_loss / log_interval
        avg_re_loss += re_loss / log_interval

        #save log info periodically
        if (step != 0) and (step % log_interval) == 0:
            # report the time elapsed since the last log, then reset the timer
            log(f"worker:{current_rank()} Train iteration {step.numpy()}/{n_step}, learning rate:{lr.numpy()},"+\
                    f"loss_rsp:{avg_loss_rsp},loss_iou:{avg_loss_iou},loss_coor:{avg_loss_coor},loss_size:{avg_loss_size},"+\
                        f"loss_limb:{avg_loss_limb},loss_pd:{avg_pd_loss},loss_re:{avg_re_loss} ,time:{time.time()-tic}")
            tic = time.time()
            avg_loss_rsp, avg_loss_iou, avg_loss_coor, avg_loss_size, avg_loss_limb, avg_pd_loss, avg_re_loss = 0., 0., 0., 0., 0., 0., 0.

        #save result and ckpt periodically
        if ((step != 0) and (step % save_interval) == 0):
            log("saving model ckpt and result...")
            draw_results(image.numpy(),
                         predicts,
                         targets,
                         parts,
                         limbs,
                         save_dir=vis_dir,
                         name=f"ppn_step_{step.numpy()}")
            ckpt_save_path = ckpt_manager.save()
            log(f"ckpt save_path:{ckpt_save_path} saved!\n")
            model_save_path = os.path.join(model_dir, "newest_model.npz")
            train_model.save_weights(model_save_path)
            log(f"model save_path:{model_save_path} saved!\n")

        #training finished
        if (step == n_step):
            break
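# A minimal invocation sketch (Model.get_model() and Dataset.get_dataset() are
# the factory functions named in the docstring; the exact config plumbing is
# an assumption):
#
#   train_model = Model.get_model(config)
#   dataset = Dataset.get_dataset(config)
#   parallel_train(train_model, dataset, config)
#
# launched under KungFu, e.g.: kungfu-run -np 4 python3 train_ppn.py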