Example #1
def worker(rank):
    import torch
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = torch.ones([]) * int(current_rank())
    print(x)
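    # all-reduce combines x across all peers (summation, assuming the default reduce op),
    # so every peer should see the same value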
    y = kf.ops.collective.all_reduce_fn(x)
    print(y)
Example #2
def get_neighbour_mask(edges):
    """Compute a bool vector of neighbours for the current peer.

    For the peer of rank i, v[j] = true if (i, j) is an edge of the MST,
    otherwise v[j] = false.
    """
    return _op_lib.kungfu_get_neighbour_mask(
        edges, self_rank=current_rank(), cluster_size=current_cluster_size())
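
A hedged, pure-Python sketch of the semantics documented above; it assumes edges is an iterable of (i, j) rank pairs describing the MST (an assumption, not the library's actual edge encoding) and is illustrative only, not the library op:

def neighbour_mask_sketch(edges, self_rank, cluster_size):
    # v[j] is True iff the MST contains an edge between self_rank and j,
    # treating the edge list as undirected (also an assumption)
    v = [False] * cluster_size
    for i, j in edges:
        if i == self_rank:
            v[j] = True
        elif j == self_rank:
            v[i] = True
    return v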
Example #3
def test_broadcast():
    import tensorflow as tf
    from kungfu.python import current_rank
    from kungfu.tensorflow.ops import broadcast
    v = tf.Variable(True if current_rank() == 0 else False)
    u = broadcast(v)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        x = sess.run(v)
        y = sess.run(u)
        # print(x, y)
        assert (y == True)
Example #4
def test_all_gather(device='cpu'):
    import torch
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank
    rank = current_rank()
    x = torch.ones([2, 3]) * rank
    x = x.to(device)  # .to() returns a new tensor; keep the result
    y = kf.ops.collective.all_gather(x)
    z = []
    np = current_cluster_size()
    for i in range(np):
        z.append(torch.ones([2, 3]) * i)
    z = torch.stack(z)
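    # all_gather is expected to return the per-rank tensors stacked along a new leading dimension, matching z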
    assert (z.equal(y))
Example #5
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):

    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variables so that all replicas start from the same state
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
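
A hedged worked example of the offset arithmetic above. The numbers are assumptions (the standard 60,000-example MNIST training set, 4 workers, batch_size=5000), not values taken from the snippet; they show that each worker starts at a distinct offset and strides by batch_size * n_shards, so the workers consume disjoint, interleaved batches:

training_set_size = 60000          # assumed MNIST training set size
n_shards, batch_size = 4, 5000     # assumed cluster size and batch size
for shard_id in range(n_shards):
    offset = batch_size * shard_id
    starts = []
    for _ in range(training_set_size // (batch_size * n_shards)):
        starts.append(offset)
        offset = (offset + batch_size * n_shards) % training_set_size
    print('worker %d reads batches starting at %s' % (shard_id, starts))
# worker 0 -> [0, 20000, 40000], worker 1 -> [5000, 25000, 45000], and so on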
Example #6
def test_group_all_gather():
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_gather
    rank = current_rank()
    np = current_cluster_size()
    sizes = [i + 1 for i in range(5)]
    xs = [(rank + 1) * tf.Variable(tf.ones([n], tf.int32)) for n in sizes]
    ys = [all_gather(x) for x in xs]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, y in enumerate(ys):
            v = sess.run(y)
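            # each peer contributes (rank + 1) for every element of a length-(i + 1) tensor,
            # so the gathered sum is (1 + 2 + ... + np) * (i + 1) = np * (np + 1) / 2 * (i + 1)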
            assert (v.sum() == (np + 1) * np / 2 * (i + 1))
Example #7
def worker(rank):
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    y = all_reduce(x * rank)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(y)
        print('v=%s' % (v))
Example #8
def test_consensus():
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import consensus

    np = current_cluster_size()
    rank = current_rank()

    x = tf.Variable(rank, dtype=tf.int32)
    consensus_check = consensus(x)

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(consensus_check)

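        # consensus(x) is True only when all peers hold the same value;
        # here x = rank, so the check passes only in a single-peer cluster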
        assert v == (np == 1)
Example #9
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesCallback

    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset of the distinct data shard for this KungFu node
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]
    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[BroadcastGlobalVariablesCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
Example #10
def main(_):
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesHook
    hooks = [
        BroadcastGlobalVariablesHook(),
        tf.train.LoggingTensorHook(['train_accuracy', 'train_loss'],
                                   every_n_iter=10)
    ]

    from kungfu.python import current_rank
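    # KungFu: only rank 0 writes checkpoints, so other workers cannot corrupt them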
    save_checkpoints_secs = None if current_rank() != 0 else 30
    config = tf.estimator.RunConfig(
        save_checkpoints_secs=save_checkpoints_secs)

    mnist_classifier = tf.estimator.Estimator(model_fn=model_function,
                                              model_dir=FLAGS.model_dir,
                                              config=config)

    for _ in range(FLAGS.num_epochs):
        mnist_classifier.train(
            input_fn=train_data,
            hooks=hooks,
        )
        mnist_classifier.evaluate(input_fn=eval_data)
Example #11
def parallel_train(train_model, dataset, config, augmentor:BasicAugmentor, \
                        preprocessor:BasicPreProcessor,postprocessor:BasicPostProcessor,visualizer=BasicVisualizer):
    '''Single train pipeline of Openpose class models

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user-defined model object, obtained by the Model.get_model() function

    dataset : dataset
        a constructed dataset object, obtained by the Dataset.get_dataset() function

    Returns
    -------
    None
    '''

    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000, 800000,
        900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    # train configuration
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    #domain adaptation params
    if (not domainadapt_flag):
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptaion in training enabled!")
        # weight param
        lambda_adapt = 1e-4
        # construct discriminator model
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn,
            num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    #load from ckpt
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except:
        log("pretrained backbone doesn't exist, model backbone are initialized"
            )
    #load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except:
        log("model_path doesn't exist, model parameters are initialized")
    if (domainadapt_flag):
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except:
            log("discriminator path doesn't exist, discriminator parameters are initialized"
                )

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}"
        )
    step = save_step.numpy()
    lr = save_lr.numpy()

    #import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

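    # KungFu: the global work is split across all peers, so the per-peer step budget,
    # epoch count and lr decay boundaries shrink by a factor of the cluster size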
    total_step = total_step // current_cluster_size() + 1  # KungFu
    total_epoch = total_epoch // current_cluster_size() + 1  # KungFu
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[
            step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x, \
                                                        mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # tape
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)

            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2

        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape
        del tape
        return predict_dst

    # formal training procedure

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "\
        +f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "\
        +f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}" )
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)

            # learning rate decay
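            # after crossing the k-th decay boundary, the rate becomes lr_init * lr_decay_factor ** k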
            if (step in lr_decay_steps):
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) +
                                                 1)
                lr = lr_init * new_lr_decay

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)

            # optimize domain adaptation
            if (domainadapt_flag):
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)

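            # KungFu: broadcast weights and optimizer state once, after the first step has
            # created them, so that all peers continue training from identical values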
            if (step == 1):
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodically
            if ((step != 0) and (step % log_interval) == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"\
                        +f"{metric_manager.report_train()} ")

            # visualize periodically
            if ((step != 0) and (step % vis_interval) == 0
                    and current_rank() == 0):
                log(f"Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(), predict_x=predict_x, target_x=target_x,\
                                                    name=f"train_{step}")

            # save results and ckpt periodically
            if ((step != 0) and (step % save_interval) == 0
                    and current_rank() == 0):
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if (domainadapt_flag):
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
Example #12
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt, with_keras=True)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt, with_keras=True)
else:
    raise RuntimeError('unknown optimizer: %s' % args.kf_optimizer)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [BroadcastGlobalVariablesCallback(with_keras=True)]

# KungFu: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if current_rank() == 0:
    callbacks.append(
        keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train,
          y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1 if current_rank() == 0 else 0,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Example #13
import argparse

import tensorflow as tf
from kungfu.python import current_cluster_size, current_rank, run_barrier
from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                          SynchronousAveragingOptimizer,
                                          SynchronousSGDOptimizer)
from kungfu.tensorflow.initializer import BroadcastGlobalVariablesCallback

parser = argparse.ArgumentParser(description='KungFu mnist example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
args = parser.parse_args()

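# each worker caches MNIST under a rank-specific filename so concurrent workers do not clobber the same file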
(x_train, y_train), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())

train_dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(x_train[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(y_train, tf.int64)))
train_dataset = train_dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
Example #14
    def __init__(self, schedule):
        from kungfu.python import current_rank
        self._rank = current_rank()
        self._step = 0

        self._schedule = schedule
Example #15
import argparse

import tensorflow as tf
from kungfu.python import current_cluster_size, current_rank
from kungfu.tensorflow.optimizers import (PairAveragingOptimizer,
                                          SynchronousAveragingOptimizer,
                                          SynchronousSGDOptimizer)

parser = argparse.ArgumentParser(description='KungFu mnist example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='available options: sync-sgd, async-sgd, sma')
args = parser.parse_args()

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % current_rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
Example #16
def test_peer_info():
    from kungfu.python import current_cluster_size, current_rank
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
Example #17
    def local_next(self, bs):
        cur = self.global_next(bs)
        rank = kf.current_rank()
        size = kf.current_cluster_size()
        local = cur.partition(rank, size)
        return local
Example #18
def worker(rank):
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))