Example #1
def train(args):
    # map of global step -> new cluster size used for elastic resizing
    step_based_schedule = {
        100: 2,
        200: 3,
        300: 4,
        400: 2,
        500: 3,
        600: 1,
    }
    ds = build_dataset(args)
    model, loss, opt = build_ops(args)
    need_sync = True
    total_samples = int(MNIST_DATA_SIZE * args.num_epochs)
    trained_samples = tf.Variable(0)
    global_step = tf.Variable(0)
    for local_step, (images, labels) in enumerate(ds):
        global_step.assign_add(1)
        trained_samples.assign_add(current_cluster_size() * args.batch_size)
        loss_value = training_step(model, loss, opt, images, labels)
        # after a resize (or on the first step), synchronize the step counters
        # and the model/optimizer state across the new cluster
        if need_sync:
            sync_offsets([global_step, trained_samples])
            sync_model(model, opt)
            need_sync = False
        step = int(global_step)
        print('step: %d loss: %f' % (step, loss_value))
        if step in step_based_schedule:
            new_size = step_based_schedule[step]
            # resize_cluster returns whether this peer must re-sync its state
            need_sync = resize_cluster(new_size)
            # a detached peer is no longer part of the cluster and should exit
            if detached():
                break

        if trained_samples >= total_samples:
            break
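For reference, the loop above reads args.num_epochs and args.batch_size; a minimal, hypothetical driver for train(args) could look like the sketch below (the flag names and defaults are assumptions, not part of the original example).

# Hypothetical driver for train(args); flag names and defaults are assumptions.
import argparse


def parse_args():
    p = argparse.ArgumentParser(description='elastic MNIST training with KungFu')
    p.add_argument('--num-epochs', type=int, default=1)
    p.add_argument('--batch-size', type=int, default=100)
    return p.parse_args()


if __name__ == '__main__':
    train(parse_args())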
Example #2
def get_peer_latencies():
    """Returns the vector V of round-trip time from this peer to all other peers.

    For the peer of rank i, V[j] is the RTT from i to j (j != i), V[i] = 0.
    """
    return _op_lib.kungfu_get_peer_latencies(
        cluster_size=current_cluster_size())
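A minimal, hypothetical way to evaluate this op, assuming the process runs under the KungFu runtime and using the TensorFlow 1.x session API seen in the other examples (the variable names are illustrative only):

# Hypothetical usage sketch, not part of the original snippet.
import tensorflow as tf

latencies = get_peer_latencies()  # one RTT entry per peer in the cluster
with tf.Session() as sess:
    v = sess.run(latencies)
    print('RTT vector for this peer: %s' % (v, ))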
Example #3
def get_neighbour_mask(edges):
    """Compute a bool vector of neighbours for the current peer.

    For the peer of rank i, v[j] = true if (i, j) is an edge of the MST,
    otherwise v[j] = false.
    """
    return _op_lib.kungfu_get_neighbour_mask(
        edges, self_rank=current_rank(), cluster_size=current_cluster_size())
Example #4
def test_all_reduce(device='cpu'):
    x = torch.ones([2, 3])
    # Tensor.to returns a new tensor; reassign so x is actually on the target device
    x = x.to(device)
    y = kf.ops.collective.all_reduce_fn(x)
    assert (x.shape == y.shape)
    np = current_cluster_size()
    z = x * np
    assert z.equal(y)
Example #5
def worker(rank):
    import torch
    import kungfu.torch as kf
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = torch.ones([]) * int(current_rank())
    print(x)
    y = kf.ops.collective.all_reduce_fn(x)
    print(y)
Example #6
def _cluster_size():
    if os.getenv('KUNGFU_SELF_SPEC'):
        from kungfu.python import current_cluster_size
        return current_cluster_size()
    else:
        try:
            import horovod.tensorflow as hvd
            return hvd.size()
        except Exception:  # Horovod is missing or not initialized; fall back to 1
            return 1
Example #7
def test_all_gather(device='cpu'):
    rank = current_rank()
    x = (torch.ones([2, 3]) * rank)
    # Tensor.to returns a new tensor; reassign so x is actually on the target device
    x = x.to(device)
    y = kf.ops.collective.all_gather(x)
    z = []
    np = current_cluster_size()
    for i in range(np):
        z.append(torch.ones([2, 3]) * i)
    z = torch.stack(z)
    assert (z.equal(y))
Example #8
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):

    log_period = 100

    # get the cluster size
    n_shards = current_cluster_size()
    # get the cluster rank of the node
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variables so all replicas start from the same state
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # log the training and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
Example #9
    def end(self):
        if self._new:
            return
        assert (self._begin is not None)

        dur = time.time() - self._begin
        new_size = current_cluster_size()

        print('resize %d -> %d took %s' %
              (self._old_size, new_size, show_duration(dur)))
        self._records.append((dur, self._old_size, new_size))
        self._begin = None
Example #10
def worker(rank):
    import tensorflow as tf
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
    x = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    y = all_reduce(x * rank)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        v = sess.run(y)
        print('v=%s' % (v))
Example #11
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # calculate the offset of this node's contiguous shard of the training data
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # extract the data for learning of the KungFu node
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]
    # train the model
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[BroadcastGlobalVariablesCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
Example #12
    def after_run(self, run_context, run_values):
        self._step += 1
        np = current_cluster_size()
        self._trained_samples += self._local_batch_size * np

        self._profiler.begin()
        # changed: cluster membership changed; keep: this worker stays in the cluster
        changed, keep = run_context.session.run(self._resize_op)
        if not keep:
            run_context.request_stop()
            self._exit_reason = 'change cluster'
            self._profiler.end()
            return
        if changed:
            self._need_sync = True
        else:
            self._profiler.cancel()

        if self._trained_samples >= self._total_samples:
            self._exit_reason = 'finished'
            run_context.request_stop()
Example #13
    def after_run(self, run_context, run_values):
        sess = run_context.session
        bs = self.get_batch_size(sess)
        trained_samples = sess.run(self._trained_samples)
        trained_samples += bs * current_cluster_size()
        self._set_trained_samples(sess, trained_samples)
        self._trained_epochs = int(trained_samples / self._epoch_size)

        for policy in reversed(self._policies):
            policy.after_step(sess)

        if self._trained_epochs > self._last_trained_epochs:
            for policy in reversed(self._policies):
                policy.after_epoch(sess)

        if trained_samples >= self._total_samples:
            # print('%s' % 'request_stop ...')
            run_context.request_stop()

        if detached():
            run_context.request_stop()
Example #14
def test_set_tree(steps, warmup_steps=10):
    from kungfu.python import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree

    n = current_cluster_size()

    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))

    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)

    init = tf.global_variables_initializer()

    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert (v.sum() == n * magic * (magic - 1) / 2)
            # print(v)

            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0

            if step > warmup_steps:
                durations.append(dur)

    ds = np.array([d * 1000 for d in durations])
    from kungfu._utils import show_duration
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))
Example #15
def build_optimizer(name, batch_size):
    learning_rate = 0.1

    # Scale learning rate according to the level of data parallelism
    optimizer = tf.train.GradientDescentOptimizer(learning_rate *
                                                  current_cluster_size())

    # KungFu: Wrap the TensorFlow optimizer with KungFu distributed optimizers.
    if name == 'sync-sgd':
        from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer
        return SynchronousSGDOptimizer(optimizer)
    elif name == 'async-sgd':
        from kungfu.tensorflow.optimizers import PairAveragingOptimizer
        return PairAveragingOptimizer(optimizer)
    elif name == 'sma':
        from kungfu.tensorflow.optimizers import SynchronousAveragingOptimizer
        return SynchronousAveragingOptimizer(optimizer)
    elif name == 'ada-sgd':
        from kungfu.tensorflow.optimizers import AdaptiveSGDOptimizer
        return AdaptiveSGDOptimizer(optimizer, change_step=300)
    else:
        raise RuntimeError('unknown optimizer: %s' % name)
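A hypothetical usage sketch for build_optimizer: the KungFu-wrapped optimizer is used like the plain tf.train optimizer it wraps. The toy variable and loss below are illustrative assumptions, not part of the original code.

# Hypothetical usage sketch: minimize a toy scalar loss with the wrapped optimizer.
import tensorflow as tf

w = tf.Variable(1.0)
toy_loss = tf.square(w - 3.0)
optimizer = build_optimizer('sync-sgd', batch_size=64)
train_op = optimizer.minimize(toy_loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(10):
        sess.run(train_op)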
Example #16
def parallel_train(train_model, dataset, config, augmentor: BasicAugmentor,
                   preprocessor: BasicPreProcessor, postprocessor: BasicPostProcessor,
                   visualizer=BasicVisualizer):
    '''Parallel train pipeline of OpenPose-class models.

    Given a model and a dataset, the train pipeline starts automatically.
    The train pipeline will:
    1. store and restore ckpt in directory ./save_dir/model_name/model_dir
    2. log loss information in directory ./save_dir/model_name/log.txt
    3. visualize model output periodically during training in directory ./save_dir/model_name/train_vis_dir
    The newest model is at path ./save_dir/model_name/model_dir/newest_model.npz

    Parameters
    ----------
    train_model : tensorlayer.models.MODEL
        a preset or user-defined model object, obtained by the Model.get_model() function

    dataset : dataset
        a constructed dataset object, obtained by the Dataset.get_dataset() function

    Returns
    -------
    None
    '''

    # train hyper params
    # dataset params
    total_step = config.train.n_step
    batch_size = config.train.batch_size
    # learning rate params
    lr_init = config.train.lr_init
    lr_decay_factor = config.train.lr_decay_factor
    lr_decay_steps = [
        200000, 300000, 360000, 420000, 480000, 540000, 600000, 700000, 800000,
        900000
    ]
    weight_decay_factor = config.train.weight_decay_factor
    # log and checkpoint params
    log_interval = config.log.log_interval
    vis_interval = config.train.vis_interval
    save_interval = config.train.save_interval
    vis_dir = config.train.vis_dir

    # model hyper params
    hin = train_model.hin
    win = train_model.win
    hout = train_model.hout
    wout = train_model.wout
    parts, limbs, colors = train_model.parts, train_model.limbs, train_model.colors
    data_format = train_model.data_format
    model_dir = config.model.model_dir
    pretrain_model_dir = config.pretrain.pretrain_model_dir
    pretrain_model_path = f"{pretrain_model_dir}/newest_{train_model.backbone.name}.npz"

    # metrics
    metric_manager = MetricManager()

    # initializing train dataset
    train_dataset = dataset.get_train_dataset()
    epoch_size = dataset.get_train_datasize() // batch_size
    paramed_map_fn = get_paramed_map_fn(augmentor=augmentor,
                                        preprocessor=preprocessor,
                                        data_format=data_format)
    train_dataset = train_dataset.shuffle(buffer_size=4096).repeat()
    train_dataset = train_dataset.map(
        paramed_map_fn, num_parallel_calls=get_num_parallel_calls())
    train_dataset = train_dataset.batch(config.train.batch_size)
    train_dataset = train_dataset.prefetch(3)
    train_dataset_iter = iter(train_dataset)

    #train configure
    save_step = tf.Variable(1, trainable=False)
    save_lr = tf.Variable(lr_init, trainable=False)
    opt = tf.keras.optimizers.Adam(learning_rate=save_lr)
    domainadapt_flag = config.data.domainadapt_flag
    total_epoch = total_step // epoch_size

    #domain adaptation params
    if (not domainadapt_flag):
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt)
    else:
        log("Domain adaptaion in training enabled!")
        # weight param
        lambda_adapt = 1e-4
        # construct discriminator model
        feature_hin = train_model.hin // train_model.backbone.scale_size
        feature_win = train_model.win // train_model.backbone.scale_size
        in_channels = train_model.backbone.out_channels
        adapt_dis = Discriminator(feature_hin,
                                  feature_win,
                                  in_channels,
                                  data_format=data_format)
        opt_d = tf.keras.optimizers.Adam(learning_rate=save_lr)
        ckpt = tf.train.Checkpoint(save_step=save_step,
                                   save_lr=save_lr,
                                   opt=opt,
                                   opt_d=opt_d)
        # construct domain adaptation dataset
        dmadapt_train_dataset = dataset.get_dmadapt_train_dataset()
        paramed_dmadapt_map_fn = get_paramed_dmadapt_map_fn(augmentor)
        dmadapt_train_dataset = dmadapt_train_dataset.map(
            paramed_dmadapt_map_fn,
            num_parallel_calls=get_num_parallel_calls())
        dmadapt_train_dataset = dmadapt_train_dataset.shuffle(
            buffer_size=4096).repeat()
        dmadapt_train_dataset = dmadapt_train_dataset.batch(
            config.train.batch_size)
        dmadapt_train_dataset = dmadapt_train_dataset.prefetch(3)
        dmadapt_train_dataset_iter = iter(dmadapt_train_dataset)

    #load from ckpt
    ckpt_manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=3)
    try:
        log("loading ckpt...")
        ckpt.restore(ckpt_manager.latest_checkpoint)
    except Exception:
        log("ckpt_path doesn't exist, step and optimizer are initialized")
    #load pretrained backbone
    try:
        log("loading pretrained backbone...")
        tl.files.load_and_assign_npz_dict(name=pretrain_model_path,
                                          network=train_model.backbone,
                                          skip=True)
    except Exception:
        log("pretrained backbone doesn't exist, model backbone is initialized")
    #load model weights
    try:
        log("loading saved training model weights...")
        train_model.load_weights(os.path.join(model_dir, "newest_model.npz"))
    except Exception:
        log("model_path doesn't exist, model parameters are initialized")
    if (domainadapt_flag):
        try:
            log("loading saved domain adaptation discriminator weight...")
            adapt_dis.load_weights(
                os.path.join(model_dir, "newest_discriminator.npz"))
        except Exception:
            log("discriminator path doesn't exist, discriminator parameters are initialized"
                )

    log(f"Parallel training using learning rate:{lr_init} batch_size:{batch_size}"
        )
    step = save_step.numpy()
    lr = save_lr.numpy()

    #import kungfu
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.initializer import broadcast_variables
    from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer, SynchronousAveragingOptimizer, PairAveragingOptimizer

    # KungFu: with N peers processing batches in parallel, each peer runs
    # roughly 1/N of the global steps, so step-based schedules are rescaled.
    total_step = total_step // current_cluster_size() + 1  # KungFu
    total_epoch = total_epoch // current_cluster_size() + 1  # KungFu
    for step_idx, decay_step in enumerate(lr_decay_steps):
        lr_decay_steps[
            step_idx] = decay_step // current_cluster_size() + 1  # KungFu

    # optimize one step
    def optimize_step(image, mask, target_x, train_model,
                      metric_manager: MetricManager):
        # tape
        with tf.GradientTape() as tape:
            predict_x = train_model.forward(x=image,
                                            is_train=True,
                                            ret_backbone=domainadapt_flag)
            total_loss = train_model.cal_loss(predict_x=predict_x, target_x=target_x, \
                                                        mask=mask, metric_manager=metric_manager)
        # optimize model
        gradients = tape.gradient(total_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(gradients, train_model.trainable_weights))
        return predict_x

    def optimize_step_dmadapt(image_src, image_dst, train_model,
                              adapt_dis: Discriminator,
                              metric_manager: MetricManager):
        # tape
        with tf.GradientTape(persistent=True) as tape:
            # feature extraction
            # src feature
            predict_src = train_model.forward(x=image_src,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_src = predict_src["backbone_features"]
            adapt_pd_src = adapt_dis.forward(backbone_feature_src)
            # dst feature
            predict_dst = train_model.forward(x=image_dst,
                                              is_train=True,
                                              ret_backbone=True)
            backbone_feature_dst = predict_dst["backbone_features"]
            adapt_pd_dst = adapt_dis.forward(backbone_feature_dst)

            # loss calculation
            # loss of g
            g_adapt_loss = adapt_dis.cal_loss(x=adapt_pd_dst,
                                              label=True) * lambda_adapt
            # loss of d
            d_adapt_loss_src = adapt_dis.cal_loss(x=adapt_pd_src, label=True)
            d_adapt_loss_dst = adapt_dis.cal_loss(x=adapt_pd_dst, label=False)
            d_adapt_loss = (d_adapt_loss_src + d_adapt_loss_dst) / 2

        # optimize model
        g_gradient = tape.gradient(g_adapt_loss, train_model.trainable_weights)
        opt.apply_gradients(zip(g_gradient, train_model.trainable_weights))
        metric_manager.update("model/g_adapt_loss", g_adapt_loss)
        # optimize dis
        d_gradients = tape.gradient(d_adapt_loss, adapt_dis.trainable_weights)
        opt_d.apply_gradients(zip(d_gradients, adapt_dis.trainable_weights))
        metric_manager.update("dis/d_adapt_loss_src", d_adapt_loss_src)
        metric_manager.update("dis/d_adapt_loss_dst", d_adapt_loss_dst)
        # delete persistent tape
        del tape
        return predict_dst

    # formal training procedure

    # KungFu configure
    kungfu_option = config.train.kungfu_option
    if kungfu_option == KUNGFU.Sync_sgd:
        print("using Kungfu.SynchronousSGDOptimizer!")
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == KUNGFU.Sync_avg:
        print("using Kungfu.SynchronousAveragingOptimize!")
        opt = SynchronousAveragingOptimizer(opt)
    elif kungfu_option == KUNGFU.Pair_avg:
        print("using Kungfu.PairAveragingOptimizer!")
        opt = PairAveragingOptimizer(opt)

    train_model.train()
    cur_epoch = step // epoch_size + 1
    log(f"Start Training- total_epoch: {total_epoch} total_step: {total_step} current_epoch:{cur_epoch} "\
        +f"current_step:{step} batch_size:{batch_size} lr_init:{lr_init} lr_decay_steps:{lr_decay_steps} "\
        +f"lr_decay_factor:{lr_decay_factor} weight_decay_factor:{weight_decay_factor}" )
    for epoch_idx in range(cur_epoch, total_epoch):
        log(f"Epoch {epoch_idx}/{total_epoch}:")
        for _ in tqdm(range(0, epoch_size)):
            step += 1
            metric_manager.start_timing()
            image, mask, target_list = next(train_dataset_iter)
            # extract gt_label
            target_list = [
                cPickle.loads(target) for target in target_list.numpy()
            ]
            target_x = {key: [] for key, value in target_list[0].items()}
            target_x = reduce(
                lambda x, y:
                {key: x[key] + [y[key]]
                 for key, value in x.items()}, [target_x] + target_list)
            target_x = {
                key: np.stack(value)
                for key, value in target_x.items()
            }
            target_x = to_tensor_dict(target_x)

            # learning rate decay
            if (step in lr_decay_steps):
                new_lr_decay = lr_decay_factor**(lr_decay_steps.index(step) +
                                                 1)
                lr = lr_init * new_lr_decay

            # optimize one step
            predict_x = optimize_step(image, mask, target_x, train_model,
                                      metric_manager)

            # optimize domain adaptation
            if (domainadapt_flag):
                src_image = image
                dst_image = next(dmadapt_train_dataset_iter)
                predict_dst = optimize_step_dmadapt(src_image, dst_image,
                                                    train_model, adapt_dis,
                                                    metric_manager)

            # KungFu: broadcast model weights and optimizer state once after the first step
            if (step == 1):
                broadcast_variables(train_model.all_weights)
                broadcast_variables(opt.variables())

            # log info periodically
            if ((step != 0) and (step % log_interval) == 0):
                log(f"Train Epoch={epoch_idx} / {total_epoch}, Step={step} / {total_step}: learning_rate: {lr:.6e} {metric_manager.report_timing()}\n"\
                        +f"{metric_manager.report_train()} ")

            # visualize periodically
            if ((step != 0) and (step % vis_interval) == 0
                    and current_rank() == 0):
                log(f"Visualizing prediction maps and target maps")
                visualizer.visual_compare(image_batch=image.numpy(), mask_batch=mask.numpy(), predict_x=predict_x, target_x=target_x,\
                                                    name=f"train_{step}")

            # save result and ckpt periodically
            if ((step != 0) and (step % save_interval) == 0
                    and current_rank() == 0):
                # save ckpt
                log("saving model ckpt and result...")
                save_step.assign(step)
                save_lr.assign(lr)
                ckpt_save_path = ckpt_manager.save()
                log(f"ckpt save_path:{ckpt_save_path} saved!\n")
                # save train model
                model_save_path = os.path.join(model_dir, "newest_model.npz")
                train_model.save_weights(model_save_path)
                log(f"model save_path:{model_save_path} saved!\n")
                # save discriminator model
                if (domainadapt_flag):
                    dis_save_path = os.path.join(model_dir,
                                                 "newest_discriminator.npz")
                    adapt_dis.save_weights(dis_save_path)
                    log(f"discriminator save_path:{dis_save_path} saved!\n")
Example #17
    def begin(self):
        assert (self._begin is None)
        self._new = False

        self._begin = time.time()
        self._old_size = current_cluster_size()
Example #18
def all_gather(x):
    np = current_cluster_size()
    y = x.new(torch.Size([np] + list(x.shape)))
    all_gather_op_map[x.type()](x, y, x.type())
    return y
Example #19
mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

# KungFu: adjust learning rate based on number of GPUs.
# opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

# KungFu: wrap tf.compat.v1.train.Optimizer.
if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
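        # --- assumed sketch of how this step typically continues; not the original code ---
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

    # KungFu: broadcast model and optimizer state once, after the first step,
    # so that the optimizer's slot variables exist before they are synchronized.
    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(mnist_model.variables)
        broadcast_variables(opt.variables())

    return loss_value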
Example #20
             tf.float32), tf.cast(y_train, tf.int64)))
train_dataset = train_dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])

# KungFu: adjust learning rate based on number of GPUs.
opt = tf.keras.optimizers.SGD(0.001 * current_cluster_size())
# opt = tf.compat.v1.train.AdamOptimizer(0.001 * current_cluster_size())

if args.kf_optimizer == 'sync-sgd':
    opt = SynchronousSGDOptimizer(opt)
elif args.kf_optimizer == 'async-sgd':
    opt = PairAveragingOptimizer(opt)
elif args.kf_optimizer == 'sma':
    opt = SynchronousAveragingOptimizer(opt)
else:
    raise RuntimeError('Unknown KungFu optimizer')

mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                    optimizer=opt,
                    metrics=['accuracy'])
Example #21
parser = argparse.ArgumentParser(description='Keras MNIST example.')
parser.add_argument('--kf-optimizer',
                    type=str,
                    default='sync-sgd',
                    help='kungfu optimizer')
args = parser.parse_args()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# KungFu: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(4.0 / current_cluster_size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)
Example #22
    def local_next(self, bs):
        cur = self.global_next(bs)
        rank = kf.current_rank()
        size = kf.current_cluster_size()
        local = cur.partition(rank, size)
        return local
Example #23
def test_peer_info():
    rank = current_rank()
    np = current_cluster_size()
    print('rank=%d, np=%d' % (rank, np))
Example #24
def worker(rank):
    from kungfu.python import current_cluster_size, current_rank
    print('rank=%d' % (rank))
    print('kungfu rank: %d, size %d' %
          (current_rank(), current_cluster_size()))
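Several examples above (#5, #10, and #24) define a worker(rank) function; a hypothetical driver that spawns one process per rank is sketched below. Spawning processes by itself does not configure a KungFu cluster, so treat this purely as an illustration of how such workers might be invoked.

# Hypothetical driver (illustration only): spawn one process per rank.
from multiprocessing import Process


def run_workers(n):
    ps = [Process(target=worker, args=(i, )) for i in range(n)]
    for p in ps:
        p.start()
    for p in ps:
        p.join()


if __name__ == '__main__':
    run_workers(4)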