예제 #1
0
파일: ada_sgd.py 프로젝트: zhz44/KungFu
 def __init__(self, optimizer, interval, name=None, use_locking=False):
     """Wrap ``optimizer`` and record cluster information and ``interval``.

     Args:
         optimizer: Optimizer instance forwarded to the parent constructor.
         interval: Value stored unchanged on ``self._interval``.
         name: Optional name forwarded to the parent constructor.
         use_locking: Forwarded to the parent constructor as a keyword.
     """
     parent = super(AdaptiveSGDOptimizer, self)
     parent.__init__(optimizer, name, use_locking=use_locking)

     # Cluster size and this worker's rank, queried once at construction.
     self._num_workers = current_cluster_size()
     self._rank = current_rank()

     # Non-trainable int32 counter that starts at zero.
     self._step = tf.Variable(0, dtype=tf.int32, trainable=False)
     self._interval = interval
예제 #2
0
파일: mnist_slp.py 프로젝트: zhz44/KungFu
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):
    """Run the MNIST training loop for this worker.

    Batches are taken from ``dataset['training_set']`` in an interleaved
    fashion: the worker starts at ``batch_size * rank`` and advances by
    ``batch_size * cluster_size`` after every step, wrapping around the
    training set size.

    Args:
        sess: Session used to run the initializers and ``train_op``.
        x: Feed key for the input batch.
        y_: Feed key for the label batch.
        train_op: Operation run once per step.
        test_op: Operation forwarded to ``test_mnist`` for evaluation.
        optimizer: Optimizer; when it exposes ``distributed_initializer``
            the result of calling it is run after the global initializer.
        dataset: Mapping with a ``'training_set'`` entry (holding ``'x'``
            and ``'y'``) and a ``'validation_set'`` entry.
        n_epochs: Number of passes over this worker's share of the data.
        batch_size: Number of samples fed per step.
    """
    log_period = 100

    # Cluster size and the rank of this worker inside the cluster.
    n_shards = current_cluster_size()
    shard_id = current_rank()

    # Work out how many steps this worker performs.
    training_set = dataset['training_set']
    training_set_size = training_set['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KUNGFU: each replica starts at its own batch and strides over the
    # batches taken by the other replicas.
    offset = batch_size * shard_id
    stride = batch_size * n_shards

    sess.run(tf.global_variables_initializer())

    # KUNGFU: the distributed initializer defines how model weights are
    # initialized across the distributed devices.
    if hasattr(optimizer, 'distributed_initializer'):
        sess.run(optimizer.distributed_initializer())

    print('training')
    for step in range(n_steps):
        end = offset + batch_size
        xs = training_set['x'][offset:end]
        y_s = training_set['y'][offset:end]
        offset = (offset + stride) % training_set_size

        sess.run(train_op, {x: xs, y_: y_s})

        if step % log_period != 0:
            continue

        # Periodically report accuracy on the batch just trained on and on
        # the validation set.
        training_acc_dataset = {'x': xs, 'y': y_s}
        result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
        print('training accuracy: %f' % result)
        result = test_mnist(sess, x, y_, test_op, dataset['validation_set'])
        print('validation accuracy: %f' % result)
예제 #3
0
파일: async_sgd.py 프로젝트: zhz44/KungFu
    def apply_gradients(self, grads_and_vars, **kwargs):
        """Average the variables with a random peer and apply the gradients.

        Each variable is assigned the mean of its own value and the value
        obtained for the same variable from a peer chosen by
        ``get_random_peer``. The gradients are applied through the wrapped
        optimizer's ``apply_gradients``.

        Args:
            grads_and_vars: Iterable of ``(gradient, variable)`` pairs.
            **kwargs: Forwarded unchanged to the wrapped optimizer.

        Returns:
            A grouped op created under control dependencies on the averaging
            assignments, the wrapped optimizer's update op and the save op
            returned by ``_build_request_and_save_ops``.
        """
        # Materialize first: the pairs are traversed here to collect the
        # variables and again by the wrapped optimizer, so a one-shot
        # iterable (e.g. ``zip(grads, variables)``) would otherwise reach
        # the wrapped optimizer already exhausted.
        grads_and_vars = list(grads_and_vars)

        np, rank = current_cluster_size(), current_rank()
        target = get_random_peer(np, rank)
        variables = [v for _g, v in grads_and_vars]
        other_peer_vars, save_model_op = self._build_request_and_save_ops(
            target, variables)

        # Move every local variable half-way towards the peer's value.
        assign_ops = [
            tf.assign(v, 0.5 * (v + other_v))
            for v, other_v in zip(variables, other_peer_vars)
        ]

        apply_op = self._optimizer.apply_gradients(grads_and_vars, **kwargs)

        with tf.control_dependencies(assign_ops):
            with tf.control_dependencies([apply_op]):
                with tf.control_dependencies([save_model_op]):
                    return tf.group(apply_op)
예제 #4
0
def train_model(model, dataset, n_epochs=1, batch_size=5000):
    """Fit ``model`` on this worker's shard of the training data.

    The training arrays are split into ``current_cluster_size()`` contiguous
    shards of equal length and the worker with rank ``r`` trains on shard
    ``r``. Any remainder left by the integer division is not used.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        dataset: Mapping with ``'x_train'``, ``'y_train'``, ``'x_val'`` and
            ``'y_val'`` entries.
        n_epochs: Number of epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.
    """
    n_shards = current_cluster_size()
    shard_id = current_rank()
    train_data_size = len(dataset['x_train'])

    # Each worker owns one contiguous shard, so the shard has to start at a
    # multiple of the shard size. Using the batch size here made shards
    # overlap (or leave gaps) whenever batch_size != shard_size.
    shard_size = train_data_size // n_shards
    offset = shard_size * shard_id

    # Extract the data this worker learns from.
    x = dataset['x_train'][offset:offset + shard_size]
    y = dataset['y_train'][offset:offset + shard_size]

    # Train the model.
    model.fit(x,
              y,
              batch_size=batch_size,
              epochs=n_epochs,
              callbacks=[InitalizationCallback()],
              validation_data=(dataset['x_val'], dataset['y_val']),
              verbose=2)
예제 #5
0
def all_reduce_benchmark(sizes, dtype=tf.float32):
    """Time ``group_all_reduce`` over variables of the given sizes.

    One variable filled with ones is created per entry of ``sizes``. The
    grouped all-reduce is run for a few untimed warm-up steps and then for a
    fixed number of timed steps; the duration and the derived data rate are
    printed for each timed step.

    Args:
        sizes: Iterable of element counts, one per variable.
        dtype: Element type of the variables.
    """
    xs = [tf.Variable(tf.ones([n], dtype)) for n in sizes]
    tot_size = sum(_tensor_size(x) for x in xs)
    cluster_size = current_cluster_size()
    # NOTE(review): presumably the traffic factor used to turn the payload
    # size into an "equivalent" data rate -- confirm the derivation.
    multiplier = 4 * (cluster_size - 1)
    print('all reduce total size: %s among %d peers' %
          (show_size(tot_size), cluster_size))
    ys = group_all_reduce(xs)
    init = tf.global_variables_initializer()
    warmup_steps = 5
    bench_steps = 10
    with tf.Session() as sess:
        sess.run(init)
        # Untimed runs so one-off start-up costs do not skew the timings.
        for _ in range(warmup_steps):
            sess.run(ys)
        for step in range(bench_steps):
            t0 = time.time()
            sess.run(ys)
            d = time.time() - t0
            print('step %d, took %.2fs, equivalent data rate: %s' %
                  (step, d, show_rate(tot_size * multiplier, d)))
예제 #6
0
def fake_get_shard_info(use_kungfu):
    """Return ``(rank, cluster_size)`` for the current process.

    When ``use_kungfu`` is falsy the process is treated as the only member
    of a cluster of size one and ``(0, 1)`` is returned without importing
    KungFu. Otherwise both values are queried from ``kungfu.ops``.
    """
    if not use_kungfu:
        return 0, 1

    # Imported lazily so the non-KungFu path has no dependency on it.
    from kungfu.ops import current_cluster_size, current_rank
    rank = current_rank()
    cluster_size = current_cluster_size()
    return rank, cluster_size
예제 #7
0
def show_info_example():
    """Print this peer's rank and the current cluster size."""
    peer_rank = current_rank()
    cluster_size = current_cluster_size()
    message = 'rank=%d, np=%d' % (peer_rank, cluster_size)
    print(message)
예제 #8
0
def test_peer_info():
    """Query the peer's rank and the cluster size and print both."""
    # Tuple elements are evaluated left to right: rank first, then size.
    info = (current_rank(), current_cluster_size())
    print('rank=%d, np=%d' % info)
예제 #9
0
    return gs


# Placeholders feeding the cluster-resize op: a string checkpoint and the
# requested int32 cluster size.
ckpt = tf.placeholder(tf.string)
new_size = tf.placeholder(tf.int32)
resize_op = resize_cluster(ckpt, new_size)

init = tf.global_variables_initializer()

# barrier_op = barrier()

with tf.Session() as sess:
    sess.run(init)

    # Step to resume from, obtained by restoring the initial checkpoint.
    init_gs = restore(get_init_checkpoint())
    np = current_cluster_size()
    # Cluster size the schedule prescribes for the restored step.
    init_np = get_cluster_size(init_gs, cluster_size_schedule, np)
    if np != init_np:
        # Only warn; execution continues with the actual cluster size.
        print(
            '[W] init cluster size (np=%d) is not consistent with schedule (np=%d)'
            % (np, init_np))

    # NOTE(review): t0 is read here before it is assigned in this section;
    # it is presumably set earlier in the file -- confirm.
    print('restored from %d, np=%d, init_np=%d, start took %s' %
          (init_gs, np, init_np, show_duration(time.time() - t0)))

    # Run the remaining steps, timing each one individually.
    for gs in range(init_gs, max_step):
        t0 = time.time()
        v = sess.run(y)
        print('step %d, result: %d, np=%d, took %s' %
              (gs, v, np, show_duration(time.time() - t0)))
예제 #10
0
 def __init__(self, optimizer, name=None, use_locking=False):
     """Wrap ``optimizer`` and record the cluster size and this peer's rank.

     Args:
         optimizer: Optimizer instance forwarded to the parent constructor.
         name: Optional name forwarded to the parent constructor.
         use_locking: Forwarded to the parent constructor as a keyword.
     """
     parent = super(SyncModelAveragingSGDOptimizer, self)
     parent.__init__(optimizer, name, use_locking=use_locking)

     # Both values are queried once, at construction time.
     self._num_workers = current_cluster_size()
     self._rank = current_rank()