Пример #1
0
    def begin(self):
        """Create the graph ops this hook needs before the session starts.

        Sets up the local step counter, a scalar int32 placeholder for
        feeding it, the variable-broadcast op, an all_reduce (op='max') of
        the fed step, and the resize ops built from `self._schedule`.
        """
        # Graph ops are created in the same order as before: broadcast,
        # placeholder, all_reduce, then the resize ops.
        self._sync_op = BroadcastGlobalVariablesOp()

        self._step = 0
        step_input = tf.placeholder(dtype=tf.int32, shape=())
        self._step_place = step_input
        self._sync_step_op = all_reduce(step_input, op='max')

        resize_op, new_size_op = self._build_resize_op(self._schedule,
                                                       step_input)
        self._resize_op = resize_op
        self._new_size_op = new_size_op
Пример #2
0
    def begin(self):
        """Create the counters and graph ops this hook needs.

        Resets the step and trained-sample counters, then builds a scalar
        int32 placeholder for the trained-sample count, an all_reduce
        (op='max') over it, the variable-broadcast op, and the resize op
        returned by `resize_cluster_from_url()`.
        """
        self._step = 0
        self._trained_samples = 0

        samples_input = tf.placeholder(dtype=tf.int32, shape=())
        self._trained_samples_place = samples_input
        self._sync_offset_op = all_reduce(samples_input, op='max')

        self._sync_state_op = BroadcastGlobalVariablesOp()
        self._resize_op = resize_cluster_from_url()
Пример #3
0
def all_reduce_example():
    """Evaluate all_reduce on a scalar int32 variable five times.

    Builds a variable initialised to one, wraps it in `all_reduce`, and
    prints the value obtained from each of five session runs.
    """
    ones_var = tf.Variable(tf.ones([], tf.int32))
    reduced = all_reduce(ones_var)
    init_op = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init_op)
        for step in range(5):
            print('step %d, result: %d' % (step, session.run(reduced)))
Пример #4
0
def worker(rank):
    """Print rank information and the all_reduce of `rank` for this worker.

    Args:
        rank: integer rank supplied by the caller; it is printed and used to
            scale the scalar int32 variable fed to `all_reduce`.
    """
    from kungfu.python import current_cluster_size, current_rank
    from kungfu.tensorflow.ops import all_reduce

    print('rank=%d' % (rank))
    kf_rank = current_rank()
    kf_size = current_cluster_size()
    print('kungfu rank: %d, size %d' % (kf_rank, kf_size))

    one = tf.Variable(tf.ones(shape=(), dtype=tf.int32))
    reduced = all_reduce(one * rank)
    init_op = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init_op)
        result = session.run(reduced)
        print('v=%s' % (result))
Пример #5
0
def main():
    """Run a fake training loop that resizes the cluster on a fixed schedule.

    Each iteration runs the fake train op for the current global step. If
    that step appears in the schedule, the resize op is run with the
    scheduled size. After a resize the global step is re-synchronised
    (all_reduce with op='max') on the next iteration. The loop stops when
    `detached()` is true after a resize, or when the incremented global step
    reaches `args.max_step`.
    """
    # step -> new_size
    fake_schedule = {10: 2, 20: 3, 40: 4, 50: 1}
    args = parse_args()

    global_step = tf.train.get_or_create_global_step()
    sync_step_op = tf.assign(global_step, all_reduce(global_step, op='max'))
    bump_step_op = tf.assign_add(global_step, 1)
    size_place = tf.placeholder(dtype=tf.uint32)
    resize_op = resize(size_place)
    train_op = build_fake_train_op(args.use_nccl)
    init_op = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init_op)
        pending_sync = True
        while True:
            if pending_sync:
                sess.run(sync_step_op)
                pending_sync = False

            step = sess.run(global_step)

            # BEGIN train
            outputs = sess.run(train_op)
            print('step %d, result: %d' % (step, outputs[0].sum()))
            # END train

            if step in fake_schedule:
                target_size = fake_schedule[step]
                changed = sess.run(resize_op,
                                   feed_dict={size_place: target_size})
                if not changed:
                    print('cluster not changed')
                else:
                    pending_sync = True
                    if detached():
                        break
                # A scheduled resize is expected to always change the cluster.
                assert changed

            step_after = sess.run(bump_step_op)
            print('finished %s' % (step_after - 1))
            if step_after >= args.max_step:
                break

    print('stopped')
Пример #6
0
def test_set_tree(steps, warmup_steps=10):
    """Check all_reduce stays correct while repeatedly installing new trees.

    Every step runs an all_reduce over ``range(32)`` and asserts the summed
    result, then feeds a freshly generated tree to `set_tree` and times that
    call. Durations of steps after the first `warmup_steps` are summarised
    in milliseconds.

    Args:
        steps: number of timed steps to run after warm-up.
        warmup_steps: number of initial steps whose timings are discarded.
    """
    from kungfu._utils import one_based_range
    from kungfu.python import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree

    n = current_cluster_size()

    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))

    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)
    # magic * (magic - 1) is always even, so integer division is exact and
    # keeps the comparison in integers.
    expected_sum = n * magic * (magic - 1) // 2

    init = tf.global_variables_initializer()

    durations = []
    with tf.Session() as sess:
        sess.run(init)
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert v.sum() == expected_sum

            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0

            # Only steps after the warm-up phase contribute to the timings.
            if step > warmup_steps:
                durations.append(dur)

    if not durations:
        # min()/max() of an empty array raise ValueError, so report the
        # degenerate case explicitly instead of crashing.
        print('test set_tree OK among %d peers, no timed steps' % (n))
        return

    ds = np.array([d * 1000 for d in durations])
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))
Пример #7
0
def run(sess, train_op, bcast_op):
    """Benchmark `train_op`, re-synchronising whenever the cluster changes.

    Args:
        sess: session used to run every op in this function.
        train_op: op timed `args.num_batches_per_iter` times per iteration.
        bcast_op: op run after each step synchronisation; skipped when falsy.

    Returns:
        None. Returns early, without logging the final result, as soon as
        the resize op reports a falsy `keep` flag.
    """
    if args.num_batches_per_iter > 1:
        print('--num-batches-per-iter == 1 is highly recommended, using %d' %
              (args.num_batches_per_iter))
    from kungfu.tensorflow.ops import all_reduce, resize_cluster_from_url
    step_place = tf.placeholder(dtype=tf.int32, shape=())
    sync_step_op = all_reduce(step_place, op='max')
    resize_op = resize_cluster_from_url()
    # Benchmark
    log('Running benchmark...')
    img_secs = []
    need_sync = True
    step = 0
    while step < args.num_iters:
        if need_sync:
            # Adopt the step returned by the all_reduce (op='max') of the
            # locally fed step.
            new_step = sess.run(sync_step_op, feed_dict={step_place: step})
            if new_step != step:
                print('sync step : %d -> %d' % (step, new_step))
            step = new_step
            if bcast_op:
                # Run on the session passed in; the original referenced the
                # name `session`, which is not a parameter of this function.
                duration, _ = measure(lambda: sess.run(bcast_op))
                log('bcast_op took %.3fs' % (duration))
            need_sync = False
        step += 1
        # Named `elapsed` so the conventional `time` module name is not
        # shadowed inside this function.
        elapsed = timeit.timeit(lambda: sess.run(train_op),
                                number=args.num_batches_per_iter)
        img_sec = args.batch_size / elapsed
        log('Iter #%d: %.1f img/sec per %s' % (step, img_sec, device))
        img_secs.append(img_sec)

        changed, keep = sess.run(resize_op)
        if not keep:
            return
        if changed:
            need_sync = True

    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)
Пример #8
0
    max_step = step_per_stage * len(stage_sizes)
    return config, max_step


# Module-level schedule configuration and total number of steps, as returned
# by get_config().
config, max_step = get_config()


def build_ops():
    """Build the step placeholder and the ops that depend on it.

    Returns:
        A tuple `(step_place, resize_op, new_step_op)`: the scalar int32
        placeholder, the op from `resize_cluster_from_url()`, and the op
        from `step_based_schedule(config, step_place)`.
    """
    step_input = tf.placeholder(dtype=tf.int32, shape=())
    schedule_op = step_based_schedule(config, step_input)
    # The tuple is evaluated left to right, so the resize op is still
    # created after the schedule op.
    return step_input, resize_cluster_from_url(), schedule_op


# Module-level graph construction: the step placeholder with its resize and
# schedule ops, plus an all_reduce (op='max') over the fed step value.
step_place, resize_op, new_step_op = build_ops()
sync_step_op = all_reduce(step_place, op='max')
# NOTE(review): the second positional parameter of tf.Variable appears to be
# `trainable`, not `dtype` -- confirm and consider passing dtype=tf.int32.
x = tf.Variable(1, tf.int32)
y = all_reduce(x)

# Assigns the result of broadcast(x) back into x.
sync_state_op = tf.assign(x, broadcast(x))
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    need_sync = True
    i = 0
    while i < max_step:
        if need_sync:
            # Feed the local step and adopt whatever sync_step_op returns.
            new_step = sess.run(sync_step_op, feed_dict={step_place: i})
            print('sync step: %d -> %d' % (i, new_step))
            i = new_step
            # NOTE(review): the visible excerpt ends here; nothing shown
            # resets need_sync or increments i -- confirm against the full
            # file.
Пример #9
0
# Parse args.schedule into the cluster-size schedule and the maximum step.
# get_cluster_size() below iterates a schedule as (start, end, size) triples;
# presumably this is that schedule -- confirm against parse_schedule.
cluster_size_schedule, max_step = parse_schedule(args.schedule)
# print(cluster_size_schedule)
# print(max_step)


def get_cluster_size(i, sch, old):
    """Look up the cluster size scheduled for step `i`.

    Args:
        i: step to look up.
        sch: iterable of `(start, end, size)` triples; a triple matches when
            `start <= i < end`.
        old: value returned when no triple matches.

    Returns:
        The size of the first matching triple, or `old` (after printing a
        warning) when the step is not covered by the schedule.
    """
    not_found = object()
    size = next(
        (n for start, end, n in sch if start <= i < end),
        not_found,
    )
    if size is not not_found:
        return size
    print('[W] not scheduled for %d' % (i))
    return old


# Scalar int32 variable initialised to one, and the op that all-reduces it.
x = tf.Variable(tf.ones([], dtype=tf.int32))
y = all_reduce(x)


def restore(checkpoint):
    """Return `checkpoint` converted to an int.

    Args:
        checkpoint: any value accepted by `int()`.
    """
    return int(checkpoint)


# int32 placeholder; it is not fed to any op in the lines visible here.
new_size = tf.placeholder(tf.int32)
# Resize op built without arguments by resize_cluster_from_url().
resize_op = resize_cluster_from_url()

init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Initialise all global variables before anything else runs.
    sess.run(init)
    # NOTE(review): the excerpt appears to be cut off after this line --
    # confirm the rest of the session body against the full file.
        return '%.2fs' % duration
    sec = int(duration)
    mm, ss = sec / 60, sec % 60
    if duration < 3600:
        return '%dm%ds' % (mm, ss)
    return '%dh%dm%ds' % (mm / 60, mm % 60, ss)


# x = tf.Variable(tf.ones([], dtype=tf.int32))
# (10, 1) int32 tensor of ones. Calling .numpy() on it presumably requires
# eager execution -- confirm.
x = tf.ones((10, 1), dtype=tf.int32)
print(x.numpy())

steps = 10
# Collects the duration of each reshape_strategy() call only; the all_reduce
# time is printed but not recorded.
mean_time = []
for i in range(steps):

    # reshape strategy before AllReduce to bypass straggler node
    t1 = time.time()
    keep = reshape_strategy(debug=False)
    iteration_time = time.time() - t1
    print('reshape took %s' % (show_duration(iteration_time)))

    t0 = time.time()
    v = all_reduce(x)
    print('all reduce step %d, took %s' % (i, show_duration(time.time() - t0)))

    mean_time.append(iteration_time)
    # Stop as soon as reshape_strategy() returns a falsy value.
    if not keep:
        break
# Mean reshape duration in seconds over the iterations that ran.
print(np.mean(mean_time))
Пример #11
0
 def begin(self):
     """Create the ops that synchronise the global step and variables.

     `_sync_step_op` assigns the all_reduce (op='max') of the global step
     back to the global step; `_sync_state_op` is the op returned by
     `BroadcastGlobalVariablesOp()`.
     """
     step_var = tf.train.get_or_create_global_step()
     self._sync_step_op = tf.assign(step_var, all_reduce(step_var, op='max'))
     from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
     self._sync_state_op = BroadcastGlobalVariablesOp()