Example #1
def _keras_callback_on_batch_end(callback, batch, logs=None):
    """broadcast should be done after the first gradient step to ensure optimizer initialization."""
    if callback.broadcast_done:
        return

    if _tf_major_version == 2:
        if hasattr(callback.model, 'variables'):
            for v in callback.model.variables:
                _tf_assign(v, broadcast(v))

            opt_variables = None
            if hasattr(callback.model.optimizer, 'variables'):
                opt_variables = callback.model.optimizer.variables()
            else:
                opt_variables = callback.model.optimizer.optimizer.variables()

            # print(opt_variables)
            for v in opt_variables:
                _tf_assign(v, broadcast(v))
        else:
            raise RuntimeError('No variables() in %s' % callback.model)

    if _tf_major_version == 1:
        tf.keras.backend.get_session().run(BroadcastGlobalVariablesOp())

    callback.broadcast_done = True
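
For context, a minimal sketch of how such a helper could be wired into a Keras callback; the BroadcastCallback class and its wiring are illustrative assumptions, not part of the source.

import tensorflow as tf

class BroadcastCallback(tf.keras.callbacks.Callback):
    # Hypothetical wrapper: only the broadcast_done flag and the inherited
    # model attribute are required by _keras_callback_on_batch_end above.
    def __init__(self):
        super().__init__()
        self.broadcast_done = False

    def on_batch_end(self, batch, logs=None):
        # Broadcasts model and optimizer variables exactly once, after the
        # first gradient step has created the optimizer's slot variables.
        _keras_callback_on_batch_end(self, batch, logs)

# Usage: model.fit(xs, ys, callbacks=[BroadcastCallback()])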
Example #2
def BroadcastGlobalVariablesOp():
    """A TensorFlow operator that broadcasts global variables.

    This operator is often used with the low-level ``tf.Session``.
    """
    ops = [tf.assign(v, broadcast(v)) for v in tf.global_variables()]
    return tf.group(ops)
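
As the docstring suggests, the op pairs with a low-level tf.Session. A minimal usage sketch, assuming a TF1 graph-mode program:

import tensorflow as tf

bcast_op = BroadcastGlobalVariablesOp()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Overwrite every worker's global variables with rank 0's values
    # before training begins.
    sess.run(bcast_op)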
Example #3
def broadcast_variables(variables):
    """A TensorFlow function that broadcasts global variables.

    This function is often used with ``tf.GradientTape`` or embedded as part of a training program.
    """
    for v in variables:
        _tf_assign(v, broadcast(v))
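
A sketch of the tf.GradientTape pattern the docstring mentions; the model, optimizer, and loss below are illustrative placeholders, not from the source.

import tensorflow as tf

@tf.function
def train_step(model, optimizer, xs, ys, first_batch):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(model(xs) - ys))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    # Broadcast once, after the first apply_gradients call, so the
    # optimizer's slot variables exist before they are synchronized.
    if first_batch:
        broadcast_variables(model.variables)
        broadcast_variables(optimizer.variables())
    return loss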
Example #4
    def distributed_initializer(self):
        bcast_ops = []
        for v in self.variables():
            bcast_ops.append(tf.assign(v, broadcast(v)))

        with tf.control_dependencies(bcast_ops):
            with tf.control_dependencies([self._save_model_op]):
                return barrier()
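
A hedged usage sketch: assuming opt is an instance of the optimizer class defining distributed_initializer, running the returned op once broadcasts the variables and blocks on the barrier.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # opt is assumed to be constructed elsewhere; running the initializer
    # executes the broadcast assigns, the save-model op, then the barrier.
    sess.run(opt.distributed_initializer())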
Example #5
def test_broadcast():
    from kungfu.python import current_rank
    from kungfu.tensorflow.ops import broadcast
    v = tf.Variable(current_rank() == 0)
    u = broadcast(v)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        x = sess.run(v)
        y = sess.run(u)
        print(x, y)
Example #6
def test_broadcast():
    from kungfu.python import current_rank
    from kungfu.tensorflow.ops import broadcast
    v = tf.Variable(current_rank() == 0)
    u = broadcast(v)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        x = sess.run(v)
        y = sess.run(u)
        # print(x, y)
        assert y == True
Example #7
def build_ops():
    optimizer = build_optimizer()

    x = tf.Variable(1.0, dtype=tf.float32)
    y = x * x
    train_step = optimizer.minimize(y)

    sync_op = tf.assign(x, broadcast(x))
    init_op = tf.global_variables_initializer()

    return init_op, sync_op, train_step, y
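
A minimal driver sketch for these ops; build_optimizer() is assumed to return a distributed TF1 optimizer, and the step count is arbitrary.

init_op, sync_op, train_step, y = build_ops()
with tf.Session() as sess:
    sess.run(init_op)
    sess.run(sync_op)  # align x across workers before training
    for _ in range(10):
        sess.run(train_step)
    print(sess.run(y))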
Example #8
def test_set_tree(steps, warmup_steps=10):
    from kungfu.python import current_cluster_size
    from kungfu.tensorflow.ops import all_reduce, broadcast
    from kungfu.tensorflow.ops.adapt import set_tree

    n = current_cluster_size()

    tree_place = tf.placeholder(dtype=tf.int32, shape=(n, ))
    set_tree_op = set_tree(broadcast(tree_place))

    magic = 32
    x = tf.Variable(list(range(magic)), dtype=tf.int32)
    y = all_reduce(x)

    init = tf.global_variables_initializer()

    durations = []
    with tf.Session() as sess:
        sess.run(init)
        from kungfu._utils import one_based_range
        for step in one_based_range(steps + warmup_steps):
            v = sess.run(y)
            assert (v.sum() == n * magic * (magic - 1) / 2)
            # print(v)

            tree = gen_tree(n)
            t0 = time.time()
            sess.run(set_tree_op, feed_dict={
                tree_place: tree,
            })
            dur = time.time() - t0

            if step > warmup_steps:
                durations.append(dur)

    ds = np.array([d * 1000 for d in durations])
    print(
        'test set_tree OK for %d times among %d peers, took ~ %f <- [%f, %f] (ms)'
        % (len(ds), n, ds.mean(), ds.min(), ds.max()))
Example #9
config, max_step = get_config()


def build_ops():
    step_place = tf.placeholder(dtype=tf.int32, shape=())
    new_step_op = step_based_schedule(config, step_place)
    resize_op = resize_cluster_from_url()
    return step_place, resize_op, new_step_op


step_place, resize_op, new_step_op = build_ops()
sync_step_op = all_reduce(step_place, op='max')
x = tf.Variable(1, dtype=tf.int32)
y = all_reduce(x)

sync_state_op = tf.assign(x, broadcast(x))
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    need_sync = True
    i = 0
    while i < max_step:
        if need_sync:
            new_step = sess.run(sync_step_op, feed_dict={step_place: i})
            print('sync step: %d -> %d' % (i, new_step))
            i = new_step
            sess.run(sync_state_op)

        print(i)
        v = sess.run(y)

        # must be called exactly once per step; following the pattern of the
        # next example, the resize op reports whether a re-sync is needed and
        # whether this worker should keep running
        need_sync, keep = sess.run(resize_op)
        if not keep:
            break
        i += 1
Example #10
def build_ops():
    init_step = int(_get_init_step())
    print('init_step is %d' % (init_step))

    step = counter(init_step)
    schedule = step_based_schedule(config, step)
    ckpt_tensor = tf.as_string(step + 1)
    resize_op = resize_cluster(ckpt_tensor, schedule)
    return init_step, resize_op


init_step, step_op = build_ops()
x = tf.Variable(1, dtype=tf.int32)
y = all_reduce(x)

sync_op = tf.assign(x, broadcast(x))
init_op = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op)
    need_sync = True
    for i in range(init_step, max_step):
        if need_sync:
            sess.run(sync_op)

        print(i)
        v = sess.run(y)
        print('step %d, np=%d' % (i, v))

        # must be called exactly once per step
        need_sync, keep = sess.run(step_op)
Example #11
    def begin(self):
        from kungfu.tensorflow.ops import broadcast
        self._ops = [tf.assign(v, broadcast(v)) for v in tf.global_variables()]
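
This begin() belongs to a tf.train.SessionRunHook. A minimal sketch of the companion method that would typically run the assembled ops; the body is an assumption based on the ops built in begin(), not taken from the source.

    def after_create_session(self, session, coord):
        # Run the broadcast assignments once the session exists, so every
        # worker starts from rank 0's global variables.
        session.run(self._ops)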