示例#1
0
def build_ops():
    """Create the cluster-resize op and report the step to start from.

    The starting step is obtained from ``_get_init_step()``, coerced to
    ``int`` and echoed to stdout. ``config`` is not a parameter of this
    function; it is read from the enclosing scope.

    Returns:
        A ``(init_step, resize_op)`` tuple: the integer starting step and
        the value returned by ``resize_cluster`` for the inputs built here.
    """
    start = int(_get_init_step())
    print('init_step is %d' % start)

    # Creation order (counter -> schedule -> string conversion -> resize)
    # is kept as-is so the ops are built in the same sequence as before.
    current_step = counter(start)
    target_size = step_based_schedule(config, current_step)
    # resize_cluster receives step + 1, rendered as a string.
    next_step_str = tf.as_string(current_step + 1)
    return start, resize_cluster(next_step_str, target_size)
示例#2
0
 def _build_resize_op(self, config, init_step):
     """Build the op that resizes the cluster according to a step schedule.

     Args:
         config: Passed unchanged to ``step_based_schedule``.
         init_step: Passed unchanged to ``counter`` to create the step value.

     Returns:
         The value returned by ``resize_cluster`` when given ``step + 1``
         (converted to a string) and the size produced by the schedule.
     """
     # Same construction order as before: counter, schedule, string, resize.
     current_step = counter(init_step)
     target_size = step_based_schedule(config, current_step)
     next_step_str = tf.as_string(current_step + 1)
     return resize_cluster(next_step_str, target_size)
示例#3
0
    print('[W] not scheduled for %d' % (i))
    return old


# Scalar (shape []) int32 variable whose initial value is 1.
x = tf.Variable(tf.ones([], dtype=tf.int32))
# Result of passing x through all_reduce (defined elsewhere).
# NOTE(review): presumably a reduction across all peers - confirm which
# reduction all_reduce performs.
y = all_reduce(x)


def restore(checkpoint):
    """Return the global step encoded by ``checkpoint`` as an ``int``.

    Args:
        checkpoint: Any value accepted by the built-in ``int()``, e.g. a
            decimal string such as ``"42"``.

    Returns:
        ``int(checkpoint)``.

    Raises:
        ValueError: If ``checkpoint`` is a string that is not a valid
            integer literal.
        TypeError: If ``checkpoint`` is of a type ``int()`` cannot convert.
    """
    return int(checkpoint)


# Placeholders to be fed at run time: a string checkpoint value and an
# int32 cluster size.
ckpt = tf.placeholder(tf.string)
new_size = tf.placeholder(tf.int32)
# Op built from the two placeholders above; resize_cluster is defined elsewhere.
resize_op = resize_cluster(ckpt, new_size)

# Initializer for all global variables created so far in the graph.
init = tf.global_variables_initializer()

# barrier_op = barrier()

with tf.Session() as sess:
    # Initialize variables before anything else is run in this session.
    sess.run(init)

    # Starting global step, converted to int by restore().
    init_gs = restore(_get_init_step())
    # Cluster size as reported by current_cluster_size().
    # NOTE(review): the name `np` would shadow a numpy alias if this file
    # imports numpy as np - confirm and consider renaming.
    np = current_cluster_size()
    # Cluster size the schedule yields for the starting step.
    # NOTE(review): the third argument looks like a fallback used when the
    # step is not scheduled - confirm against get_cluster_size.
    init_np = get_cluster_size(init_gs, cluster_size_schedule, np)
    if np != init_np:
        # A mismatch is reported as a warning on stdout.
        print(
            '[W] init cluster size (np=%d) is not consistent with schedule (np=%d)'
            % (np, init_np))