def build_ops():
    """Build the cluster-resize op, seeding the step counter from the initial step.

    The initial step is obtained from ``_get_init_step()``, converted to
    ``int`` and printed. ``config`` is not a parameter of this function; it is
    looked up from the enclosing scope.

    Returns:
        A ``(init_step, resize_op)`` tuple: the initial step as an ``int``
        and the value returned by ``resize_cluster``.
    """
    first_step = int(_get_init_step())
    print('init_step is %d' % first_step)

    step_counter = counter(first_step)
    # The schedule is built before the checkpoint string so that ops are
    # created in the same order as before.
    scheduled_size = step_based_schedule(config, step_counter)

    # The checkpoint string passed to resize_cluster is the counter value
    # plus one, rendered as a string tensor.
    # NOTE(review): presumably this labels the step after the current one — confirm.
    return first_step, resize_cluster(
        tf.as_string(step_counter + 1),
        scheduled_size,
    )
def _build_resize_op(self, config, init_step):
    """Build the cluster-resize op for a counter that starts at ``init_step``.

    Args:
        config: Passed unchanged as the first argument of
            ``step_based_schedule``.
        init_step: Value handed to ``counter`` to create the step counter.

    Returns:
        The value returned by ``resize_cluster``, called with the string
        form of ``step + 1`` and the result of ``step_based_schedule``.
    """
    step_counter = counter(init_step)
    # Build the schedule first, then the checkpoint string, so ops are
    # created in the same order as before.
    target_size = step_based_schedule(config, step_counter)
    return resize_cluster(tf.as_string(step_counter + 1), target_size)
print('[W] not scheduled for %d' % (i)) return old x = tf.Variable(tf.ones([], dtype=tf.int32)) y = all_reduce(x) def restore(checkpoint): gs = int(checkpoint) return gs ckpt = tf.placeholder(tf.string) new_size = tf.placeholder(tf.int32) resize_op = resize_cluster(ckpt, new_size) init = tf.global_variables_initializer() # barrier_op = barrier() with tf.Session() as sess: sess.run(init) init_gs = restore(_get_init_step()) np = current_cluster_size() init_np = get_cluster_size(init_gs, cluster_size_schedule, np) if np != init_np: print( '[W] init cluster size (np=%d) is not consistent with schedule (np=%d)' % (np, init_np))