def testWaitForSessionLocalInit(self):
  server = server_lib.Server.create_local_server()
  with ops.Graph().as_default() as graph:
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=w.initializer)

    # Initialize v but not w
    s = session_lib.Session(server.target, graph=graph)
    s.run(v.initializer)

    sess = sm.wait_for_session(server.target, max_wait_secs=3)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
    self.assertEqual(1, sess.run(w))
def testPrepareSessionWithReadyForLocalInitOp(self):
  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.test_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=w.initializer)
    sess = sm2.prepare_session("", init_op=v.initializer)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
    self.assertEqual(1, sess.run(w))
def testPrepareSessionFails(self):
  checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
  checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
    gfile.DeleteRecursively(checkpoint_dir2)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  with ops.Graph().as_default():
    v = variables.Variable([1.0, 2.0, 3.0], name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess = sm.prepare_session(
        "",
        init_op=variables.global_variables_initializer(),
        saver=saver,
        checkpoint_dir=checkpoint_dir)
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
    checkpoint_filename = os.path.join(checkpoint_dir,
                                       "prepare_session_checkpoint")
    saver.save(sess, checkpoint_filename)
  # Create a new Graph and SessionManager and recover.
  with ops.Graph().as_default():
    # Renames the checkpoint directory.
    os.rename(checkpoint_dir, checkpoint_dir2)
    gfile.MakeDirs(checkpoint_dir)
    v = variables.Variable([6.0, 7.0, 8.0], name="v")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
    session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    # This should fail as there's no checkpoint within 2 seconds.
    with self.assertRaisesRegex(
        RuntimeError, "no init_op or init_fn or local_init_op was given"):
      sess = sm.prepare_session(
          "",
          init_op=None,
          saver=saver,
          checkpoint_dir=checkpoint_dir,
          wait_for_checkpoint=True,
          max_wait_secs=2)
    # Rename the checkpoint directory back.
    gfile.DeleteRecursively(checkpoint_dir)
    os.rename(checkpoint_dir2, checkpoint_dir)
    # This should succeed as there's a checkpoint.
    sess = sm.prepare_session(
        "",
        init_op=None,
        saver=saver,
        checkpoint_dir=checkpoint_dir,
        wait_for_checkpoint=True,
        max_wait_secs=2)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
  # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
  # which causes recover_session to not run local_init_op, and to return
  # initialized=False

  # Create a checkpoint.
  checkpoint_dir = os.path.join(
      self.get_temp_dir(),
      "recover_session_ready_for_local_init_fails_to_ready_local")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm.recover_session(
        "", saver=saver, checkpoint_dir=checkpoint_dir)
    self.assertFalse(initialized)
    sess.run(v.initializer)
    self.assertEqual(1, sess.run(v))
    saver.save(sess,
               os.path.join(checkpoint_dir, "recover_session_checkpoint"))
  # Create a new Graph and SessionManager and recover.
  with ops.Graph().as_default():
    v = variables.Variable(2, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(),
        local_init_op=w.initializer)
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm2.recover_session(
        "", saver=saver, checkpoint_dir=checkpoint_dir)
    self.assertFalse(initialized)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        False,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
  # This test checks for backwards compatibility.
  # In particular, we continue to ensure that recover_session will execute
  # local_init_op exactly once, regardless of whether the session was
  # successfully recovered.
  with ops.Graph().as_default():
    w = variables.Variable(
        1,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    # Try to recover session from None
    sess, initialized = sm2.recover_session(
        "", saver=None, checkpoint_dir=None)
    # Succeeds because recover_session still runs local_init_op
    self.assertFalse(initialized)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(w))
def testRecoverSession(self):
  # Create a checkpoint.
  checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm.recover_session(
        "", saver=saver, checkpoint_dir=checkpoint_dir)
    self.assertFalse(initialized)
    sess.run(v.initializer)
    self.assertEqual(1, sess.run(v))
    saver.save(sess,
               os.path.join(checkpoint_dir, "recover_session_checkpoint"))

  self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
  self._test_recovered_variable(
      checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
          checkpoint_dir))
  # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
  with self.assertRaises(ValueError):
    self._test_recovered_variable(
        checkpoint_dir=checkpoint_dir,
        checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
            checkpoint_dir))
def _init_ready_op(self,
                   ready_op=USE_DEFAULT,
                   ready_for_local_init_op=USE_DEFAULT):
  """Initializes ready_op.

  Args:
    ready_op: `Tensor` to check if the model is initialized.
      If it's set to USE_DEFAULT, creates an op that checks all
      the variables are initialized.
    ready_for_local_init_op: `Tensor` to check if the model is ready
      to run local_init_op. If it's set to USE_DEFAULT, creates an
      op that checks all the global variables are initialized.
  """
  if ready_op is Supervisor.USE_DEFAULT:
    ready_op = self._get_first_op_from_collection(ops.GraphKeys.READY_OP)
    if ready_op is None:
      ready_op = variables.report_uninitialized_variables()
      ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
  self._ready_op = ready_op

  # ready_for_local_init_op defaults to None for backward compatibility
  if ready_for_local_init_op is Supervisor.USE_DEFAULT:
    ready_for_local_init_op = self._get_first_op_from_collection(
        ops.GraphKeys.READY_FOR_LOCAL_INIT_OP)
  self._ready_for_local_init_op = ready_for_local_init_op
def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
  # This test checks for backwards compatibility.
  # In particular, we continue to ensure that recover_session will execute
  # local_init_op exactly once, regardless of whether the session was
  # successfully recovered.
  with ops.Graph().as_default():
    w = variables.VariableV1(
        1,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    # Try to recover session from None
    sess, initialized = sm2.recover_session(
        "", saver=None, checkpoint_dir=None)
    # Succeeds because recover_session still runs local_init_op
    self.assertFalse(initialized)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(w))
def testRecoverSession(self):
  # Create a checkpoint.
  checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  with ops.Graph().as_default():
    v = variables.VariableV1(1, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm.recover_session(
        "", saver=saver, checkpoint_dir=checkpoint_dir)
    self.assertFalse(initialized)
    sess.run(v.initializer)
    self.assertEqual(1, sess.run(v))
    saver.save(sess,
               os.path.join(checkpoint_dir, "recover_session_checkpoint"))

  self._test_recovered_variable(checkpoint_dir=checkpoint_dir)
  self._test_recovered_variable(
      checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
          checkpoint_dir))
  # Cannot set both checkpoint_dir and checkpoint_filename_with_path.
  with self.assertRaises(ValueError):
    self._test_recovered_variable(
        checkpoint_dir=checkpoint_dir,
        checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
            checkpoint_dir))
def _init_ready_op(self,
                   ready_op=USE_DEFAULT,
                   ready_for_local_init_op=USE_DEFAULT):
  """Initializes ready_op.

  Args:
    ready_op: `Tensor` to check if the model is initialized.
      If it's set to USE_DEFAULT, creates an op that checks all
      the variables are initialized.
    ready_for_local_init_op: `Tensor` to check if the model is ready
      to run local_init_op. If it's set to USE_DEFAULT, creates an
      op that checks all the global variables are initialized.
  """
  if ready_op is Supervisor.USE_DEFAULT:
    ready_op = self._get_first_op_from_collection(ops.GraphKeys.READY_OP)
    if ready_op is None:
      ready_op = variables.report_uninitialized_variables()
      ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
  self._ready_op = ready_op

  # ready_for_local_init_op defaults to None for backward compatibility
  if ready_for_local_init_op is Supervisor.USE_DEFAULT:
    ready_for_local_init_op = self._get_first_op_from_collection(
        ops.GraphKeys.READY_FOR_LOCAL_INIT_OP)
  self._ready_for_local_init_op = ready_for_local_init_op
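# A minimal usage sketch (not part of the original file; the variable names
# are illustrative only): registering an op under READY_FOR_LOCAL_INIT_OP
# lets _init_ready_op above pick it up when the argument is left at
# USE_DEFAULT, exactly as the collection lookup in its last branch does.
with ops.Graph().as_default() as g:
  v = variables.VariableV1(1.0, name="v")
  w = variables.VariableV1(
      v,
      trainable=False,
      collections=[ops.GraphKeys.LOCAL_VARIABLES],
      name="w")
  ops.add_to_collection(
      ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
      variables.report_uninitialized_variables(variables.global_variables()))
  # The Supervisor now resolves ready_for_local_init_op from the collection
  # instead of leaving it as None.
  sv = supervisor.Supervisor(
      graph=g, init_op=v.initializer, local_init_op=w.initializer)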
def get_session(is_chief):
  g = ops.Graph()
  with g.as_default():
    with ops.device("/job:localhost"):
      v = variables.VariableV1(
          1, name="default_ready_for_local_init_op_v_" + str(uid))
      vadd = v.assign_add(1)
      w = variables.VariableV1(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="default_ready_for_local_init_op_w_" + str(uid))
      ready_for_local_init_op = variables.report_uninitialized_variables(
          variables.global_variables())
  sv = supervisor.Supervisor(
      logdir=logdir,
      is_chief=is_chief,
      graph=g,
      recovery_wait_secs=1,
      init_op=v.initializer,
      ready_for_local_init_op=ready_for_local_init_op)
  sess = sv.prepare_or_wait_for_session(server.target)

  return sv, sess, v, vadd, w
def testWaitForSessionLocalInit(self):
  server = server_lib.Server.create_local_server()
  with ops.Graph().as_default() as graph:
    v = variables.VariableV1(1, name="v")
    w = variables.VariableV1(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=w.initializer)

    # Initialize v but not w
    s = session_lib.Session(server.target, graph=graph)
    s.run(v.initializer)

    sess = sm.wait_for_session(server.target, max_wait_secs=3)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
    self.assertEqual(1, sess.run(w))
def testPrepareSessionSucceedsWithInitFn(self):
  with ops.Graph().as_default():
    v = variables.Variable([125], name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session(
        "", init_fn=lambda sess: sess.run(v.initializer))
    self.assertAllClose([125], sess.run(v))
def testPrepareSessionSucceeds(self):
  with ops.Graph().as_default():
    v = variables.Variable([1.0, 2.0, 3.0], name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session(
        "", init_op=variables.global_variables_initializer())
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
def testAssertVariablesInitialized(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    v = variables.Variable([1, 2], name="v")
    w = variables.Variable([3, 4], name="w")
    _ = v, w
    uninited = variables.report_uninitialized_variables()
    self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
    self.evaluate(variables.global_variables_initializer())
    self.assertEqual(0, self.evaluate(uninited).size)
def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
  with ops.Graph().as_default() as graph:
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(),
        local_init_op=w.initializer)

    with self.assertRaises(errors_impl.DeadlineExceededError):
      # Time-out because w fails to be initialized,
      # because of overly restrictive ready_for_local_init_op
      sm.wait_for_session("", max_wait_secs=3)
def testAssertVariablesInitialized(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    v = variables.Variable([1, 2], name="v")
    w = variables.Variable([3, 4], name="w")
    _ = v, w
    uninited = variables.report_uninitialized_variables()
    self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
    variables.global_variables_initializer().run()
    self.assertEqual(0, self.evaluate(uninited).size)
def testWaitForSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
  with ops.Graph().as_default() as graph:
    v = variables.VariableV1(1, name="v")
    w = variables.VariableV1(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(),
        local_init_op=w.initializer)

    with self.assertRaises(errors_impl.DeadlineExceededError):
      # Time-out because w fails to be initialized,
      # because of overly restrictive ready_for_local_init_op
      sm.wait_for_session("", max_wait_secs=3)
def testWaitForSessionReturnsNoneAfterTimeout(self):
  with ops.Graph().as_default():
    variables.Variable(1, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        recovery_wait_secs=1)

    # Set max_wait_secs to allow us to try a few times.
    with self.assertRaises(errors.DeadlineExceededError):
      sm.wait_for_session(master="", max_wait_secs=3)
def testInitWithNoneLocalInitOpError(self):
  # Creating a SessionManager with a None local_init_op but
  # non-None ready_for_local_init_op raises ValueError
  with self.assertRaisesRegex(
      ValueError, "If you pass a ready_for_local_init_op "
      "you must also pass a local_init_op "):
    session_manager.SessionManager(
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=None)
def testInitWithNoneLocalInitOpError(self):
  # Creating a SessionManager with a None local_init_op but
  # non-None ready_for_local_init_op raises ValueError
  with self.assertRaisesRegex(
      ValueError, "If you pass a ready_for_local_init_op "
      "you must also pass a local_init_op "):
    session_manager.SessionManager(
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=None)
def testVariableList(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    v = variables.VariableV1([1, 2], name="v")
    w = variables.VariableV1([3, 4], name="w")
    uninited = variables.report_uninitialized_variables()
    self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
    self.evaluate(w.initializer)
    self.assertAllEqual(np.array([b"v"]), self.evaluate(uninited))
    v.initializer.run()
    self.assertEqual(0, self.evaluate(uninited).size)
def testPrepareSessionSucceedsWithLocalInitFeedDict(self):
  with ops.Graph().as_default():
    p = array_ops.placeholder(dtypes.float32, shape=(3,))
    v = variables.VariableV1(
        p, name="v", collections=[ops.GraphKeys.LOCAL_VARIABLES])
    sm = session_manager.SessionManager(
        local_init_op=v.initializer,
        local_init_feed_dict={p: [1.0, 2.0, 3.0]},
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session("")
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
def testPrepareSessionWithReadyNotReadyForLocal(self):
  with ops.Graph().as_default():
    v = variables.VariableV1(1, name="v")
    w = variables.VariableV1(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=w.initializer)
    with self.assertRaisesRegex(
        RuntimeError,
        "Init operations did not make model ready for local_init"):
      sm2.prepare_session("", init_op=None)
def testPrepareSessionSucceedsWithInitFeedDict(self):
  with ops.Graph().as_default():
    p = array_ops.placeholder(dtypes.float32, shape=(3,))
    v = variables.Variable(p, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session(
        "",
        init_op=variables.global_variables_initializer(),
        init_feed_dict={p: [1.0, 2.0, 3.0]})
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
def testPrepareSessionWithReadyNotReadyForLocal(self):
  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=w.initializer)
    with self.assertRaisesRegex(
        RuntimeError,
        "Init operations did not make model ready for local_init"):
      sm2.prepare_session("", init_op=None)
def testPrepareSessionSucceedsWithInitFeedDict(self):
  with ops.Graph().as_default():
    p = array_ops.placeholder(dtypes.float32, shape=(3,))
    v = variables.Variable(p, name="v")
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session(
        "",
        init_op=variables.global_variables_initializer(),
        init_feed_dict={p: [1.0, 2.0, 3.0]})
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
def testPrepareSessionWithReadyForLocalInitOp(self):
  with ops.Graph().as_default():
    v = variables.VariableV1(1, name="v")
    w = variables.VariableV1(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    x = variables.VariableV1(
        3 * v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="x")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
      self.assertEqual(False, variables.is_variable_initialized(x).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=variables.report_uninitialized_variables(
            variables.global_variables()),
        local_init_op=[w.initializer, x.initializer])
    sess = sm2.prepare_session("", init_op=v.initializer)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("x:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
    self.assertEqual(1, sess.run(w))
    self.assertEqual(3, sess.run(x))
def _init_ready_op(self, ready_op=USE_DEFAULT):
  """Initializes ready_op.

  Args:
    ready_op: `Tensor` to check if the model is initialized.
      If it's set to USE_DEFAULT, creates an op that checks all
      the variables are initialized.
  """
  if ready_op is Supervisor.USE_DEFAULT:
    ready_op = self._get_first_op_from_collection(ops.GraphKeys.READY_OP)
    if ready_op is None:
      ready_op = variables.report_uninitialized_variables()
      ops.add_to_collection(ops.GraphKeys.READY_OP, ready_op)
  self._ready_op = ready_op
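# Sketch (illustrative, not from the original file; it assumes the
# array_ops and resources imports used by default_ready_op further below):
# the READY_OP collection works the same way, so a custom readiness check,
# e.g. one that also covers resources, can be registered before the
# Supervisor is constructed and _init_ready_op above will find it.
with ops.Graph().as_default() as g:
  v = variables.VariableV1(1.0, name="v")
  ops.add_to_collection(
      ops.GraphKeys.READY_OP,
      array_ops.concat([
          variables.report_uninitialized_variables(),
          resources.report_uninitialized_resources()
      ], 0))
  sv = supervisor.Supervisor(graph=g, init_op=v.initializer)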
def _between_graph_worker_fn(self, strategy):
  context = distribute_coordinator_context.get_current_worker_context()
  self.assertTrue(context is not None)
  with self._test_session(target=context.master_target) as sess:
    with ops.device("/job:ps/task:0"):
      # TODO(yuefengz): investigate why not using resource variable will make
      # the test flaky.
      x = variable_scope.get_variable(
          "x", initializer=10.0, use_resource=True)
    with ops.device("/job:ps/task:1"):
      y = variable_scope.get_variable(
          "y", initializer=20.0, use_resource=True)

    x_add = x.assign_add(2.0)
    y_sub = y.assign_sub(2.0)
    train_op = control_flow_ops.group([x_add, y_sub])

    if context.is_chief:
      variables.global_variables_initializer().run()

    # Synchronize workers after initialization.
    if context.has_barrier:
      context.wait_for_other_workers()
    else:
      while True:
        uninit_vars = sess.run(variables.report_uninitialized_variables())
        # pylint: disable=g-explicit-length-test
        if len(uninit_vars) == 0:
          break

    sess.run(train_op)

    # Synchronize workers after one step to make sure they all have finished
    # training.
    if context.has_barrier:
      context.wait_for_other_workers()
    else:
      self._barrier.wait()

    x_val, y_val = sess.run([x, y])
    self.assertEqual(x_val, 16.0)
    self.assertEqual(y_val, 14.0)
    if x_val == 16.0 and y_val == 14.0:
      with self._lock:
        self._result_correct += 1
def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
  with ops.Graph().as_default() as graph:
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    with self.assertRaisesRegex(errors_impl.DeadlineExceededError,
                                "Session was not ready after waiting.*"):
      sm.wait_for_session("", max_wait_secs=3)
def test_evaluate_ready_for_local_init(self):
  with ops.Graph().as_default() as g, self.test_session(g):
    variables_lib.create_global_step()
    v = variables.Variable(1.0)
    variables.Variable(
        v + 1, collections=[ops.GraphKeys.LOCAL_VARIABLES], trainable=False)
    ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.global_variables())
    ops.add_to_collection(ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
                          ready_for_local_init_op)
    _ = learn.graph_actions.evaluate(
        g,
        output_dir=self._output_dir,
        checkpoint_path=None,
        eval_dict={'a': v},
        max_steps=1)
def testPrepareSessionDidNotInitLocalVariableList(self):
  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    with self.assertRaisesRegex(RuntimeError,
                                "Init operations did not make model ready"):
      sm2.prepare_session("", init_op=[v.initializer])
def testPrepareSessionDidNotInitLocalVariableList(self):
  with ops.Graph().as_default():
    v = variables.VariableV1(1, name="v")
    w = variables.VariableV1(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    with self.assertRaisesRegex(RuntimeError,
                                "Init operations did not make model ready"):
      sm2.prepare_session("", init_op=[v.initializer])
def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
  with ops.Graph().as_default() as graph:
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    sm = session_manager.SessionManager(
        graph=graph,
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    with self.assertRaisesRegex(errors_impl.DeadlineExceededError,
                                "Session was not ready after waiting.*"):
      sm.wait_for_session("", max_wait_secs=3)
def test_evaluate_ready_for_local_init(self):
  with ops.Graph().as_default() as g, self.test_session(g):
    variables_lib.create_global_step()
    v = variables.Variable(1.0)
    w = variables.Variable(
        v + 1, collections=[ops.GraphKeys.LOCAL_VARIABLES], trainable=False)
    ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.global_variables())
    ops.add_to_collection(ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
                          ready_for_local_init_op)
    _ = learn.graph_actions.evaluate(
        g,
        output_dir=self._output_dir,
        checkpoint_path=None,
        eval_dict={'a': v},
        max_steps=1)
def testPrepareSessionWithCyclicInitializer(self):
  # Regression test. Previously Variable._build_initializer_expr would enter
  # into an infinite recursion when the variable's initial_value involved
  # cyclic dependencies.
  with ops.Graph().as_default():
    i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
    v = variables.VariableV1(array_ops.identity(i), name="v")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session("", init_op=v.initializer)
    self.assertEqual(1, sess.run(v))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
def _between_graph_worker_fn(self, strategy):
  context = distribute_coordinator_context.get_current_worker_context()
  self.assertTrue(context is not None)
  with self._test_session(target=context.master_target) as sess:
    with ops.device("/job:ps/task:0"):
      # TODO(yuefengz): investigate why not using resource variable will make
      # the test flaky.
      x = variable_scope.get_variable(
          "x", initializer=10.0, use_resource=True)
    with ops.device("/job:ps/task:1"):
      y = variable_scope.get_variable(
          "y", initializer=20.0, use_resource=True)

    x_add = x.assign_add(2.0)
    y_sub = y.assign_sub(2.0)
    train_op = control_flow_ops.group([x_add, y_sub])

    if context.is_chief:
      self.evaluate(variables.global_variables_initializer())

    # Synchronize workers after initialization.
    if context.has_barrier:
      context.wait_for_other_workers()
    else:
      while True:
        uninit_vars = sess.run(variables.report_uninitialized_variables())
        # pylint: disable=g-explicit-length-test
        if len(uninit_vars) == 0:
          break

    sess.run(train_op)

    # Synchronize workers after one step to make sure they all have finished
    # training.
    if context.has_barrier:
      context.wait_for_other_workers()
    else:
      self._barrier.wait()

    x_val, y_val = sess.run([x, y])
    self.assertEqual(x_val, 16.0)
    self.assertEqual(y_val, 14.0)
    if x_val == 16.0 and y_val == 14.0:
      with self._lock:
        self._result_correct += 1
def testPrepareSessionWithCyclicInitializer(self):
  # Regression test. Previously Variable._build_initializer_expr would enter
  # into an infinite recursion when the variable's initial_value involved
  # cyclic dependencies.
  with ops.Graph().as_default():
    i = control_flow_ops.while_loop(lambda i: i < 1, lambda i: i + 1, [0])
    v = variables.Variable(array_ops.identity(i), name="v")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
    sm = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    sess = sm.prepare_session("", init_op=v.initializer)
    self.assertEqual(1, sess.run(v))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
  with ops.Graph().as_default():
    v = variables.Variable(1, name="v")
    w = variables.Variable(
        v,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.test_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    with self.assertRaisesRegex(errors_impl.FailedPreconditionError,
                                "Attempting to use uninitialized value v"):
      sm2.prepare_session("", init_op=None)
def testRecoverSessionFailsStillRunsLocalInitOp(self):
  # Create a checkpoint.
  checkpoint_dir = os.path.join(
      self.get_temp_dir(),
      "recover_session_ready_for_local_init_fails_stil_run")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  # Create a new Graph and SessionManager and recover.
  with ops.Graph().as_default():
    v = variables.VariableV1(2, name="v")
    w = variables.VariableV1(
        1,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm2.recover_session(
        "",
        saver=saver,
        checkpoint_dir=checkpoint_dir,
        wait_for_checkpoint=False)
    self.assertFalse(initialized)
    self.assertEqual(
        False,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(w))
def testRecoverSessionFailsStillRunsLocalInitOp(self):
  # Create a checkpoint.
  checkpoint_dir = os.path.join(
      self.get_temp_dir(),
      "recover_session_ready_for_local_init_fails_stil_run")
  try:
    gfile.DeleteRecursively(checkpoint_dir)
  except errors.OpError:
    pass  # Ignore
  gfile.MakeDirs(checkpoint_dir)

  # Create a new Graph and SessionManager and recover.
  with ops.Graph().as_default():
    v = variables.Variable(2, name="v")
    w = variables.Variable(
        1,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        name="w")
    with self.cached_session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
      self.assertEqual(False, variables.is_variable_initialized(w).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables(),
        ready_for_local_init_op=None,
        local_init_op=w.initializer)
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm2.recover_session(
        "",
        saver=saver,
        checkpoint_dir=checkpoint_dir,
        wait_for_checkpoint=False)
    self.assertFalse(initialized)
    self.assertEqual(
        False,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
    self.assertEqual(1, sess.run(w))
def _test_recovered_variable(self,
                             checkpoint_dir=None,
                             checkpoint_filename_with_path=None):
  # Create a new Graph and SessionManager and recover from a checkpoint.
  with ops.Graph().as_default():
    v = variables.Variable(2, name="v")
    with session_lib.Session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm2.recover_session(
        "",
        saver=saver,
        checkpoint_dir=checkpoint_dir,
        checkpoint_filename_with_path=checkpoint_filename_with_path)
    self.assertTrue(initialized)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
def _test_recovered_variable(self,
                             checkpoint_dir=None,
                             checkpoint_filename_with_path=None):
  # Create a new Graph and SessionManager and recover from a checkpoint.
  with ops.Graph().as_default():
    v = variables.VariableV1(2, name="v")
    with session_lib.Session():
      self.assertEqual(False, variables.is_variable_initialized(v).eval())
    sm2 = session_manager.SessionManager(
        ready_op=variables.report_uninitialized_variables())
    saver = saver_lib.Saver({"v": v})
    sess, initialized = sm2.recover_session(
        "",
        saver=saver,
        checkpoint_dir=checkpoint_dir,
        checkpoint_filename_with_path=checkpoint_filename_with_path)
    self.assertTrue(initialized)
    self.assertEqual(
        True,
        variables.is_variable_initialized(
            sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
    self.assertEqual(1, sess.run(v))
def get_session(is_chief):
  g = ops.Graph()
  with g.as_default():
    with ops.device("/job:local"):
      v = variables.VariableV1(
          1.0, name="ready_for_local_init_op_restore_v_" + str(uid))
      vadd = v.assign_add(1)
      w = variables.VariableV1(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="ready_for_local_init_op_restore_w_" + str(uid))
      ready_for_local_init_op = variables.report_uninitialized_variables(
          variables.global_variables())
  sv = supervisor.Supervisor(
      logdir=logdir,
      is_chief=is_chief,
      graph=g,
      recovery_wait_secs=1,
      ready_for_local_init_op=ready_for_local_init_op)
  sess = sv.prepare_or_wait_for_session(server.target)

  return sv, sess, v, vadd, w
def testZeroSizeVarInitialized(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    v = variables.Variable(array_ops.zeros([0, 2]), name="v")
    uninited = variables.report_uninitialized_variables()
    v.initializer.run()  # not strictly necessary
    self.assertEqual(0, self.evaluate(uninited).size)
def _apply_model_average(self, lvars_and_gvars, name=None):
  """Apply local weights to global variables.

  This contains most of the synchronization implementation.

  Args:
    lvars_and_gvars: List of (local_vars, global_vars) pairs.
    name: Optional name for the returned operation. Default to the name
      passed to the Optimizer constructor.

  Returns:
    train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

  Raises:
    ValueError: If `lvars_and_gvars` is empty.
  """
  if not lvars_and_gvars:
    raise ValueError("Must supply at least one variable")

  train_ops = []
  aggregated_lvars = []
  model_reassign_ops = []

  global_vars = [g for v, g in lvars_and_gvars if v is not None]

  # local_anchor op will be placed on this worker task by default.
  local_anchor = control_flow_ops.no_op()
  # Colocating local_step variable prevents it being placed on the PS.
  with ops.colocate_with(local_anchor):
    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=self._global_step.dtype.base_dtype,
        name="%s_local_step" % self._name)

  self.local_step_init_op = state_ops.assign(self._local_step,
                                             self._global_step)
  chief_init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.global_variables())

  with ops.name_scope(None, self._name):
    for lvar, gvar in lvars_and_gvars:
      # Check for None before converting; converting first would raise on a
      # missing local variable and make the IndexedSlices branch below
      # unreachable.
      if lvar is None:
        aggregated_lvars.append(None)  # pass-through.
        continue
      lvar = ops.convert_to_tensor_or_indexed_slices(lvar)
      with ops.device(gvar.device):
        # Dense variables.
        if isinstance(lvar, ops.Tensor):
          lvar_accum = data_flow_ops.ConditionalAccumulator(
              lvar.dtype,
              shape=gvar.get_shape(),
              shared_name=gvar.name + "/lvar_accum")
          train_ops.append(
              lvar_accum.apply_grad(lvar, local_step=self._local_step))
          aggregated_lvars.append(
              lvar_accum.take_grad(self._replicas_to_aggregate))
        else:
          if not isinstance(lvar, ops.IndexedSlices):
            raise ValueError("Unknown model variable type!")
          lvar_accum = data_flow_ops.SparseConditionalAccumulator(
              lvar.dtype,
              shape=(),
              shared_name=gvar.name + "/model_variable_accum")
          train_ops.append(
              lvar_accum.apply_indexed_slices_grad(
                  lvar, local_step=self._local_step))
          aggregated_lvars.append(
              lvar_accum.take_indexed_slices_grad(
                  self._replicas_to_aggregate))

        self._accumulator_list.append((lvar_accum, gvar.device))

    # sync_op will be assigned to the same device as the global step.
    with ops.device(self._global_step.device), ops.name_scope(""):
      for avg_var, gvar in zip(aggregated_lvars, global_vars):
        model_reassign_ops.append(state_ops.assign(gvar, avg_var))
      model_reassign_ops.append(state_ops.assign_add(self._global_step, 1))
      update_op = control_flow_ops.group(*(model_reassign_ops))

    # Create token queue.
    with ops.device(self._global_step.device), ops.name_scope(""):
      sync_token_queue = (data_flow_ops.FIFOQueue(
          -1,
          self._global_step.dtype.base_dtype,
          shapes=(),
          name="sync_token_q",
          shared_name="sync_token_q"))
      self._sync_token_queue = sync_token_queue

      # dummy_queue is passed to the queue runner. Don't use the real queues
      # because the queue runner doesn't automatically reopen it once it
      # closed queues in PS devices.
      dummy_queue = (data_flow_ops.FIFOQueue(
          1,
          types_pb2.DT_INT32,
          shapes=(),
          name="dummy_queue",
          shared_name="dummy_queue"))

    with ops.device(self._global_step.device), ops.name_scope(""):
      # Replicas have to wait until they can get a token from the token
      # queue.
      with ops.control_dependencies(train_ops):
        token = sync_token_queue.dequeue()
      train_op = state_ops.assign(self._local_step, token)

      with ops.control_dependencies([update_op]):
        # Sync_op needs to insert tokens to the token queue at the end of the
        # step so the replicas can fetch them to start the next step.
        tokens = array_ops.fill([self._tokens_per_step], self._global_step)
        sync_op = sync_token_queue.enqueue_many((tokens,))

      self._chief_queue_runner = queue_runner.QueueRunner(
          dummy_queue, [sync_op])

    for accum, dev in self._accumulator_list:
      with ops.device(dev):
        chief_init_ops.append(
            accum.set_global_step(self._global_step, name="SetGlobalStep"))
    self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
    self._average_applied = True
    return train_op
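# Standalone sketch of the token-queue handshake implemented above (nothing
# here comes from the original file, and the dtypes/constant_op imports are
# assumed): the chief enqueues one token per replica after the aggregated
# update, and each replica's step blocks on dequeue() until a token arrives.
with ops.Graph().as_default():
  q = data_flow_ops.FIFOQueue(-1, dtypes.int64, shapes=(), name="tokens")
  step = constant_op.constant(7, dtypes.int64)
  release = q.enqueue_many((array_ops.fill([2], step),))  # chief side
  fetch = q.dequeue()                                     # replica side
  with session_lib.Session() as sess:
    sess.run(release)            # chief: release tokens for 2 replicas
    token = sess.run(fetch)      # replica: receives the global step (7)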
def testNoVars(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    uninited = variables.report_uninitialized_variables()
    self.assertEqual(0, self.evaluate(uninited).size)
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a
      dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`.
      By default, two `Boolean`, scalar ops called "should_stop" and
      "should_log" are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as
      `None`, then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during
      training. If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling
      `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no
      summaries should be written. If unset, we create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no
      trace information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')
    if trace_every_n_steps is not None:
      raise ValueError('Cannot provide trace_every_n_steps because '
                       'logdir=None')

  if sync_optimizer is not None and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    with ops.name_scope('init_ops'):
      if init_op == _USE_DEFAULT:
        init_op = tf_variables.global_variables_initializer()

      if ready_op == _USE_DEFAULT:
        ready_op = tf_variables.report_uninitialized_variables()

      if local_init_op == _USE_DEFAULT:
        local_init_op = control_flow_ops.group(
            tf_variables.local_variables_initializer(),
            data_flow_ops.tables_initializer())

      if sync_optimizer is not None and isinstance(
          sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
        with ops.control_dependencies(
            [local_init_op] if local_init_op is not None else []):
          if is_chief:
            local_init_op = sync_optimizer.chief_init_op
          else:
            local_init_op = sync_optimizer.local_step_init_op
        ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
      else:
        ready_for_local_init_op = None

    if summary_op == _USE_DEFAULT:
      summary_op = summary.merge_all()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if is_chief and sync_optimizer is not None:
      if not isinstance(sync_optimizer,
                        (sync_replicas_optimizer.SyncReplicasOptimizer)):
        raise ValueError(
            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')
      # Need to create these BEFORE the supervisor finalizes the graph:
      init_tokens_op = sync_optimizer.get_init_tokens_op()
      chief_queue_runner = sync_optimizer.get_chief_queue_runner()

    if train_step_kwargs == _USE_DEFAULT:
      with ops.name_scope('train_step'):
        train_step_kwargs = {}

        if number_of_steps:
          should_stop_op = math_ops.greater_equal(global_step,
                                                  number_of_steps)
        else:
          should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, log_every_n_steps), 0)
        if is_chief and trace_every_n_steps is not None:
          train_step_kwargs['should_trace'] = math_ops.equal(
              math_ops.mod(global_step, trace_every_n_steps), 0)
          train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(
        graph=graph,
        is_chief=is_chief,
        logdir=logdir,
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        local_init_op=local_init_op,
        ready_for_local_init_op=ready_for_local_init_op,
        ready_op=ready_op,
        summary_op=summary_op,
        summary_writer=summary_writer,
        global_step=global_step,
        saver=saver,
        save_summaries_secs=save_summaries_secs,
        save_model_secs=save_interval_secs,
        init_fn=init_fn)

    if summary_writer is not None:
      train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
      try:
        should_retry = False
        with sv.managed_session(
            master, start_standard_services=False,
            config=session_config) as sess:
          logging.info('Starting Session.')
          if is_chief:
            if logdir:
              sv.start_standard_services(sess)
          elif startup_delay_steps > 0:
            _wait_for_step(
                sess, global_step,
                min(startup_delay_steps, number_of_steps or sys.maxsize))
          sv.start_queue_runners(sess)
          logging.info('Starting Queues.')
          if is_chief and sync_optimizer is not None:
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(init_tokens_op)
          try:
            while not sv.should_stop():
              total_loss, should_stop = train_step_fn(sess, train_op,
                                                      global_step,
                                                      train_step_kwargs)
              if should_stop:
                logging.info('Stopping Training.')
                break
          except errors.OutOfRangeError:
            # OutOfRangeError is thrown when epoch limit per
            # tf.train.limit_epochs is reached.
            logging.info('Caught OutOfRangeError. Stopping Training.')
          if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

      except errors.AbortedError:
        # Always re-run on AbortedError as it indicates a restart of one of
        # the distributed tensorflow servers.
        logging.info('Retrying training!')
        should_retry = True

    return total_loss
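# Hypothetical call sketch for train() above; model_loss and my_logdir are
# placeholders, and the gradient_descent import is assumed. With a
# SyncReplicasOptimizer supplied, train() swaps in the optimizer's
# chief/local init ops and its ready_for_local_init_op, as its body shows.
opt = sync_replicas_optimizer.SyncReplicasOptimizer(
    gradient_descent.GradientDescentOptimizer(0.1),
    replicas_to_aggregate=2)
train_op = opt.minimize(
    model_loss, global_step=variables.get_or_create_global_step())
final_loss = train(
    train_op,
    logdir=my_logdir,
    is_chief=True,
    number_of_steps=100,
    sync_optimizer=opt)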
def default_ready_for_local_init_op():
  return variables.report_uninitialized_variables(
      variables.global_variables())
def default_ready_op():
  return array_ops.concat([
      variables.report_uninitialized_variables(),
      resources.report_uninitialized_resources()
  ], 0)
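# Small sketch of the contract behind both defaults above: "ready" is
# signaled by an empty 1-D string tensor, and any entries are the names of
# still-uninitialized variables or resources. (Illustrative only; it assumes
# the session_lib import used elsewhere in this file.)
with ops.Graph().as_default():
  v = variables.VariableV1(1, name="v")
  ready = default_ready_op()
  with session_lib.Session() as sess:
    assert sess.run(ready).size > 0   # contains b"v": not ready yet
    sess.run(v.initializer)
    assert sess.run(ready).size == 0  # empty: ready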
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  """Apply gradients to variables.

  This contains most of the synchronization implementation and also wraps
  the apply_gradients() from the real optimizer.

  Args:
    grads_and_vars: List of (gradient, variable) pairs as returned by
      compute_gradients().
    global_step: Optional Variable to increment by one after the variables
      have been updated.
    name: Optional name for the returned operation. Default to the name
      passed to the Optimizer constructor.

  Returns:
    train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

  Raises:
    ValueError: If the grads_and_vars is empty.
    ValueError: If global step is not provided, the staleness cannot be
      checked.
  """
  if not grads_and_vars:
    raise ValueError("Must supply at least one variable")

  if global_step is None:
    raise ValueError("Global step is required to check staleness")

  self._global_step = global_step
  train_ops = []
  aggregated_grad = []
  var_list = []

  # local_anchor op will be placed on this worker task by default.
  local_anchor = control_flow_ops.no_op()
  # Colocating local_step variable prevents it being placed on the PS.
  distribution_strategy = (
      distribution_strategy_context.get_distribution_strategy())
  with distribution_strategy.colocate_vars_with(local_anchor):
    self._local_step = variable_scope.variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")

  self.local_step_init_op = state_ops.assign(self._local_step, global_step)
  chief_init_ops = [self.local_step_init_op]
  self.ready_for_local_init_op = variables.report_uninitialized_variables(
      variables.global_variables())

  with ops.name_scope(None, self._name):
    for grad, var in grads_and_vars:
      var_list.append(var)
      with ops.device(var.device):
        # Dense gradients.
        if grad is None:
          aggregated_grad.append(None)  # pass-through.
          continue
        elif isinstance(grad, ops.Tensor):
          grad_accum = data_flow_ops.ConditionalAccumulator(
              grad.dtype,
              shape=var.get_shape(),
              shared_name=var.name + "/grad_accum")
          train_ops.append(
              grad_accum.apply_grad(grad, local_step=self._local_step))
          aggregated_grad.append(
              grad_accum.take_grad(self._replicas_to_aggregate))
        else:
          if not isinstance(grad, ops.IndexedSlices):
            raise ValueError("Unknown grad type!")
          grad_accum = data_flow_ops.SparseConditionalAccumulator(
              grad.dtype, shape=(), shared_name=var.name + "/grad_accum")
          train_ops.append(
              grad_accum.apply_indexed_slices_grad(
                  grad, local_step=self._local_step))
          aggregated_grad.append(
              grad_accum.take_indexed_slices_grad(
                  self._replicas_to_aggregate))

        self._accumulator_list.append((grad_accum, var.device))

    aggregated_grads_and_vars = zip(aggregated_grad, var_list)

    # sync_op will be assigned to the same device as the global step.
    with ops.device(global_step.device), ops.name_scope(""):
      update_op = self._opt.apply_gradients(aggregated_grads_and_vars,
                                            global_step)

    # Create token queue.
    with ops.device(global_step.device), ops.name_scope(""):
      sync_token_queue = (data_flow_ops.FIFOQueue(
          -1,
          global_step.dtype.base_dtype,
          shapes=(),
          name="sync_token_q",
          shared_name="sync_token_q"))
      self._sync_token_queue = sync_token_queue

      # dummy_queue is passed to the queue runner. Don't use the real queues
      # because the queue runner doesn't automatically reopen it once it
      # closed queues in PS devices.
      dummy_queue = (data_flow_ops.FIFOQueue(
          1,
          types_pb2.DT_INT32,
          shapes=(),
          name="dummy_queue",
          shared_name="dummy_queue"))

    with ops.device(global_step.device), ops.name_scope(""):
      # Replicas have to wait until they can get a token from the token
      # queue.
      with ops.control_dependencies(train_ops):
        token = sync_token_queue.dequeue()
      train_op = state_ops.assign(self._local_step, token)

      with ops.control_dependencies([update_op]):
        # Sync_op needs to insert tokens to the token queue at the end of
        # the step so the replicas can fetch them to start the next step.
        tokens = array_ops.fill([self._tokens_per_step], global_step)
        sync_op = sync_token_queue.enqueue_many((tokens,))

      if self._variable_averages is not None:
        with ops.control_dependencies([sync_op]), ops.name_scope(""):
          sync_op = self._variable_averages.apply(
              self._variables_to_average)

      self._chief_queue_runner = queue_runner.QueueRunner(
          dummy_queue, [sync_op])
    for accum, dev in self._accumulator_list:
      with ops.device(dev):
        chief_init_ops.append(
            accum.set_global_step(global_step, name="SetGlobalStep"))
    self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
    self._gradients_applied = True
    return train_op
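# Hedged end-to-end sketch for apply_gradients() above (loss, global_step,
# logdir and is_chief are placeholders, not from the original file): each
# replica runs the returned train_op, while the chief additionally starts
# the chief queue runner and seeds the token queue, mirroring what slim's
# train() does earlier in this section.
opt = sync_replicas_optimizer.SyncReplicasOptimizer(
    gradient_descent.GradientDescentOptimizer(0.1),
    replicas_to_aggregate=2)
train_op = opt.apply_gradients(
    opt.compute_gradients(loss), global_step=global_step)
sv = supervisor.Supervisor(
    logdir=logdir,
    is_chief=is_chief,
    local_init_op=opt.chief_init_op if is_chief else opt.local_step_init_op,
    ready_for_local_init_op=opt.ready_for_local_init_op)
with sv.managed_session("") as sess:
  if is_chief:
    sv.start_queue_runners(sess, [opt.get_chief_queue_runner()])
    sess.run(opt.get_init_tokens_op())
  sess.run(train_op)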