def testEvaluationLoopTimeout(self):
    """Checks that `evaluate_repeatedly` returns once its timeout elapses.

    The checkpoint directory is created empty and the eval op is a bare
    placeholder, so the call is expected to return a falsy value after
    roughly `timeout` seconds.
    """
    ckpt_dir = os.path.join(self.get_temp_dir(), 'evaluation_loop_timeout')
    if not gfile.Exists(ckpt_dir):
        gfile.MakeDirs(ckpt_dir)

    # Give the saver a variable that it will try to restore.
    variables.get_or_create_global_step()

    # A placeholder with no feed_dict: actually evaluating it would fail.
    unrunnable_op = array_ops.placeholder(dtype=dtypes.float32)

    started_at = time.time()
    final_values = evaluation.evaluate_repeatedly(
        checkpoint_dir=ckpt_dir,
        eval_ops=unrunnable_op,
        hooks=[evaluation.StopAfterNEvalsHook(10)],
        timeout=6)
    elapsed = time.time() - started_at

    self.assertFalse(final_values)

    # We must have waited for the timeout (minus the sleep time) ...
    self.assertGreater(elapsed, 5.0)
    # ... and the timeout must then have ended the loop.
    self.assertLess(elapsed, 7)
def test_step_counter_every_n_steps(self):
    """StepCounterHook with `every_n_steps=10` writes summaries at 11 and 21."""
    expected_steps = [11, 21]
    with ops.Graph().as_default() as graph, session_lib.Session() as sess:
        variables.get_or_create_global_step()
        increment_op = training_util._increment_global_step(1)
        writer = fake_summary_writer.FakeSummaryWriter(self.log_dir, graph)
        counter_hook = basic_session_run_hooks.StepCounterHook(
            summary_writer=writer, every_n_steps=10)
        counter_hook.begin()
        sess.run(variables_lib.global_variables_initializer())
        hooked_sess = monitored_session._HookedSession(sess, [counter_hook])

        with test.mock.patch.object(tf_logging, 'warning') as warning_mock:
            for _ in range(30):
                time.sleep(0.01)
                hooked_sess.run(increment_op)
            # The global step advances every run, so no warning is expected.
            self.assertIsNone(warning_mock.call_args)

        counter_hook.end(sess)
        writer.assert_summaries(
            test_case=self,
            expected_logdir=self.log_dir,
            expected_graph=graph,
            expected_summaries={})
        self.assertItemsEqual(expected_steps, writer.summaries.keys())
        for step in expected_steps:
            value = writer.summaries[step][0].value[0]
            self.assertEqual('global_step/sec', value.tag)
            self.assertGreater(value.simple_value, 0)
def test_two_listeners_with_default_saver(self):
    """Two listeners on one CheckpointSaverHook see identical callbacks."""
    expected_counts = {
        'begin': 1,
        'before_save': 2,
        'after_save': 2,
        'end': 1
    }

    with ops.Graph().as_default():
        step_var = variables.get_or_create_global_step()
        increment_op = state_ops.assign_add(step_var, 1)
        first_listener = MockCheckpointSaverListener()
        second_listener = MockCheckpointSaverListener()
        saver_hook = basic_session_run_hooks.CheckpointSaverHook(
            self.model_dir,
            save_steps=1,
            listeners=[first_listener, second_listener])
        with monitored_session.SingularMonitoredSession(
                hooks=[saver_hook],
                checkpoint_dir=self.model_dir) as sess:
            sess.run(increment_op)
            sess.run(increment_op)
            step_after_training = sess.run(step_var)
        # Read the counters only after the session has closed.
        first_counts = first_listener.get_counts()
        second_counts = second_listener.get_counts()

    self.assertEqual(2, step_after_training)
    self.assertEqual(expected_counts, first_counts)
    self.assertEqual(first_counts, second_counts)

    # A fresh graph and session should restore the saved global step.
    with ops.Graph().as_default():
        step_var = variables.get_or_create_global_step()
        with monitored_session.SingularMonitoredSession(
                checkpoint_dir=self.model_dir) as restored_sess:
            restored_step = restored_sess.run(step_var)
    self.assertEqual(2, restored_step)
def test_step_counter_every_n_secs(self):
    """StepCounterHook with `every_n_secs=0.1` writes summaries at 2 and 3."""
    with ops.Graph().as_default() as graph, session_lib.Session() as sess:
        variables.get_or_create_global_step()
        increment_op = training_util._increment_global_step(1)
        writer = fake_summary_writer.FakeSummaryWriter(self.log_dir, graph)
        counter_hook = basic_session_run_hooks.StepCounterHook(
            summary_writer=writer, every_n_steps=None, every_n_secs=0.1)

        counter_hook.begin()
        sess.run(variables_lib.global_variables_initializer())
        hooked_sess = monitored_session._HookedSession(sess, [counter_hook])

        # Sleep longer than `every_n_secs` between consecutive runs.
        hooked_sess.run(increment_op)
        time.sleep(0.2)
        hooked_sess.run(increment_op)
        time.sleep(0.2)
        hooked_sess.run(increment_op)

        counter_hook.end(sess)
        writer.assert_summaries(
            test_case=self,
            expected_logdir=self.log_dir,
            expected_graph=graph,
            expected_summaries={})
        self.assertTrue(writer.summaries, 'No summaries were created.')
        self.assertItemsEqual([2, 3], writer.summaries.keys())
        for recorded in writer.summaries.values():
            value = recorded[0].value[0]
            self.assertEqual('global_step/sec', value.tag)
            self.assertGreater(value.simple_value, 0)
def testEvaluateWithEvalFeedDict(self):
    """Checks that `feed_dict` is applied on every evaluation step.

    A local variable is incremented by a fed placeholder value `num_evals`
    times; the final value must equal `increment_value * num_evals`.
    """
    # Create a checkpoint.
    checkpoint_dir = os.path.join(self.get_temp_dir(),
                                  'evaluate_with_eval_feed_dict')
    self._train_model(checkpoint_dir, num_steps=1)

    # We need a variable that the saver will try to restore.
    variables.get_or_create_global_step()

    # Create a variable and an eval op that increments it with a placeholder.
    my_var = variables.local_variable(0.0, name='my_var')
    increment = array_ops.placeholder(dtype=dtypes.float32)
    eval_ops = state_ops.assign_add(my_var, increment)

    increment_value = 3
    num_evals = 5
    expected_value = increment_value * num_evals
    final_values = evaluation.evaluate_repeatedly(
        checkpoint_dir=checkpoint_dir,
        eval_ops=eval_ops,
        # Feed the same constant that `expected_value` is derived from, so
        # the two cannot silently drift apart.
        feed_dict={increment: increment_value},
        final_ops={'my_var': array_ops.identity(my_var)},
        hooks=[evaluation.StopAfterNEvalsHook(num_evals),],
        max_number_of_evaluations=1)
    self.assertEqual(final_values['my_var'], expected_value)
def test_not_wait_for_step_zero(self):
    """GlobalStepWaiterHook with `wait_until_step=0` must not block."""
    with ops.Graph().as_default():
        variables.get_or_create_global_step()
        waiter_hook = basic_session_run_hooks.GlobalStepWaiterHook(
            wait_until_step=0)
        waiter_hook.begin()
        with session_lib.Session() as sess:
            run_context = session_run_hook.SessionRunContext(
                original_args=None, session=sess)
            # Should return without waiting for the global step to advance.
            waiter_hook.before_run(run_context)
def setUp(self):
    """Builds a fake writer, two scalar summaries and a step-increment op."""
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    # Each evaluation of `incremented` bumps the counter by one;
    # `doubled` is twice that value.
    counter = variables_lib.Variable(0.0)
    incremented = state_ops.assign_add(counter, 1.0)
    doubled = incremented * 2
    self.summary_op = summary_lib.scalar('my_summary', incremented)
    self.summary_op2 = summary_lib.scalar('my_summary2', doubled)

    variables.get_or_create_global_step()
    self.train_op = training_util._increment_global_step(1)
def test_recover_and_retry_on_aborted_error(self):
    """MonitoredSession silently recovers and retries after an AbortedError.

    A CheckpointSaverHook saves after every step so there is a checkpoint
    to recover from.
    """
    logdir = _test_dir(self.get_temp_dir(),
                       'test_recover_and_retry_on_aborted_error')
    with ops.Graph().as_default():
        gstep = variables_lib.get_or_create_global_step()
        do_step = state_ops.assign_add(gstep, 1)
        scaffold = monitored_session.Scaffold()
        abort_error = errors_impl.AbortedError(None, None, 'Abort')
        abort_hook = RaiseOnceAtCountN(4, abort_error)
        # Checkpoint after every single step.
        ckpt_hook = basic_session_run_hooks.CheckpointSaverHook(
            logdir, save_steps=1, scaffold=scaffold)
        creator = monitored_session.ChiefSessionCreator(
            scaffold, checkpoint_dir=logdir)
        with monitored_session.MonitoredSession(
                session_creator=creator,
                hooks=[abort_hook, ckpt_hook]) as session:
            self.assertEqual(0, session.run(gstep))
            self.assertEqual(1, session.run(do_step))
            self.assertEqual(2, session.run(do_step))
            self.assertFalse(session.should_stop())
            # At step 3 the hook fires and raises AbortedError; the
            # MonitoredSession restores and retries automatically.
            self.assertEqual(3, session.run(do_step))
            self.assertTrue(abort_hook.raised)
            self.assertFalse(session.should_stop())
            self.assertEqual(4, session.run(do_step))
            self.assertFalse(session.should_stop())
def test_recovery(self):
    """A restarted MonitoredSession resumes from the saved global step.

    Recovery is exercised twice: once by pointing the session creator at
    the checkpoint directory, and once by passing the latest checkpoint
    path explicitly.
    """
    logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
    with ops.Graph().as_default():
        gstep = variables_lib.get_or_create_global_step()
        do_step = state_ops.assign_add(gstep, 1)
        scaffold = monitored_session.Scaffold()
        # The hook is configured with save_steps=1.
        saver_hook = basic_session_run_hooks.CheckpointSaverHook(
            logdir, save_steps=1, scaffold=scaffold)

        with monitored_session.MonitoredSession(
                session_creator=monitored_session.ChiefSessionCreator(
                    scaffold, checkpoint_dir=logdir),
                hooks=[saver_hook]) as session:
            self.assertEqual(0, session.run(gstep))
            self.assertEqual(1, session.run(do_step))
            self.assertEqual(2, session.run(do_step))

        # Restart using the checkpoint directory.
        with monitored_session.MonitoredSession(
                session_creator=monitored_session.ChiefSessionCreator(
                    scaffold, checkpoint_dir=logdir)) as session:
            self.assertEqual(2, session.run(gstep))

        # Restart using an explicit checkpoint file path.
        with monitored_session.MonitoredSession(
                session_creator=monitored_session.ChiefSessionCreator(
                    scaffold,
                    checkpoint_filename_with_path=saver_lib.latest_checkpoint(
                        logdir))) as session:
            self.assertEqual(2, session.run(gstep))
def test_num_steps(self):
    """StopAtStepHook(num_steps=N) counts from the restored global step."""
    logdir = _test_dir(self.get_temp_dir(), 'test_num_steps')
    with ops.Graph().as_default():
        gstep = variables_lib.get_or_create_global_step()
        do_step = state_ops.assign_add(gstep, 1)

        # Phase 1: run three steps, then save a checkpoint by hand.
        first_hooks = [basic_session_run_hooks.StopAtStepHook(num_steps=3)]
        scaffold = monitored_session.Scaffold().finalize()
        with monitored_session.MonitoredSession(hooks=first_hooks) as session:
            session.run(do_step)
            self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertTrue(session.should_stop())
            raw_sess = session._coordinated_creator.tf_sess
            save_path = scaffold.saver.save(raw_sess,
                                            os.path.join(logdir, 'step-3'))

        # Phase 2: restore through an init_fn and run four more steps.
        def load_ckpt(scaffold, sess):
            scaffold.saver.restore(sess, save_path)

        session_creator = monitored_session.ChiefSessionCreator(
            scaffold=monitored_session.Scaffold(init_fn=load_ckpt))
        second_hooks = [basic_session_run_hooks.StopAtStepHook(num_steps=4)]
        with monitored_session.MonitoredSession(
                hooks=second_hooks,
                session_creator=session_creator) as session:
            self.assertEqual(4, session.run(do_step))
            self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertFalse(session.should_stop())
            session.run(do_step)
            self.assertTrue(session.should_stop())
def __init__(
    self,
    global_step_tensor=None,
    init_op=None,
    init_feed_dict=None,
    init_fn=None,
    ready_op=None,
    local_init_op=None,
    summary_op=None,
    saver=None,
    keep_checkpoint_max=5,
):
    """Create a scaffold.

    Every argument left as `None` is filled with a default, either taken
    from the matching graph collection or freshly created.

    Args:
      global_step_tensor: Optional tensor to use as the global step counter.
      init_op: Optional op for initializing variables.
      init_feed_dict: Optional session feed dictionary to use when running
        the init_op.
      init_fn: Optional function to use to initialize the model after
        running the init_op.  Will be called as `init_fn(scaffold, session)`.
      ready_op: Optional op to verify that the variables are initialized.
        Must return an empty scalar string tensor when the variables are
        initialized, or a non-empty one listing the names of the
        non-initialized variables.
      local_init_op: Optional op to initialize local variables.
      summary_op: Optional op to gather all summaries.  Must return a scalar
        string tensor containing a serialized `Summary` proto.
      saver: Optional `tf.Saver` object to use to save and restore
        variables.
      keep_checkpoint_max: Optional parameter to use to construct a saver if
        none is already there in the graph.
    """
    if global_step_tensor is None:
        global_step_tensor = contrib_variables.get_or_create_global_step()
    self.global_step_tensor = global_step_tensor

    if init_op is None:
        init_op = Scaffold._get_or_default(ops.GraphKeys.INIT_OP,
                                           variables.initialize_all_variables)
    self.init_op = init_op
    self.init_feed_dict = init_feed_dict

    # NOTE(touts): modifying the init function to be passed the scaffold is
    # a hack to make it easy to find the saver.  Is there a better way?
    if init_fn:
        self.init_fn = lambda sess: init_fn(self, sess)
    else:
        self.init_fn = None

    if ready_op is None:
        ready_op = Scaffold._get_or_default(
            ops.GraphKeys.READY_OP,
            variables.report_uninitialized_variables)
    self.ready_op = ready_op

    if local_init_op is None:
        local_init_op = Scaffold._get_or_default(
            ops.GraphKeys.LOCAL_INIT_OP,
            Scaffold._default_local_init_op)
    self.local_init_op = local_init_op

    if summary_op is None:
        summary_op = Scaffold._get_or_default(ops.GraphKeys.SUMMARY_OP,
                                              logging_ops.merge_all_summaries)
    # Store the resolved op like every other argument; previously the value
    # was computed and then dropped, leaving no `summary_op` attribute.
    self.summary_op = summary_op

    # pylint: disable=g-long-lambda
    if saver is None:
        saver = Scaffold._get_or_default(
            ops.GraphKeys.SAVERS,
            lambda: training_saver.Saver(sharded=True,
                                         max_to_keep=keep_checkpoint_max))
    # pylint: enable=g-long-lambda
    self.saver = saver
def _get_train_ops(self, features, targets):
    """See base class.

    Any optimizer other than `SDCAOptimizer` is delegated to the parent
    implementation; the SDCA path builds the linear logits, loss and train
    step here.
    """
    uses_sdca = isinstance(self._linear_optimizer,
                           sdca_optimizer.SDCAOptimizer)
    if not uses_sdca:
        return super(LinearRegressor, self)._get_train_ops(features, targets)
    assert not self._joint_weights, ("_joint_weights is incompatible with"
                                     " SDCAOptimizer.")

    step_tensor = contrib_variables.get_or_create_global_step()

    weighted_sum = layers.weighted_sum_from_feature_columns(
        columns_to_tensors=features,
        feature_columns=self._linear_feature_columns,
        num_outputs=self._target_column.num_label_columns,
        weight_collections=[self._linear_model.get_scope_name()],
        scope=self._linear_model.get_scope_name())
    logits, column_vars, bias = weighted_sum

    # Compute the loss only after the centered-bias op has run.
    with ops.control_dependencies([self._centered_bias()]):
        loss = self._target_column.loss(logits, targets, features)
    logging_ops.scalar_summary("loss", loss)

    _add_bias_column(self._linear_feature_columns, features, bias, targets,
                     column_vars)

    train_op = self._linear_optimizer.get_train_step(
        column_vars,
        self._target_column.weight_column_name,
        self._loss_type(),
        features,
        targets,
        step_tensor)
    return train_op, loss
def _get_train_ops(self, features, targets):
    """See base class.

    Any optimizer other than `SDCAOptimizer` is delegated to the parent
    implementation; the SDCA path builds the logits here and lets the head
    produce the loss and training op.
    """
    uses_sdca = isinstance(self._linear_optimizer,
                           sdca_optimizer.SDCAOptimizer)
    if not uses_sdca:
        return super(LinearRegressor, self)._get_train_ops(features, targets)
    assert not self._joint_weights, ("_joint_weights is incompatible with"
                                     " SDCAOptimizer.")

    step_tensor = contrib_variables.get_or_create_global_step()

    weighted_sum = layers.weighted_sum_from_feature_columns(
        columns_to_tensors=features,
        feature_columns=self._linear_feature_columns,
        num_outputs=self._head.logits_dimension,
        weight_collections=[self._linear_model.get_scope_name()],
        scope=self._linear_model.get_scope_name())
    logits, column_vars, bias = weighted_sum

    _add_bias_column(self._linear_feature_columns, features, bias, targets,
                     column_vars)

    def _train_op_fn(unused_loss):
        # The loss passed in by the head is ignored here.
        sdca_model, sdca_step = self._linear_optimizer.get_train_step(
            column_vars,
            self._weight_column_name,
            self._loss_type(),
            features,
            targets,
            step_tensor)
        return sdca_model.update_weights(sdca_step)

    head_result = self._head.head_ops(features, targets,
                                      estimator.ModeKeys.TRAIN,
                                      _train_op_fn, logits=logits)
    return head_result.training_op, head_result.loss
def testReturnsSingleCheckpointIfOneShardedCheckpoint(self):
    """`checkpoints_iterator` yields one item for one sharded checkpoint."""
    ckpt_dir = os.path.join(self.get_temp_dir(),
                            'one_checkpoint_found_sharded')
    if not gfile.Exists(ckpt_dir):
        gfile.MakeDirs(ckpt_dir)

    step_var = variables.get_or_create_global_step()

    # Place variables on two CPU devices so the sharded saver writes the
    # checkpoint as several shard files.
    with ops.device('/cpu:0'):
        variables_lib.Variable(10, name='v0')
    with ops.device('/cpu:1'):
        variables_lib.Variable(20, name='v1')

    sharded_saver = saver_lib.Saver(sharded=True)
    two_cpu_config = config_pb2.ConfigProto(device_count={'CPU': 2})

    with session_lib.Session(target='', config=two_cpu_config) as session:
        session.run(variables_lib.global_variables_initializer())
        save_path = os.path.join(ckpt_dir, 'model.ckpt')
        sharded_saver.save(session, save_path, global_step=step_var)

    found = 0
    for _ in evaluation.checkpoints_iterator(ckpt_dir, timeout=0):
        found += 1
    self.assertEqual(found, 1)
def setUp(self):
    """Creates a temp model dir and a graph with a step-increment op."""
    self.model_dir = tempfile.mkdtemp()
    self.graph = ops.Graph()
    with self.graph.as_default():
        self.scaffold = monitored_session.Scaffold()
        self.global_step = variables.get_or_create_global_step()
        # One run of train_op adds 1 to the global step.
        self.train_op = state_ops.assign_add(self.global_step, 1)
def test_stop_based_on_num_step(self):
    """StopAtStepHook(num_steps=10) stops ten steps after the initial step.

    The global step is 5 when the session is created, so a stop is expected
    once the step reaches 15.
    """
    stop_hook = basic_session_run_hooks.StopAtStepHook(num_steps=10)

    with ops.Graph().as_default():
        step_var = variables.get_or_create_global_step()
        noop = control_flow_ops.no_op()
        stop_hook.begin()
        with session_lib.Session() as sess:
            hooked_sess = monitored_session._HookedSession(sess, [stop_hook])
            sess.run(state_ops.assign(step_var, 5))
            stop_hook.after_create_session(sess, None)
            hooked_sess.run(noop)
            self.assertFalse(hooked_sess.should_stop())

            # Still below the stopping step.
            for below_limit in (13, 14):
                sess.run(state_ops.assign(step_var, below_limit))
                hooked_sess.run(noop)
                self.assertFalse(hooked_sess.should_stop())

            sess.run(state_ops.assign(step_var, 15))
            hooked_sess.run(noop)
            self.assertTrue(hooked_sess.should_stop())

            # Even after the flag is reset, going past the limit stops again.
            sess.run(state_ops.assign(step_var, 16))
            hooked_sess._should_stop = False
            hooked_sess.run(noop)
            self.assertTrue(hooked_sess.should_stop())
def setUp(self):
    """Creates a temp model dir and a graph with a step-increment op."""
    self.model_dir = tempfile.mkdtemp()
    self.graph = ops.Graph()
    with self.graph.as_default():
        self.scaffold = monitored_session.Scaffold()
        self.global_step = variables.get_or_create_global_step()
        # train_op is built by the training_util helper, with increment 1.
        self.train_op = training_util._increment_global_step(1)
def test_listener_with_monitored_session(self):
    """A saver listener is called once per save and once at begin/end."""
    expected_counts = {
        'begin': 1,
        'before_save': 2,
        'after_save': 2,
        'end': 1
    }
    with ops.Graph().as_default():
        scaffold = monitored_session.Scaffold()
        step_var = variables.get_or_create_global_step()
        increment_op = state_ops.assign_add(step_var, 1)
        listener = MockCheckpointSaverListener()
        saver_hook = basic_session_run_hooks.CheckpointSaverHook(
            self.model_dir,
            save_steps=1,
            scaffold=scaffold,
            listeners=[listener])
        with monitored_session.SingularMonitoredSession(
                hooks=[saver_hook],
                scaffold=scaffold,
                checkpoint_dir=self.model_dir) as sess:
            sess.run(increment_op)
            sess.run(increment_op)
            final_step = sess.run(step_var)
        # Read the counters only after the session has closed.
        observed_counts = listener.get_counts()

    self.assertEqual(2, final_step)
    self.assertEqual(expected_counts, observed_counts)
def testNoneGlobalStep(self):
    """With `global_step=None` the train op leaves the global step at 0."""
    with ops.Graph().as_default():
        random_seed.set_random_seed(0)
        inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
        labels = constant_op.constant(self._labels, dtype=dtypes.float32)

        predictions = batchnorm_classifier(inputs)
        loss_ops.log_loss(predictions, labels)
        total_loss = loss_ops.get_total_loss()

        sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
        train_op = training.create_train_op(
            total_loss, sgd, global_step=None)

        step_var = variables_lib.get_or_create_global_step()

        with session_lib.Session() as sess:
            # Initialize all variables.
            sess.run(variables_lib2.global_variables_initializer())

            for _ in range(10):
                sess.run([train_op])

            step_value = step_var.eval()
            # train_op does not use the global step, so it must not change.
            self.assertAllClose(step_value, 0)
def finalize(self):
    """Creates operations if needed and finalizes the graph."""
    if self._global_step_tensor is None:
        self._global_step_tensor = (
            contrib_variables.get_or_create_global_step())

    # pylint: disable=g-long-lambda
    # (attribute, name, collection key, factory) for every op that falls
    # back to a default when it was not supplied.
    defaults = (
        ('_init_op', 'init_op', ops.GraphKeys.INIT_OP,
         variables.initialize_all_variables),
        ('_ready_op', 'ready_op', ops.GraphKeys.READY_OP,
         variables.report_uninitialized_variables),
        ('_local_init_op', 'local_init_op', ops.GraphKeys.LOCAL_INIT_OP,
         Scaffold._default_local_init_op),
        ('_summary_op', 'summary_op', ops.GraphKeys.SUMMARY_OP,
         logging_ops.merge_all_summaries),
        ('_saver', 'saver', ops.GraphKeys.SAVERS,
         lambda: training_saver.Saver(
             sharded=True, max_to_keep=self._keep_checkpoint_max)),
    )
    # pylint: enable=g-long-lambda
    for attr, name, collection_key, factory in defaults:
        if getattr(self, attr) is None:
            setattr(self, attr,
                    Scaffold._get_or_default(name, collection_key, factory))

    ops.get_default_graph().finalize()
def __init__(self,
             log_dir=None,
             summary_writer=None,
             summary_op=None,
             feed_dict=None):
    """Constructs the Summary Hook.

    Args:
      log_dir: The directory where the summary events are saved to.  Used
        only when `summary_writer` is not specified.
      summary_writer: A `tf.summary.FileWriter` to write summary events
        with.
      summary_op: The summary op to run.  If left as `None`, then all
        summaries in the tf.GraphKeys.SUMMARIES collection are used.
      feed_dict: An optional feed dictionary to use when evaluating the
        summaries.

    Raises:
      ValueError: If both `log_dir` and `summary_writer` are `None`.
    """
    self._summary_op = summary_op
    self._feed_dict = feed_dict
    self._log_dir = log_dir
    # Assigned once; the original stored `summary_writer` twice.
    self._summary_writer = summary_writer
    if self._log_dir is None and self._summary_writer is None:
        raise ValueError('One of log_dir or summary_writer should be used.')
    self._global_step = variables.get_or_create_global_step()
def setUp(self):
    """Creates an output dir, a timeline file pattern and a train op."""
    super(ProfilerHookTest, self).setUp()
    self.output_dir = tempfile.mkdtemp()
    self.filepattern = os.path.join(self.output_dir, 'timeline-*.json')
    self.graph = ops.Graph()
    with self.graph.as_default():
        self.global_step = variables.get_or_create_global_step()
        # One run of train_op adds 1 to the global step.
        self.train_op = state_ops.assign_add(self.global_step, 1)
def test_log_warning_if_global_step_not_increased(self):
    """StepCounterHook warns when the global step stops advancing."""
    with ops.Graph().as_default(), session_lib.Session() as sess:
        variables.get_or_create_global_step()
        # An increment of 0 keeps the global step unchanged.
        stalled_op = training_util._increment_global_step(0)
        sess.run(variables_lib.global_variables_initializer())

        counter_hook = basic_session_run_hooks.StepCounterHook(
            every_n_steps=1, every_n_secs=None)
        counter_hook.begin()
        hooked_sess = monitored_session._HookedSession(sess, [counter_hook])
        # Run one step first so the hook records the global step.
        hooked_sess.run(stalled_op)

        with test.mock.patch.object(tf_logging, 'warning') as warning_mock:
            for _ in range(30):
                hooked_sess.run(stalled_op)
            self.assertRegexpMatches(
                str(warning_mock.call_args),
                'global step.*has not been increased')
        counter_hook.end(sess)
def test_invalid_graph(self):
    """`begin()` raises KeyError when the graph lacks the needed tensors."""
    # Create inputs.
    summary_dir = tempfile.mkdtemp()
    importance_hook = trainer_hooks.FeatureImportanceSummarySaver(summary_dir)
    with ops.Graph().as_default():
        # Only the global step exists, so begin() cannot find what it needs.
        variables.get_or_create_global_step()
        with self.assertRaises(KeyError):
            importance_hook.begin()
def setUp(self):
    """Generates test data and builds the model under evaluation."""
    super(EvaluationTest, self).setUp()

    n_classes, n_examples = 8, 16
    raw_inputs, raw_labels = GenerateTestData(n_classes, n_examples)
    self._expected_accuracy = GroundTruthAccuracy(raw_inputs, raw_labels,
                                                  n_examples)

    self._global_step = variables_lib.get_or_create_global_step()
    self._inputs = constant_op.constant(raw_inputs, dtype=dtypes.float32)
    self._labels = constant_op.constant(raw_labels, dtype=dtypes.int64)
    self._predictions, self._scale = TestModel(self._inputs)
def test_stop_cleanly_when_no_exception_in_with_body(self):
    """Leaving the `with` body normally closes the MonitoredSession."""
    with ops.Graph().as_default():
        step_var = variables_lib.get_or_create_global_step()
        increment_op = state_ops.assign_add(step_var, 1)
        session = monitored_session.MonitoredSession()
        with session:
            self.assertEqual(1, session.run(increment_op))
            self.assertEqual(2, session.run(increment_op))
            self.assertFalse(session.should_stop())
        # After the with block the session should have closed.
        self.assertTrue(session.should_stop())
        self.assertTrue(session._is_closed())
def setUp(self):
    """Builds a summary op and train op backed by resource variables."""
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    counter = variable_scope.get_variable(
        'var', initializer=0.0, use_resource=True)
    incremented = state_ops.assign_add(counter, 1.0)
    self.summary_op = summary_lib.scalar('my_summary', incremented)

    # The global step is created inside a scope with use_resource=True.
    with variable_scope.variable_scope('foo', use_resource=True):
        step_var = variables.get_or_create_global_step()
    self.train_op = state_ops.assign_add(step_var, 1)
def test(self):
    """Evaluate the loaded source and target networks on the test split.

    Logs the source and target losses for each batch.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    logging.info("Checking Source-Target Network")

    # Create the global step for monitoring the learning_rate and training.
    global_step = get_or_create_global_step()

    # One saver per network, each restricted to that network's variables.
    src_vars = collect_vars('source')
    tgt_vars = collect_vars('target')
    self.source_saver = tf.train.Saver(max_to_keep=None,
                                       var_list=src_vars.values())
    self.target_saver = tf.train.Saver(max_to_keep=None,
                                       var_list=tgt_vars.values())

    def restore_fn(sess):
        self.source_saver.restore(
            sess, F.source_checkpoint_dir + F.source_checkpoint_file)
        self.target_saver.restore(
            sess, F.target_checkpoint_dir + F.target_checkpoint_file)
        return

    self.test_handle_op = self.test_iter.string_handle()

    # Supervisor for the managed session; checkpoints are restored only
    # when F.load_chkpt is set.
    init_fn = restore_fn if F.load_chkpt else None
    sv = tf.train.Supervisor(logdir=F.log_eval_dir,
                             summary_op=None,
                             init_fn=init_fn,
                             saver=None)

    current_best_loss = 1000.  # TODO: Read it from a file for multiple restarts
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=F.gpu_frac)
    session_config = tf.ConfigProto(gpu_options=gpu_options)

    with sv.managed_session(config=session_config) as sess:
        self.test_handle = sess.run(self.test_handle_op)
        eval_loss = []
        # NOTE(review): no explicit exit condition is visible here; the loop
        # presumably ends when the session raises - confirm.
        while True:
            loss_source, loss_target = sess.run(
                [self.source_loss, self.target_loss],
                feed_dict={self.dataloader.split_handle: self.test_handle})
            logging.info("Batch-Loss Source: {}, Target: {}".format(
                loss_source, loss_target))
def setUp(self):
    """Builds a fake writer, two scalar summaries and a step-increment op."""
    test.TestCase.setUp(self)

    self.log_dir = 'log/dir'
    self.summary_writer = fake_summary_writer.FakeSummaryWriter(self.log_dir)

    # Each evaluation of `incremented` bumps the counter by one;
    # `doubled` is twice that value.
    counter = variables_lib.Variable(0.0)
    incremented = state_ops.assign_add(counter, 1.0)
    doubled = incremented * 2
    self.summary_op = summary_lib.scalar('my_summary', incremented)
    self.summary_op2 = summary_lib.scalar('my_summary2', doubled)

    step_var = variables.get_or_create_global_step()
    self.train_op = state_ops.assign_add(step_var, 1)
def __init__(self, log_dir, summary_op=None, feed_dict=None):
    """Constructs the Summary Hook.

    Args:
      log_dir: Directory passed to the `SummaryWriter` that receives the
        logs.
      summary_op: The summary op to run.  If left as `None`, then all
        summaries in the tf.GraphKeys.SUMMARIES collection are used.
      feed_dict: An optional feed dictionary to use when evaluating the
        summaries.
    """
    self._summary_writer = summary_io.SummaryWriter(log_dir)
    self._global_step = variables.get_or_create_global_step()
    self._summary_op = summary_op
    self._feed_dict = feed_dict
def test_summaries(self):
    """MonitoredTrainingSession writes user and step-rate summaries."""
    logdir = _test_dir(self.get_temp_dir(), 'test_summaries')
    with ops.Graph().as_default():
        step_var = variables_lib.get_or_create_global_step()
        next_step = state_ops.assign_add(step_var, 1)
        summary.scalar('my_summary_tag', next_step * 2)
        with monitored_session.MonitoredTrainingSession(
                is_chief=True, checkpoint_dir=logdir) as session:
            # 100 is the default summary writing interval.
            for _ in range(101):
                session.run(next_step)

    written = util_test.latest_summaries(logdir)
    tags = [event.summary.value[0].tag for event in written]
    self.assertIn('my_summary_tag', tags)
    self.assertIn('global_step/sec', tags)
def test_saving_restoring_checkpoint(self):
    """A second MonitoredTrainingSession resumes from the saved step."""
    logdir = _test_dir(self.get_temp_dir(),
                       'test_saving_restoring_checkpoint')
    with ops.Graph().as_default():
        step_var = variables_lib.get_or_create_global_step()
        increment_op = state_ops.assign_add(step_var, 1)

        with monitored_session.MonitoredTrainingSession(
                is_chief=True, checkpoint_dir=logdir) as session:
            self.assertEqual(0, session.run(step_var))
            self.assertEqual(1, session.run(increment_op))
            self.assertEqual(2, session.run(increment_op))

        # A restart finds the checkpoint and recovers automatically.
        with monitored_session.MonitoredTrainingSession(
                is_chief=True, checkpoint_dir=logdir) as session:
            self.assertEqual(2, session.run(step_var))
def train():
    """Builds the SSD training graph and runs the training loop.

    Batches are taken from the module-level queue `q`; every tenth global
    step the step time is printed and summaries are written.
    """
    batch = config.batch_size
    side = config.Config['min_dim']

    img = tf.placeholder(shape=[batch, side, side, 3], dtype=tf.float32)
    anchors_num = sum(
        config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s]
        for s in range(6))
    loc = tf.placeholder(shape=[batch, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[batch, anchors_num], dtype=tf.float32)

    pred_loc, pred_confs, vbs = inceptionv3_500_ince.inception_v2_ssd(
        img, config)
    train_tensors = get_loss(conf, loc, pred_loc, pred_confs, config)

    global_step = get_or_create_global_step()

    # Exponentially decaying learning rate.
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=20000,
        decay_rate=0.7,
        staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    gen = data_gen.get_batch_inception(
        batch_size=batch, image_size=side, max_detect=50)

    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)

    # Saver over `vbs`, used to restore the checkpoint below.
    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, '/home/dsl/all_check/inception_v3.ckpt')

    sv = tf.train.Supervisor(
        logdir='/home/dsl/all_check/face_detect/voc-v32',
        summary_op=None,
        init_fn=restore)

    with sv.managed_session() as sess:
        for _ in range(1000000000):
            images, true_box, true_label = q.get()
            loct, conft = np_utils.get_loc_conf(
                true_box, true_label, batch_size=batch, cfg=config.Config)
            feed_dict = {img: images, loc: loct, conf: conft}

            t = time.time()
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)

            if step % 10 == 0:
                print(time.time()-t)
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
                print(ls)
def test_stop_based_with_multiple_steps(self):
    """StopAtStepHook stops when the step jumps straight to the limit."""
    stop_hook = basic_session_run_hooks.StopAtStepHook(num_steps=10)

    with ops.Graph().as_default():
        step_var = variables.get_or_create_global_step()
        noop = control_flow_ops.no_op()
        stop_hook.begin()
        with session_lib.Session() as sess:
            hooked_sess = monitored_session._HookedSession(sess, [stop_hook])
            sess.run(state_ops.assign(step_var, 5))
            stop_hook.after_create_session(sess, None)
            hooked_sess.run(noop)
            self.assertFalse(hooked_sess.should_stop())

            # Jump from 5 directly to 15 (= 5 + num_steps).
            sess.run(state_ops.assign(step_var, 15))
            hooked_sess.run(noop)
            self.assertTrue(hooked_sess.should_stop())
def testReturnsSingleCheckpointIfOneCheckpointFound(self):
    """`checkpoints_iterator` yields exactly one item for one checkpoint."""
    ckpt_dir = os.path.join(self.get_temp_dir(), 'one_checkpoint_found')
    if not gfile.Exists(ckpt_dir):
        gfile.MakeDirs(ckpt_dir)

    step_var = variables.get_or_create_global_step()
    # The saver saves the global step.
    step_saver = saver_lib.Saver()

    with self.test_session() as session:
        session.run(variables_lib.global_variables_initializer())
        save_path = os.path.join(ckpt_dir, 'model.ckpt')
        step_saver.save(session, save_path, global_step=step_var)

    found = 0
    for _ in evaluation.checkpoints_iterator(ckpt_dir, timeout=0):
        found += 1
    self.assertEqual(found, 1)
def __init__(self, corpus, **opts):
    """Builds the generator/discriminator graph for the given corpus.

    Args:
      corpus: Mapping read with the "train" and "valid" keys.
      **opts: Options forwarded unchanged to the data preparation and the
        generator/discriminator templates.
    """
    self.corpus = corpus
    self.opts = opts

    self.global_step = get_or_create_global_step()
    self.increment_global_step_op = tf.assign(
        self.global_step, self.global_step + 1,
        name="increment_global_step")

    self.corpus_size = get_corpus_size(self.corpus["train"])
    self.corpus_size_valid = get_corpus_size(self.corpus["valid"])

    self.word2idx, self.idx2word = build_vocab(self.corpus["train"])
    self.vocab_size = len(self.word2idx)

    self.generator_template = tf.make_template(GENERATOR_PREFIX, generator)
    self.discriminator_template = tf.make_template(DISCRIMINATOR_PREFIX,
                                                   discriminator)

    # Training input pipeline.
    train_data = prepare_data(self.corpus["train"], self.word2idx,
                              num_threads=7, **self.opts)
    self.enqueue_data, _, source, target, sequence_length = train_data

    # TODO: option to either do pretrain or just generate?
    self.g_tensors_pretrain = self.generator_template(
        source, target, sequence_length, self.vocab_size, **self.opts)

    # Validation input pipeline.
    valid_data = prepare_data(self.corpus["valid"], self.word2idx,
                              num_threads=1, **self.opts)
    (self.enqueue_data_valid, self.input_ph, source_valid, target_valid,
     sequence_length_valid) = valid_data

    self.g_tensors_pretrain_valid = self.generator_template(
        source_valid, target_valid, sequence_length_valid, self.vocab_size,
        **self.opts)

    self.decoder_fn = prepare_custom_decoder(sequence_length)

    self.g_tensors_fake = self.generator_template(
        source, target, sequence_length, self.vocab_size,
        decoder_fn=self.decoder_fn, **self.opts)

    # TODO: using the rnn outputs from pretraining as "real" instead of
    # target embeddings (aka professor forcing)
    self.d_tensors_real = self.discriminator_template(
        self.g_tensors_pretrain.rnn_outputs, sequence_length,
        is_real=True, **self.opts)

    # TODO: check to see if sequence_length is correct
    self.d_tensors_fake = self.discriminator_template(
        self.g_tensors_fake.rnn_outputs, None, is_real=False, **self.opts)

    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    self.g_tvars = tf.get_collection(trainable_key, scope=GENERATOR_PREFIX)
    self.d_tvars = tf.get_collection(trainable_key,
                                     scope=DISCRIMINATOR_PREFIX)
def test_run(self):
    """The hook writes one summary directory per feature name."""
    # Create inputs.
    summary_dir = tempfile.mkdtemp()
    importance_hook = trainer_hooks.FeatureImportanceSummarySaver(summary_dir)
    with ops.Graph().as_default(), tf_session.Session() as sess:
        step_var = variables.get_or_create_global_step()
        with ops.name_scope("gbdt"):
            constant_op.constant(["featA", "featB"], name="feature_names")
            constant_op.constant([0, 2], name="feature_usage_counts")
            constant_op.constant([0, 0.8], name="feature_gains")

        # The tensors now exist, so begin() can find them in the graph.
        importance_hook.begin()
        sess.run(tf_variables.global_variables_initializer())

        # Drive the hook through a single hooked run.
        increment_op = state_ops.assign_add(step_var, 1)
        hooked_sess = monitored_session._HookedSession(sess,
                                                       [importance_hook])
        hooked_sess.run(increment_op)
        importance_hook.end(sess)

        # Ensure output summary dirs are created.
        for feature in ("featA", "featB"):
            self.assertTrue(
                os.path.exists(os.path.join(summary_dir, feature)))
def test_wait_for_step(self):
    """GlobalStepWaiterHook.before_run blocks until the step reaches 1000.

    `before_run` is executed on a daemon thread; the thread must stay alive
    while the global step is 0 and 500, and finish after it is set to 1100.
    """
    with ops.Graph().as_default():
        step_var = variables.get_or_create_global_step()
        waiter_hook = basic_session_run_hooks.GlobalStepWaiterHook(
            wait_until_step=1000)
        waiter_hook.begin()

        with session_lib.Session() as sess:
            sess.run(variables_lib.global_variables_initializer())

            run_context = session_run_hook.SessionRunContext(
                original_args=None, session=sess)
            blocked_thread = threading.Thread(
                target=waiter_hook.before_run, args=(run_context,))
            # Daemon thread so a stuck waiter cannot keep the process alive.
            blocked_thread.daemon = True
            blocked_thread.start()

            # Step is still 0: the hook keeps waiting.
            time.sleep(1.0)
            self.assertTrue(blocked_thread.is_alive())

            # Step 500 is below the threshold: still waiting.
            sess.run(state_ops.assign(step_var, 500))
            time.sleep(1.0)
            self.assertTrue(blocked_thread.is_alive())

            # Step 1100 is past the threshold: the hook returns.
            sess.run(state_ops.assign(step_var, 1100))
            time.sleep(1.2)
            self.assertFalse(blocked_thread.is_alive())
def testGlobalStepNotIncrementedWhenSetToNone(self):
    """A train op created with global_step=None leaves the global step at 0."""
    num_updates = 10
    with ops.Graph().as_default():
        random_seed.set_random_seed(0)
        inputs_t = constant_op.constant(self._inputs, dtype=dtypes.float32)
        labels_t = constant_op.constant(self._labels, dtype=dtypes.float32)

        predictions_t = batchnorm_classifier(inputs_t)
        loss = losses.log_loss(labels_t, predictions_t)
        sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
        train_op = training.create_train_op(loss, sgd, global_step=None)

        # Created after the train op, which was told not to use a global step.
        step_var = variables_lib.get_or_create_global_step()

        with self.test_session() as session:
            # Initialize all variables.
            init_op = variables_lib2.global_variables_initializer()
            session.run(init_op)

            for _ in range(num_updates):
                session.run(train_op)

            # Since train_op does not use global_step, it should not change.
            self.assertAllClose(step_var.eval(), 0)
def get_train_op(args, total_loss, learning_rate):
    """Build the op that runs one training step.

    Args:
        args: Options object; `args.moving_average_decay` is read here and
            the whole object is forwarded to `get_optimizer`.
        total_loss: Loss tensor to minimise.
        learning_rate: Learning rate forwarded to `get_optimizer`.

    Returns:
        A no-op named 'train' that depends on both the gradient update and
        the moving-average update of the trainable variables.
    """
    step = variables.get_or_create_global_step()

    # Loss summaries / averages; gradients are computed only after these run.
    loss_summary_op = _add_loss_summaries(total_loss)

    # Compute gradients.
    with tf.control_dependencies([loss_summary_op]):
        optimizer = get_optimizer(args, learning_rate)
        # NOTE(review): gradients are requested for all *global* variables,
        # not just the trainable ones - confirm this is intended.
        grads_and_vars = optimizer.compute_gradients(
            total_loss, tf.global_variables())

    # Apply gradients (this also advances the global step).
    apply_op = optimizer.apply_gradients(grads_and_vars, global_step=step)

    # Track the moving averages of all trainable variables.
    ema = tf.train.ExponentialMovingAverage(args.moving_average_decay, step)
    ema_op = ema.apply(tf.trainable_variables())

    with tf.control_dependencies([apply_op, ema_op]):
        return tf.no_op(name='train')
def test_step_counter_every_n_steps(self):
    """StepCounterHook(every_n_steps=10) writes summaries at steps 11 and 21.

    Runs 30 increments of the global step through a hooked session and checks
    that each written summary is tagged 'global_step/sec' with a positive
    value.
    """
    expected_steps = [11, 21]
    with ops.Graph().as_default() as g, session_lib.Session() as sess:
        step_var = variables.get_or_create_global_step()
        increment_op = state_ops.assign_add(step_var, 1)
        writer = fake_summary_writer.FakeSummaryWriter(self.log_dir, g)
        counter_hook = basic_session_run_hooks.StepCounterHook(
            summary_writer=writer, every_n_steps=10)
        counter_hook.begin()
        sess.run(variables_lib.global_variables_initializer())
        hooked_sess = monitored_session._HookedSession(sess, [counter_hook])

        for _ in range(30):
            # Small pause so that a non-zero time elapses between steps.
            time.sleep(0.01)
            hooked_sess.run(increment_op)
        counter_hook.end(sess)

        writer.assert_summaries(
            test_case=self,
            expected_logdir=self.log_dir,
            expected_graph=g,
            expected_summaries={})
        self.assertItemsEqual(expected_steps, writer.summaries.keys())
        for step in expected_steps:
            first_value = writer.summaries[step][0].value[0]
            self.assertEqual('global_step/sec', first_value.tag)
            self.assertGreater(first_value.simple_value, 0)
def testUseGlobalStep(self):
    """A default train op advances the global step once per run."""
    num_updates = 10
    with ops.Graph().as_default():
        random_seed.set_random_seed(0)
        inputs_t = constant_op.constant(self._inputs, dtype=dtypes.float32)
        labels_t = constant_op.constant(self._labels, dtype=dtypes.float32)

        predictions_t = batchnorm_classifier(inputs_t)
        # The loss registers itself; the total is collected right after.
        loss_ops.log_loss(predictions_t, labels_t)
        total_loss = loss_ops.get_total_loss()

        sgd = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
        train_op = training.create_train_op(total_loss, sgd)
        step_var = variables_lib.get_or_create_global_step()

        with session_lib.Session() as sess:
            # Initialize all variables.
            sess.run(variables_lib2.global_variables_initializer())

            for _ in range(num_updates):
                sess.run([train_op])

            step_value = step_var.eval()
            # After 10 updates global_step should be 10.
            self.assertAllClose(step_value, 10)
def run():
    """Run ENet over `image_files` and save each predicted annotation as an image.

    All settings (`image_files`, `batch_size`, `checkpoint_file`, `logdir`,
    `photo_dir`, `save_images`, ...) are read from module-level names.
    """
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        # =================== TEST BRANCH =======================
        # Load the files into one input queue (order preserved).
        file_names = tf.convert_to_tensor(image_files)
        input_queue = tf.train.slice_input_producer([file_names], shuffle=False)

        # Decode the raw image content.
        raw_image = tf.read_file(input_queue[0])
        decoded_image = tf.image.decode_image(raw_image, channels=3)
        preprocessed_image = preprocess(
            decoded_image, None, image_height, image_width)
        images = tf.train.batch(
            [preprocessed_image],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        # Create the model inference.
        with slim.arg_scope(ENet_arg_scope()):
            logits, probabilities = ENet(
                images,
                num_classes,
                batch_size=batch_size,
                is_training=True,
                reuse=None,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        # Set up the variables to restore and the restoring function. The
        # saver is built before the global step exists, so the global step is
        # not among the restored variables.
        exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        # Predictions are class indices (not one-hot encoded).
        predictions = tf.argmax(probabilities, -1)

        # Create the global step and an increment op for monitoring. There is
        # no apply_gradients call here, so the step is increased manually.
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(global_step, global_step + 1)

        # Supervisor for a managed session. The summary_op is not run
        # automatically, or else it would consume too much memory.
        sv = tf.train.Supervisor(
            logdir=logdir, summary_op=None, init_fn=restore_fn)

        # Run the managed session.
        with sv.managed_session() as sess:
            # Save the images.
            # NOTE(review): the source formatting was lost; the prediction
            # loop is assumed to sit under `if save_images:` - confirm.
            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                for step in range(int(num_steps_per_epoch)):
                    started = time.time()
                    fetched = sess.run([predictions])
                    finished = time.time()
                    batch_predictions = fetched[0]
                    print('totally cost (second)', finished - started)

                    for i in range(batch_predictions.shape[0]):
                        predicted_annotation = batch_predictions[i]
                        plt.imshow(predicted_annotation)
                        # The first 15 characters of the file name are
                        # dropped - presumably a directory prefix; verify
                        # against `image_files`.
                        source_name = str(image_files[step * num_epochs + i])
                        plt.savefig(photo_dir + "/image_" + source_name[15:])
def __init__(self,
             global_step_tensor=None,
             init_op=None,
             init_feed_dict=None,
             init_fn=None,
             ready_op=None,
             local_init_op=None,
             summary_op=None,
             saver=None,
             keep_checkpoint_max=5):
    """Create a scaffold.

    Every argument left as `None` (except `init_feed_dict` and `init_fn`) is
    resolved to a default and the resolved value is stored on the instance
    under the same name.

    Args:
      global_step_tensor: Optional tensor to use as the global step counter.
      init_op: Optional op for initializing variables.
      init_feed_dict: Optional session feed dictionary to use when running the
        init_op.
      init_fn: Optional function to use to initialize the model after running
        the init_op.  Will be called as `init_fn(scaffold, session)`.
      ready_op: Optional op to verify that the variables are initialized.  Must
        return an empty scalar string tensor when the variables are
        initialized, or a non-empty one listing the names of the
        non-initialized variables.
      local_init_op: Optional op to initialize local variables.
      summary_op: Optional op to gather all summaries.  Must return a scalar
        string tensor containing a serialized `Summary` proto.
      saver: Optional `tf.Saver` object to use to save and restore variables.
      keep_checkpoint_max: Optional parameter to use to construct a saver if
        none is already there in the graph.
    """
    if global_step_tensor is None:
        global_step_tensor = contrib_variables.get_or_create_global_step()
    self.global_step_tensor = global_step_tensor

    if init_op is None:
        init_op = Scaffold._get_or_default(
            ops.GraphKeys.INIT_OP, variables.initialize_all_variables)
    self.init_op = init_op
    self.init_feed_dict = init_feed_dict

    # NOTE(touts): modifying the init function to be passed the scaffold is a
    # hack to make it easy to find the saver.  Is there a better way?
    if init_fn:
        self.init_fn = lambda sess: init_fn(self, sess)
    else:
        self.init_fn = None

    if ready_op is None:
        ready_op = Scaffold._get_or_default(
            ops.GraphKeys.READY_OP, variables.report_uninitialized_variables)
    self.ready_op = ready_op

    if local_init_op is None:
        local_init_op = Scaffold._get_or_default(
            ops.GraphKeys.LOCAL_INIT_OP, Scaffold._default_local_init_op)
    self.local_init_op = local_init_op

    if summary_op is None:
        summary_op = Scaffold._get_or_default(
            ops.GraphKeys.SUMMARY_OP, logging_ops.merge_all_summaries)
    # Store the resolved op like every other argument; previously it was
    # computed and then dropped, so the scaffold never exposed `summary_op`.
    self.summary_op = summary_op

    # pylint: disable=g-long-lambda
    if saver is None:
        saver = Scaffold._get_or_default(
            ops.GraphKeys.SAVERS,
            lambda: training_saver.Saver(sharded=True,
                                         max_to_keep=keep_checkpoint_max))
    # pylint: enable=g-long-lambda
    self.saver = saver
def run():
    """Preprocess the data, then fine-tune Inception-ResNet-v2 under a Supervisor.

    Settings such as `log_dir`, `batch_size`, `num_epochs`,
    `initial_learning_rate` and `checkpoint_file` are read from module-level
    names. Command-line options `--data_path` and `--save_path` select the
    preprocessing input and output directories.
    """
    # Create the log directory here. Doing it at module level would create
    # the directory as a side effect of merely importing this file.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    check = 0  # Flag checked below: 0 means the parsed data must be generated.

    # ======================= TRAINING PROCESS =========================
    # Now we start to construct the graph and build our model.
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)  # Set the verbosity to INFO level

        # NOTE(review): the original dataset loading below is disabled, yet
        # `dataset`, `images` and `labels` are still used further down and are
        # not defined in this function - confirm where they come from.
        # dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
        # images, _, labels = load_batch(dataset, batch_size=batch_size)

        parser = argparse.ArgumentParser()
        parser.add_argument("--data_path", default='../sample', help="data_path")
        parser.add_argument("--save_path", default='../preprocessed', help="data_path")
        args = parser.parse_args()

        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)

        if check == 0:
            preprocesser = json_to_mel()
            abs_data_path = os.path.abspath(args.data_path)
            abs_save_path = os.path.abspath(args.save_path)
            json_path = os.path.join(abs_data_path, '*.json')
            # TODO: change this to two output files.
            save_clean_path = os.path.join(abs_save_path, 'clean.pkl')
            # save_mel_path = os.path.join(abs_save_path, 'mel.pkl')
            clean_data_frame, clean_data_time = preprocesser.parser(
                preprocesser.read(json_path), save_clean_path)
            clean_data_mel, clean_data_label = preprocesser.split_by_emo(
                abs_data_path, abs_save_path, clean_data_frame, clean_data_time)

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with a trusted 'abc2.bin'.
        with open('abc2.bin', 'rb') as pickle_file:
            data = pickle.load(pickle_file)

        # Know the number of steps to take before decaying the learning rate
        # and the number of batches per epoch.
        num_batches_per_epoch = int(dataset.num_samples / batch_size)
        num_steps_per_epoch = num_batches_per_epoch  # One step is one batch processed
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        # Create the model inference.
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(
                images, num_classes=dataset.num_classes, is_training=True)

        # Define the scopes that you want to exclude for restoration.
        exclude = ['InceptionResnetV2/Logits', 'InceptionResnetV2/AuxLogits']
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)

        # Perform one-hot-encoding of the labels.
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # Softmax cross-entropy on the one-hot labels; the total loss also
        # picks up the regularization losses.
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels, logits=logits)
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # Define your exponentially decaying learning rate.
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Now we can define the optimizer that takes on the learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Accuracy.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Summaries.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        def train_step(sess, train_op, global_step):
            """Run train_op, global_step and metrics_op once and log the timing.

            Returns:
                A `(total_loss, global_step_count)` tuple from that run.
            """
            # Check the time for each sess run.
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to print some results.
            logging.info('global step %s: loss: %.4f (%.2f sec/step)',
                         global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        # Saver that restores the variables from a checkpoint file in a sess.
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        # Supervisor for running a managed session. The summary_op is not run
        # automatically, or else it would consume too much memory.
        sv = tf.train.Supervisor(
            logdir=log_dir, summary_op=None, init_fn=restore_fn)

        # Run the managed session.
        with sv.managed_session() as sess:
            # Defined up front so the final log line works even when the loop
            # below runs zero times.
            loss = None
            for step in range(num_steps_per_epoch * num_epochs):
                # At the start of every epoch, show the vital information.
                if step % num_batches_per_epoch == 0:
                    # Floor division keeps the epoch number an integer on
                    # Python 3 ("Epoch 1/10" rather than "Epoch 1.0/10").
                    logging.info('Epoch %s/%s',
                                 step // num_batches_per_epoch + 1, num_epochs)
                    learning_rate_value, accuracy_value = sess.run([lr, accuracy])
                    logging.info('Current Learning Rate: %s', learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s', accuracy_value)

                    # Optionally, print logits and predictions as a sanity
                    # check that things are going fine.
                    (logits_value, probabilities_value, predictions_value,
                     labels_value) = sess.run(
                         [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value)
                    print('Probabilities: \n', probabilities_value)
                    print('predictions: \n', predictions_value)
                    print('Labels:\n:', labels_value)

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    loss, _ = train_step(sess, train_op, sv.global_step)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)
                # If not, simply run the training step.
                else:
                    loss, _ = train_step(sess, train_op, sv.global_step)

            # We log the final training loss and accuracy.
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Once all the training has been done, save the log files and
            # checkpoint model.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
def train(split, train_steps, train_dir, fc_after, level,
          checkpoints_dir=checkpoints_dir, checkpoint='model.ckpt-150000'):
    """Train the intermediate-layer network and checkpoint it in `train_dir`.

    Args:
        split: Name of the flower dataset split to train on.
        train_steps: Number of steps to train the network.
        train_dir: Directory in which checkpoints are stored, and from which
            old checkpoints get loaded.
        fc_after: Forwarded to `my_intermediate_cnn` and `get_init_fn`.
        level: One of "family", "genus", "species" or "organs"; selects the
            label set and the number of classes. Any other value falls back
            to the species labels with 1000 classes.
        checkpoints_dir: Not used in this function body.
        checkpoint: Not used in this function body.

    Returns:
        None. The final batch loss is printed.
    """
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)  # showing INFO logs

        dataset = dataVisualisation.get_split(
            split_name=split, dataset_dir=flowers_data_dir,
            label_type="multiple")
        batch = load_batch_intermediate(
            dataset, height=224, width=224, is_training=True, batch_size=100)
        (images, _, label_species, labels_genus, labels_family,
         labels_organ) = batch

        # Label tensor and class count for each supported abstraction level.
        labels_by_level = {
            "family": labels_family,
            "genus": labels_genus,
            "species": label_species,
            "organs": labels_organ,
        }
        classes_by_level = {
            "family": 124,
            "genus": 516,
            "species": 1000,
            "organs": 7,
        }

        labels = tf.stack(labels_by_level.get(level, label_species))
        one_hot_labels = slim.one_hot_encoding(
            labels, classes_by_level.get(level, 1000))

        # Forward pass with non-flipped images.
        logits, _ = my_intermediate_cnn(
            images,
            is_training=True,
            fc_after=fc_after,
            num_classes=classes_by_level.get(level, 1000))

        tf.losses.softmax_cross_entropy(one_hot_labels, logits)
        total_loss = tf.losses.get_total_loss()
        tf.summary.scalar('losses/Total_Loss', total_loss)

        # Learning rate decay: drop by 10x at each boundary step.
        global_step = variables.get_or_create_global_step()
        boundaries = [
            tf.constant(boundary, dtype="int64")
            for boundary in (100000, 200000, 300000)
        ]
        values = [0.001, 0.0001, 0.00001, 0.000001]
        my_learning_rate = tf.train.piecewise_constant(
            global_step, boundaries, values)

        # Specify the optimizer and create the train op; only the variables
        # under the "fc_intermediate" scope are trained.
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=my_learning_rate, momentum=0.9)
        trainable = slim.get_variables(scope="fc_intermediate")
        train_op = slim.learning.create_train_op(
            total_loss=total_loss,
            optimizer=optimizer,
            variables_to_train=trainable)

        saver = tf.train.Saver(max_to_keep=1)

        # Run the training.
        final_loss = slim.learning.train(
            train_op,
            logdir=train_dir,
            log_every_n_steps=50,
            init_fn=get_init_fn(fc_after),
            number_of_steps=train_steps,
            global_step=global_step,
            saver=saver)

    print('Finished training. Last batch loss %f' % final_loss)
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

    When the sync_optimizer is supplied, gradient updates are applied
    synchronously. Otherwise, gradient updates are applied asynchronously.

    Args:
      train_op: A `Tensor` that, when executed, will apply the gradients and
        return the loss value.
      logdir: The directory where training logs are written to. If None, model
        checkpoints and summaries will not be written.
      train_step_fn: The function to call in order to execute a single gradient
        step. The function must take exactly four arguments: the current
        session, the `train_op` `Tensor`, a global step `Tensor` and a
        dictionary.
      train_step_kwargs: A dictionary which is passed to the `train_step_fn`.
        By default, two `Boolean`, scalar ops called "should_stop" and
        "should_log" are provided.
      log_every_n_steps: The frequency, in terms of global steps, that the loss
        and global step are logged.
      graph: The graph to pass to the supervisor. If no graph is supplied the
        default graph is used.
      master: The address of the tensorflow master.
      is_chief: Specifies whether or not the training is being run by the
        primary replica during replica training.
      global_step: The `Tensor` representing the global step. If left as `None`,
        then slim.variables.get_or_create_global_step() is used.
      number_of_steps: The max number of gradient steps to take during training.
        If the value is left as None, training proceeds indefinitely.
      init_op: The initialization operation. If left to its default value, then
        the session is initialized by calling
        `tf.global_variables_initializer()`.
      init_feed_dict: A feed dictionary to use when executing the `init_op`.
      local_init_op: The local initialization operation. If left to its default
        value, then the session is initialized by calling
        `tf.local_variables_initializer()` and `tf.tables_initializer()`.
      init_fn: An optional callable to be executed after `init_op` is called.
        The callable must accept one argument, the session being initialized.
      ready_op: Operation to check if the model is ready to use. If left to its
        default value, then the session checks for readiness by calling
        `tf.report_uninitialized_variables()`.
      summary_op: The summary operation.
      save_summaries_secs: How often, in seconds, to save summaries.
      summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no
        summaries should be written. If unset, we create a SummaryWriter.
      startup_delay_steps: The number of steps to wait for before beginning.
        Note that this must be 0 if a sync_optimizer is supplied.
      saver: Saver to save checkpoints. If None, a default one will be created
        and used.
      save_interval_secs: How often, in seconds, to save the model to `logdir`.
      sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
        argument is supplied, gradient updates will be synchronous. If left as
        `None`, gradient updates will be asynchronous.
      session_config: An instance of `tf.ConfigProto` that will be used to
        configure the `Session`. If left as `None`, the default will be used.
      trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
        and add it to the summaries every `trace_every_n_steps`. If None, no
        trace information will be produced or saved.

    Returns:
      the value of the loss function after training, or `None` if the loop
      stopped before a single training step was executed.

    Raises:
      ValueError: if `train_op` is empty or if `startup_delay_steps` is
        non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
        not positive, or if `summary_op`, `saver` or `trace_every_n_steps`
        is provided while `logdir` is `None`.
    """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.')

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies(
                        [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(
        graph=graph,
        is_chief=is_chief,
        logdir=logdir,
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        local_init_op=local_init_op,
        ready_for_local_init_op=ready_for_local_init_op,
        ready_op=ready_op,
        summary_op=summary_op,
        summary_writer=summary_writer,
        global_step=global_step,
        saver=saver,
        save_summaries_secs=save_summaries_secs,
        save_model_secs=save_interval_secs,
        init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    # Bound up front so the final `return` is valid even when the supervisor
    # asks to stop before the first training step.
    total_loss = None
    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(
                    master, start_standard_services=False,
                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    # `sys.maxsize` replaces `sys.maxint`, which does not
                    # exist on Python 3; it only acts as "no upper bound".
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps or sys.maxsize))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one
            # of the distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
def run():
    """Evaluate the combined RGB + depth Inception-v3 model on the validation split.

    Two Inception-v3 towers (variable scopes "net_rgb" and "net_depth") are
    built, their logits are concatenated and passed through a small stack of
    fully connected layers. Streaming accuracy and a 5x5 confusion matrix are
    accumulated over `num_epochs` passes. Settings such as `log_eval`,
    `batch_size`, `num_classes` and `checkpoint_file` are read from
    module-level names.
    """
    end_points = {}
    if not os.path.exists(log_eval):
        os.mkdir(log_eval)

    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)  # Set the verbosity to INFO level

        ########################################################
        # Get RGB dataset and the Imagenet trained on RGB images
        ########################################################
        # First create the dataset and load one batch.
        dataset_rgb = get_split('validation', dataset_dir_rgb,
                                file_pattern=file_pattern)
        images_rgb, labels_rgb = load_batch(dataset_rgb, batch_size=batch_size)

        # Number of batches per epoch; one step is one batch processed.
        num_batches_per_epoch = int(dataset_rgb.num_samples / batch_size)
        num_steps_per_epoch = num_batches_per_epoch

        with tf.variable_scope("net_rgb"):
            # Create the model inference.
            with slim.arg_scope(inception_v3_arg_scope()):
                logits_rgb, end_points_rgb = inception_v3(
                    images_rgb,
                    num_classes=dataset_rgb.num_classes,
                    is_training=True)

        ########################################################
        # Get depth dataset and the Imagenet trained on depth images
        ########################################################
        # First create the dataset and load one batch.
        dataset_depth = get_split('validation', dataset_dir_depth,
                                  file_pattern=file_pattern)
        images_depth, labels_depth = load_batch(dataset_depth,
                                                batch_size=batch_size)

        # Create the model inference.
        with tf.variable_scope("net_depth"):
            with slim.arg_scope(inception_v3_arg_scope()):
                logits_depth, end_points_depth = inception_v3(
                    images_depth,
                    num_classes=dataset_rgb.num_classes,
                    is_training=True)

        ########################################################
        # Combine the models with the concatenation operation
        # and add an FC layer on top
        ########################################################
        W_master = tf.Variable(tf.random_uniform([10, 5], -0.01, 0.01),
                               name="weights_concat")
        b_master = tf.Variable(tf.zeros([5]), name="bias_concat")
        h_master = tf.matmul(
            tf.concat((logits_rgb, logits_depth), axis=1), W_master) + b_master

        logits2 = tf.layers.dense(inputs=h_master,
                                  units=(num_classes * 2),
                                  name="dense_concat1")
        logits = tf.layers.dense(inputs=logits2,
                                 units=num_classes,
                                 name="dense_concat0")

        end_points['Logits'] = logits
        end_points['Predictions'] = slim.softmax(logits, scope='Predictions')

        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        ####################################################
        # EVALUATION
        ####################################################
        predictions = tf.argmax(end_points['Predictions'], 1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels_rgb)
        metrics_op = tf.group(accuracy_update)

        # No optimizer runs here, so the global step is advanced by hand.
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(global_step, global_step + 1)

        conf_m = np.zeros((5, 5))

        def eval_step(sess, metrics_op, global_step, confusion_m):
            """Run the metrics op once, update `confusion_m` in place and log.

            Returns:
                The streaming accuracy value fetched in this step.
            """
            start_time = time.time()
            _, global_step_count, accuracy_value = sess.run(
                [metrics_op, global_step_op, accuracy])
            time_elapsed = time.time() - start_time

            # This second run fetches labels and predictions for the
            # confusion matrix.
            images_rgb_im, images_depth_im, labels, prediction = sess.run(
                [images_rgb, images_depth, labels_rgb, predictions])
            confusion_m += confusion_matrix(labels, prediction,
                                            labels=[0, 1, 2, 3, 4])

            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, time_elapsed)
            return accuracy_value

        tf.summary.scalar('Validation_Accuracy', accuracy)
        my_summary_op = tf.summary.merge_all()

        sv = tf.train.Supervisor(logdir=log_eval,
                                 summary_op=None,
                                 saver=None,
                                 init_fn=restore_fn)

        with sv.managed_session() as sess:
            num_steps_per_epoch = int(num_steps_per_epoch)
            for step in range(num_steps_per_epoch * num_epochs):
                sess.run(sv.global_step)
                if step % num_batches_per_epoch == 0:
                    # Floor division keeps the epoch number an integer on
                    # Python 3 ("Epoch: 1/3" rather than "Epoch: 1.0/3").
                    logging.info('Epoch: %s/%s',
                                 step // num_batches_per_epoch + 1, num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 sess.run(accuracy))

                # Summaries are written only every 10th step.
                if step % 10 == 0:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step,
                              confusion_m=conf_m)
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)
                else:
                    eval_step(sess,
                              metrics_op=metrics_op,
                              global_step=sv.global_step,
                              confusion_m=conf_m)

            logging.info('Final Streaming Accuracy: %.4f', sess.run(accuracy))

            # Fetch one more batch to print a few example predictions. The
            # results get their own names so the graph tensors are not
            # shadowed by NumPy arrays.
            _, _, label_values, prediction_values = sess.run(
                [images_rgb, images_depth, labels_rgb, predictions])
            print(sess.run(end_points['Predictions']))
            print(conf_m)

            # Show at most 10 examples; a batch smaller than 10 would
            # otherwise raise IndexError.
            for i in range(min(10, len(label_values))):
                label, prediction = label_values[i], prediction_values[i]
                prediction_name = dataset_rgb.labels_to_name[prediction]
                label_name = dataset_rgb.labels_to_name[label]
                text = 'Prediction: %s \n Ground Truth: %s' % (prediction_name,
                                                               label_name)
                print(text)
            logging.info(
                'Model evaluation has completed! Visit TensorBoard for more information regarding your evaluation.'
            )
def run():
    """Evaluate a restored ENet checkpoint and log streaming test metrics.

    Builds the input queue and the ENet inference graph, restores the model
    variables from ``checkpoint_file`` through the supervisor's ``init_fn``
    and runs ``int(num_steps_per_epoch * num_epochs)`` evaluation steps.
    Streaming accuracy, mean IOU and per-class accuracy are logged on every
    step and written as summaries every 10 steps. When ``save_images`` is
    truthy, one more batch is evaluated and its predictions/annotations are
    saved as figures under ``photo_dir``.

    All configuration (file lists, image size, batch size, directories, ...)
    is read from names that are not defined inside this function.
    """
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)

        # ===================TEST BRANCH=======================
        # Load the file names into one input queue.
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        input_queue = tf.train.slice_input_producer([images, annotations])

        # Decode the image and annotation raw content.
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        # Preprocess and batch up the image and annotation.
        preprocessed_image, preprocessed_annotation = preprocess(
            image, annotation, image_height, image_width)
        images, annotations = tf.train.batch(
            [preprocessed_image, preprocessed_annotation],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        # Create the model inference.
        with slim.arg_scope(ENet_arg_scope()):
            logits, probabilities = ENet(
                images,
                num_classes,
                batch_size=batch_size,
                is_training=True,
                reuse=None,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        # Collect the variables to restore *before* the metric variables and
        # the global step are created below, so the saver only covers the
        # variables that exist at this point.
        exclude = []
        variables_to_restore = slim.get_variables_to_restore(exclude=exclude)
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            """Restore the collected variables from ``checkpoint_file``."""
            return saver.restore(sess, checkpoint_file)

        # Drop the channel axis of the annotations and cast them to the
        # integer type produced by tf.argmax for the metric comparisons.
        annotations = tf.reshape(
            annotations, shape=[batch_size, image_height, image_width])
        annotations = tf.cast(annotations, tf.int64)

        # State the metrics to compute. The predictions are class indices,
        # not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        per_class_accuracy, per_class_accuracy_update = (
            tf.metrics.mean_per_class_accuracy(
                labels=annotations,
                predictions=predictions,
                num_classes=num_classes))
        metrics_op = tf.group(accuracy_update, mean_IOU_update,
                              per_class_accuracy_update)

        # There is no apply_gradients call during evaluation, so the global
        # step is incremented manually for monitoring.
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(global_step, global_step + 1)

        def eval_step(sess, metrics_op, global_step):
            """Run one metrics update and log the current streaming values.

            ``global_step`` is accepted for call compatibility but not used;
            the step counter is advanced through ``global_step_op``.

            Returns:
                Tuple ``(accuracy, mean_IOU, per_class_accuracy)`` as fetched
                in the same ``sess.run`` call as the update ops.
            """
            start_time = time.time()
            (_, global_step_count, accuracy_value, mean_IOU_value,
             per_class_accuracy_value) = sess.run([
                 metrics_op, global_step_op, accuracy, mean_IOU,
                 per_class_accuracy
             ])
            time_elapsed = time.time() - start_time

            # Log some information.
            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f Streaming Mean IOU: %.4f Per-class Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, mean_IOU_value,
                per_class_accuracy_value, time_elapsed)
            return accuracy_value, mean_IOU_value, per_class_accuracy_value

        # Create the summaries.
        tf.summary.scalar('Monitor/test_accuracy', accuracy)
        tf.summary.scalar('Monitor/test_mean_per_class_accuracy',
                          per_class_accuracy)
        tf.summary.scalar('Monitor/test_mean_IOU', mean_IOU)
        my_summary_op = tf.summary.merge_all()

        # summary_op=None: summaries are computed explicitly in the loop
        # below instead of automatically by the supervisor.
        sv = tf.train.Supervisor(logdir=logdir,
                                 summary_op=None,
                                 init_fn=restore_fn)

        # Run the managed session.
        with sv.managed_session() as sess:
            for step in range(int(num_steps_per_epoch * num_epochs)):
                # Print vital information at the start of every epoch.
                if step % num_batches_per_epoch == 0:
                    accuracy_value, mean_IOU_value = sess.run(
                        [accuracy, mean_IOU])
                    # Floor division keeps the epoch index an integer on
                    # Python 3 as well.
                    logging.info('Epoch: %s/%s',
                                 step // num_batches_per_epoch + 1,
                                 num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 accuracy_value)
                    logging.info('Current Streaming Mean IOU: %.4f',
                                 mean_IOU_value)

                eval_step(sess,
                          metrics_op=metrics_op,
                          global_step=sv.global_step)

                # Compute summaries every 10 steps.
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Read the final values in their own run call: values fetched
            # together with their update ops may or may not include that
            # update, and this also works when the loop ran zero times.
            test_accuracy, test_mean_IOU, test_per_class_accuracy = sess.run(
                [accuracy, mean_IOU, per_class_accuracy])
            logging.info('Final Streaming Accuracy: %.4f', test_accuracy)
            logging.info('Final Mean IOU: %.4f', test_mean_IOU)
            logging.info('Final Per Class Accuracy %.4f',
                         test_per_class_accuracy)

            # Show end of evaluation.
            logging.info('Finished evaluating!')

            # Save the images.
            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                logging.info('Saving the images now...')
                predictions_val, annotations_val = sess.run(
                    [predictions, annotations])

                # Save at most 10 visualizations; a batch smaller than 10
                # would otherwise raise an IndexError.
                for i in range(min(10, len(predictions_val))):
                    predicted_annotation = predictions_val[i]
                    annotation = annotations_val[i]

                    plt.subplot(1, 2, 1)
                    plt.imshow(predicted_annotation)
                    plt.subplot(1, 2, 2)
                    plt.imshow(annotation)
                    plt.savefig(photo_dir + "/image_" + str(i))
def run():
    """Train the Xception classifier and save a final checkpoint.

    Creates ``log_dir`` if needed, builds the training graph (input batch,
    Xception inference, softmax cross-entropy loss, exponentially decaying
    learning rate, Adam optimizer, streaming accuracy and summaries) and runs
    ``num_steps_per_epoch * num_epochs`` training steps inside a supervisor
    managed session. Summaries are written every 10 steps and a checkpoint is
    saved explicitly once training has finished.

    Configuration (``dataset_dir``, ``batch_size``, ``num_epochs``, ...) is
    read from names that are not defined inside this function.
    """
    # Create the log directory here. Must be done here, otherwise merely
    # importing this module would create it.
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    # ======================= TRAINING PROCESS =========================
    # Now we start to construct the graph and build our model.
    with tf.Graph().as_default():
        # Set the verbosity to INFO level.
        tf.logging.set_verbosity(tf.logging.INFO)

        # First create the dataset and load one batch.
        dataset = get_split('train', dataset_dir, file_pattern=file_pattern)
        images, _, labels = load_batch(dataset, batch_size=batch_size)

        # Number of steps before decaying the learning rate, and batches per
        # epoch. One step is one batch processed.
        num_batches_per_epoch = dataset.num_samples // batch_size
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = int(num_epochs_before_decay * num_steps_per_epoch)

        # Create the model inference.
        with slim.arg_scope(xception_arg_scope()):
            logits, end_points = xception(images,
                                          num_classes=dataset.num_classes,
                                          is_training=True)

        # Convert the labels to one-hot form.
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # The return value is not needed here: the total loss (including any
        # regularization losses) is obtained from tf.losses right below.
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=logits)
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning rate and
        # training.
        global_step = get_or_create_global_step()

        # Define the exponentially decaying learning rate.
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Define the optimizer that takes on the learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics to compute. The predictions are class indices,
        # not one-hot encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Create all the summaries to monitor and merge them into one op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        def train_step(sess, train_op, global_step):
            """Run one training step together with the metrics update.

            Logs the global step, the loss and the wall-clock time of the
            ``sess.run`` call.

            Returns:
                Tuple ``(loss_value, global_step_count)``.
            """
            start_time = time.time()
            loss_value, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to print some results.
            logging.info('global step %s: loss: %.4f (%.2f sec/step)',
                         global_step_count, loss_value, time_elapsed)
            return loss_value, global_step_count

        # summary_op=None: summaries are computed explicitly in the loop
        # below instead of automatically by the supervisor.
        sv = tf.train.Supervisor(logdir=log_dir, summary_op=None)

        # Run the managed session.
        with sv.managed_session() as sess:
            # Defined up front so the final log does not fail with a
            # NameError when the loop runs zero times.
            loss = None

            for step in range(num_steps_per_epoch * num_epochs):
                # At the start of every epoch, show the vital information.
                if step % num_batches_per_epoch == 0:
                    # Floor division keeps the epoch index an integer on
                    # Python 3 as well.
                    logging.info('Epoch %s/%s',
                                 step // num_batches_per_epoch + 1,
                                 num_epochs)
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # Print logits and predictions as a sanity check that
                    # things are going fine.
                    (logits_value, probabilities_value, predictions_value,
                     labels_value) = sess.run(
                         [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:5])
                    print('Labels:\n:', labels_value[:5])

                loss, _ = train_step(sess, train_op, sv.global_step)

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Log the final training loss and accuracy.
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Once all the training has been done, save the checkpoint. The
            # message below was previously logged without saving anything.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
def _build_graph(self):
    """Build the training and validation graph and store its handles on self.

    Loads one batch for the 'train' and 'validation' splits, builds
    Inception-ResNet-v2 twice (training, then validation with ``reuse=True``)
    and defines the losses, learning-rate schedule, train op, accuracies and
    the merged summary op.

    Attributes set on ``self``: ``train_labels``, ``num_samples``,
    ``val_labels``, ``val_num_samples``, ``num_batches_per_epoch``,
    ``val_num_batches_per_epoch``, ``probabilities``, ``global_step``,
    ``lr``, ``train_op``, ``accuracy``, ``val_probabilities``, ``val_loss``,
    ``val_accuracy`` and ``my_summary_op``.

    NOTE(review): the ``tf.Graph`` created here is not stored on ``self``;
    presumably callers reach it through the stored tensors - confirm.
    """
    with tf.Graph().as_default():
        # Set the verbosity to INFO level.
        tf.logging.set_verbosity(tf.logging.INFO)

        def load_batch_from_tfrecord(split_name):
            """Return ``(images, labels, num_samples)`` for one split."""
            is_training = split_name == 'train'
            file_pattern = self.tfrecord_prefix + '_%s_*.tfrecord'
            dataset = get_split(split_name, self.tfrecord_dir,
                                self.num_classes, file_pattern,
                                self.tfrecord_prefix)
            images, _, labels = load_batch(dataset,
                                           self.batch_size,
                                           self.num_classes,
                                           height=self.image_size,
                                           width=self.image_size,
                                           is_training=is_training)
            return images, labels, dataset.num_samples

        # Get the training data.
        train_images, self.train_labels, self.num_samples = (
            load_batch_from_tfrecord('train'))
        # Get the validation data.
        val_images, self.val_labels, self.val_num_samples = (
            load_batch_from_tfrecord('validation'))

        # Batches per epoch, rounded up. Floor division is required here:
        # with "/" this becomes a fractional float on Python 3, which also
        # skews decay_steps below.
        self.num_batches_per_epoch = (
            (self.num_samples - 1) // self.batch_size + 1)
        self.val_num_batches_per_epoch = (
            (self.val_num_samples - 1) // self.batch_size + 1)

        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, _ = inception_resnet_v2(train_images,
                                            num_classes=self.num_classes,
                                            is_training=True)

        # Convert the logits into per-class probabilities.
        self.probabilities = tf.sigmoid(logits)

        # Log loss on the probabilities plus an explicit L2 penalty over all
        # trainable variables, scaled by self.weight_decay.
        loss = tf.losses.log_loss(labels=self.train_labels,
                                  predictions=self.probabilities)
        l2_loss = tf.add_n(
            [tf.nn.l2_loss(var) for var in tf.trainable_variables()])
        total_loss = loss + l2_loss * self.weight_decay

        # Threshold the probabilities at 0.5 to get the actual predictions.
        lesion_pred = tf.cast(tf.greater_equal(self.probabilities, 0.5),
                              tf.float32)

        # Create the global step for monitoring the learning rate and
        # training.
        self.global_step = get_or_create_global_step()
        decay_steps = int(self.step_size * self.num_batches_per_epoch)

        # Define the exponentially decaying learning rate.
        self.lr = tf.train.exponential_decay(
            learning_rate=self.learning_rate,
            global_step=self.global_step,
            decay_steps=decay_steps,
            decay_rate=self.lr_decay_factor,
            staircase=True)

        # Define the optimizer that takes on the learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        # Create the train_op.
        self.train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Training accuracy: fraction of thresholded predictions that equal
        # the labels.
        self.accuracy = tf.reduce_mean(
            tf.cast(tf.equal(lesion_pred, self.train_labels), tf.float32))

        # Validation branch: same network with shared weights.
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            val_logits, _ = inception_resnet_v2(val_images,
                                                num_classes=self.num_classes,
                                                is_training=False,
                                                reuse=True)

        self.val_probabilities = tf.sigmoid(val_logits)
        self.val_loss = tf.losses.log_loss(
            labels=self.val_labels, predictions=self.val_probabilities)
        val_lesion_pred = tf.cast(
            tf.greater_equal(self.val_probabilities, 0.5), tf.float32)
        self.val_accuracy = tf.reduce_mean(
            tf.cast(tf.equal(val_lesion_pred, self.val_labels), tf.float32))

        # Create all the summaries to monitor and merge them into one op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', self.accuracy)
        tf.summary.scalar('learning_rate', self.lr)
        tf.summary.scalar('val_losses', self.val_loss)
        tf.summary.scalar('val_accuracy', self.val_accuracy)
        self.my_summary_op = tf.summary.merge_all()
def run():
    """Train ENet with periodic validation, then save a checkpoint.

    Builds a training branch and a validation branch (the latter reusing the
    training variables), runs ``int(num_steps_per_epoch * num_epochs)``
    training steps in a supervisor managed session, evaluates the validation
    set roughly every third of an epoch, writes summaries every
    ``min(num_steps_per_epoch, 10)`` steps and finally saves the model.
    When ``save_images`` is truthy, one validation batch is rendered to
    ``photo_dir``.

    Configuration (file lists, image size, batch sizes, hyper-parameters,
    directories) is read from names that are not defined inside this
    function.
    """
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)

        # ===================TRAINING BRANCH=======================
        # Load the file names into one input queue.
        images = tf.convert_to_tensor(image_files)
        annotations = tf.convert_to_tensor(annotation_files)
        # slice_input_producer shuffles the data by default.
        input_queue = tf.train.slice_input_producer([images, annotations])

        # Decode the image and annotation raw content.
        image = tf.read_file(input_queue[0])
        image = tf.image.decode_image(image, channels=3)
        annotation = tf.read_file(input_queue[1])
        annotation = tf.image.decode_image(annotation)

        # Preprocess and batch up the image and annotation.
        preprocessed_image, preprocessed_annotation = preprocess(
            image, annotation, image_height, image_width)
        images, annotations = tf.train.batch(
            [preprocessed_image, preprocessed_annotation],
            batch_size=batch_size,
            allow_smaller_final_batch=True)

        # Create the model inference.
        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits, probabilities = ENet(
                images,
                num_classes,
                batch_size=batch_size,
                is_training=True,
                reuse=None,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        # One-hot encode the ground truth annotation to get the same shape
        # as the logits.
        annotations = tf.reshape(
            annotations, shape=[batch_size, image_height, image_width])
        annotations_ohe = tf.one_hot(annotations, num_classes, axis=-1)

        # Compute the loss. The return value is not used directly; the total
        # loss is obtained from tf.losses right below.
        weighted_cross_entropy(logits=logits,
                               onehot_labels=annotations_ohe,
                               class_weights=class_weights)
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning rate and
        # training.
        global_step = get_or_create_global_step()

        # Define the exponentially decaying learning rate.
        lr = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)

        # Define the optimizer that takes on the learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=epsilon)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics to compute. The predictions are class indices,
        # not one-hot encoded.
        predictions = tf.argmax(probabilities, -1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, annotations)
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=predictions,
            labels=annotations,
            num_classes=num_classes)
        metrics_op = tf.group(accuracy_update, mean_IOU_update)

        def train_step(sess, train_op, global_step, metrics_op):
            """Run one training step together with the metrics update.

            Logs the global step, loss, elapsed time and the streaming
            training metrics fetched in the same ``sess.run`` call.

            Returns:
                Tuple ``(loss_value, accuracy_value, mean_IOU_value)``.
            """
            start_time = time.time()
            (loss_value, global_step_count, accuracy_value, mean_IOU_value,
             _) = sess.run(
                 [train_op, global_step, accuracy, mean_IOU, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to show some results.
            logging.info(
                'global step %s: loss: %.4f (%.2f sec/step) Current Streaming Accuracy: %.4f Current Mean IOU: %.4f',
                global_step_count, loss_value, time_elapsed, accuracy_value,
                mean_IOU_value)
            return loss_value, accuracy_value, mean_IOU_value

        # ================VALIDATION BRANCH========================
        # Load the file names into one input queue.
        images_val = tf.convert_to_tensor(image_val_files)
        annotations_val = tf.convert_to_tensor(annotation_val_files)
        input_queue_val = tf.train.slice_input_producer(
            [images_val, annotations_val])

        # Decode the image and annotation raw content.
        image_val = tf.read_file(input_queue_val[0])
        image_val = tf.image.decode_jpeg(image_val, channels=3)
        annotation_val = tf.read_file(input_queue_val[1])
        annotation_val = tf.image.decode_png(annotation_val)

        # Preprocess and batch up the image and annotation.
        preprocessed_image_val, preprocessed_annotation_val = preprocess(
            image_val, annotation_val, image_height, image_width)
        images_val, annotations_val = tf.train.batch(
            [preprocessed_image_val, preprocessed_annotation_val],
            batch_size=eval_batch_size,
            allow_smaller_final_batch=True)

        # reuse=True shares the variables created by the training branch.
        with slim.arg_scope(ENet_arg_scope(weight_decay=weight_decay)):
            logits_val, probabilities_val = ENet(
                images_val,
                num_classes,
                batch_size=eval_batch_size,
                is_training=True,
                reuse=True,
                num_initial_blocks=num_initial_blocks,
                stage_two_repeat=stage_two_repeat,
                skip_connections=skip_connections)

        # Drop the channel axis of the validation annotations.
        annotations_val = tf.reshape(
            annotations_val,
            shape=[eval_batch_size, image_height, image_width])

        # Validation metrics, again on class indices.
        predictions_val = tf.argmax(probabilities_val, -1)
        accuracy_val, accuracy_val_update = (
            tf.contrib.metrics.streaming_accuracy(predictions_val,
                                                  annotations_val))
        mean_IOU_val, mean_IOU_val_update = (
            tf.contrib.metrics.streaming_mean_iou(
                predictions=predictions_val,
                labels=annotations_val,
                num_classes=num_classes))
        metrics_op_val = tf.group(accuracy_val_update, mean_IOU_val_update)

        # Create image tensors showing the segmentation output and ground
        # truth of validation images.
        segmentation_output_val = tf.cast(predictions_val, dtype=tf.float32)
        segmentation_output_val = tf.reshape(
            segmentation_output_val,
            shape=[-1, image_height, image_width, 1])
        segmentation_ground_truth_val = tf.cast(annotations_val,
                                                dtype=tf.float32)
        segmentation_ground_truth_val = tf.reshape(
            segmentation_ground_truth_val,
            shape=[-1, image_height, image_width, 1])

        def eval_step(sess, metrics_op):
            """Run one validation metrics update and log the values.

            Returns:
                Tuple ``(accuracy_value, mean_IOU_value)``.
            """
            start_time = time.time()
            _, accuracy_value, mean_IOU_value = sess.run(
                [metrics_op, accuracy_val, mean_IOU_val])
            time_elapsed = time.time() - start_time

            # Log some information.
            logging.info(
                '---VALIDATION--- Validation Accuracy: %.4f Validation Mean IOU: %.4f (%.2f sec/step)',
                accuracy_value, mean_IOU_value, time_elapsed)
            return accuracy_value, mean_IOU_value

        # =====================================================
        # Create all the summaries to monitor and merge them into one op.
        tf.summary.scalar('Monitor/Total_Loss', total_loss)
        tf.summary.scalar('Monitor/validation_accuracy', accuracy_val)
        tf.summary.scalar('Monitor/training_accuracy', accuracy)
        tf.summary.scalar('Monitor/validation_mean_IOU', mean_IOU_val)
        tf.summary.scalar('Monitor/training_mean_IOU', mean_IOU)
        tf.summary.scalar('Monitor/learning_rate', lr)
        tf.summary.image('Images/Validation_original_image',
                         images_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_output',
                         segmentation_output_val,
                         max_outputs=1)
        tf.summary.image('Images/Validation_segmentation_ground_truth',
                         segmentation_ground_truth_val,
                         max_outputs=1)
        my_summary_op = tf.summary.merge_all()

        # summary_op=None: summaries are computed explicitly in the loop
        # below instead of automatically by the supervisor.
        sv = tf.train.Supervisor(logdir=logdir, summary_op=None, init_fn=None)

        # Run the managed session.
        with sv.managed_session() as sess:
            # Defined up front so the final logs cannot fail with a
            # NameError when training or validation never ran.
            loss = training_accuracy = training_mean_IOU = None
            validation_accuracy = validation_mean_IOU = None

            # Validate every third of an epoch; at least every step so that
            # epochs shorter than 3 steps do not cause a modulo by zero.
            validation_interval = max(1, num_steps_per_epoch // 3)
            num_validation_batches = len(image_val_files) // eval_batch_size

            for step in xrange(int(num_steps_per_epoch * num_epochs)):
                # At the start of every epoch, show the vital information.
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch %s/%s',
                                 step // num_batches_per_epoch + 1,
                                 num_epochs)
                    # Fetch the tensor itself so a scalar is logged rather
                    # than a one-element list.
                    learning_rate_value = sess.run(lr)
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)

                loss, training_accuracy, training_mean_IOU = train_step(
                    sess, train_op, sv.global_step, metrics_op=metrics_op)

                # Log the summaries every 10 steps or every end of epoch,
                # whichever is lower.
                if step % min(num_steps_per_epoch, 10) == 0:
                    # Check the validation data only at every third of an
                    # epoch.
                    if step % validation_interval == 0:
                        for _ in xrange(num_validation_batches):
                            validation_accuracy, validation_mean_IOU = (
                                eval_step(sess, metrics_op_val))

                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Log the final training and validation values.
            logging.info('Final Loss: %s', loss)
            logging.info('Final Training Accuracy: %s', training_accuracy)
            logging.info('Final Training Mean IOU: %s', training_mean_IOU)
            logging.info('Final Validation Accuracy: %s',
                         validation_accuracy)
            logging.info('Final Validation Mean IOU: %s',
                         validation_mean_IOU)

            # Once all the training has been done, save the checkpoint.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

            if save_images:
                if not os.path.exists(photo_dir):
                    os.mkdir(photo_dir)

                # Plot the predictions - validation images only.
                logging.info('Saving the images now...')
                predictions_value, annotations_value = sess.run(
                    [predictions_val, annotations_val])

                for i in xrange(eval_batch_size):
                    predicted_annotation = predictions_value[i]
                    annotation = annotations_value[i]

                    plt.subplot(1, 2, 1)
                    plt.imshow(predicted_annotation)
                    plt.subplot(1, 2, 2)
                    plt.imshow(annotation)
                    plt.savefig(photo_dir + "/image_" + str(i))
def run():
    """ Trainer Runner

    Runs the ALL Detection System 2019 NCS1 Classifier Trainer.

    Reads the label names from the configured labels file into
    ``Trainer.labelsToName``, trains Inception v3 for the configured number
    of epochs, saves a final checkpoint to the configured log directory, then
    rebuilds an inference graph, restores the latest checkpoint into it and
    writes the frozen graph to the configured ``ALLGraph`` path.
    """
    humanStart, clockStart = Trainer.Helpers.timerStart()

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer started.")

    # Open the labels file. The handle is still published on Trainer.labels
    # as before, but it is now closed once the labels have been parsed.
    labels_path = (Trainer.confs["Classifier"]["DatasetDir"] + "/" +
                   Trainer.confs["Classifier"]["Labels"])
    with open(labels_path, 'r') as labels_file:
        Trainer.labels = labels_file

        # Create a dictionary to refer each label to its string name.
        for line in labels_file:
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character.
            line = line.rstrip('\r\n')
            if not line:
                continue
            # Split on the first colon only, so names may contain colons.
            label, string_name = line.split(':', 1)
            Trainer.labelsToName[int(label)] = string_name

    # Create a dictionary that will help people understand your dataset
    # better. This is required by the Dataset class later.
    Trainer.items_to_descriptions = {
        'image': 'A 3-channel RGB coloured image that is ex: office, people',
        'label': 'A label that ,start from zero'
    }

    # Create the log directory here. Must be done here, otherwise merely
    # importing this module would create it.
    if not os.path.exists(Trainer.confs["Classifier"]["LogDir"]):
        os.mkdir(Trainer.confs["Classifier"]["LogDir"])

    # Now we start to construct the graph and build our model.
    with tf.Graph().as_default():
        # Set the verbosity to INFO level.
        tf.logging.set_verbosity(tf.logging.INFO)

        # First create the dataset and load one batch.
        dataset = Trainer.getSplit('train')
        images, _, labels = Trainer.loadBatch(dataset)

        # Number of steps before decaying the learning rate, and batches per
        # epoch. One step is one batch processed.
        num_batches_per_epoch = (
            dataset.num_samples // Trainer.confs["Classifier"]["BatchSize"])
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = int(Trainer.confs["Classifier"]["EpochsBeforeDecay"] *
                          num_steps_per_epoch)

        # Create the model inference.
        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(images,
                                              num_classes=dataset.num_classes,
                                              is_training=True)

        # Perform one-hot-encoding of the labels.
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # The return value is not needed here: the total loss (including any
        # regularization losses) is obtained from tf.losses right below.
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=logits)
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning rate and
        # training.
        global_step = get_or_create_global_step()

        # Define the exponentially decaying learning rate.
        lr = tf.train.exponential_decay(
            learning_rate=Trainer.confs["Classifier"]["LearningRate"],
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=Trainer.confs["Classifier"]["LearningRateDecay"],
            staircase=True)

        # Define the optimizer that takes on the learning rate.
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)

        # Create the train_op.
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # State the metrics to compute. The predictions are class indices,
        # not one-hot encoded.
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Create all the summaries to monitor and merge them into one op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        def train_step(sess, train_op, global_step, epochCount):
            """Run one training step together with the metrics update.

            Logs ``epochCount``, the global step, the loss and the wall-clock
            time of the ``sess.run`` call.

            Returns:
                Tuple ``(loss_value, global_step_count)``.
            """
            start_time = time.time()
            loss_value, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            # Run the logging to print some results.
            logging.info(' Epch %.2f Glb Stp %s: Loss: %.4f (%.2f sec/step)',
                         epochCount, global_step_count, loss_value,
                         time_elapsed)
            return loss_value, global_step_count

        # summary_op=None: summaries are computed explicitly in the loop
        # below instead of automatically by the supervisor.
        sv = tf.train.Supervisor(logdir=Trainer.confs["Classifier"]["LogDir"],
                                 summary_op=None)

        # Run the managed session.
        with sv.managed_session() as sess:
            # Defined up front so the final log does not fail with a
            # NameError when the loop runs zero times.
            loss = None

            for step in range(num_steps_per_epoch *
                              Trainer.confs["Classifier"]["Epochs"]):
                # At the start of every epoch, show the vital information.
                if step % num_batches_per_epoch == 0:
                    # Floor division keeps the epoch index an integer on
                    # Python 3 as well.
                    logging.info('Epoch %s/%s',
                                 step // num_batches_per_epoch + 1,
                                 Trainer.confs["Classifier"]["Epochs"])
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # Print logits and predictions as a sanity check that
                    # things are going fine.
                    (logits_value, probabilities_value, predictions_value,
                     labels_value) = sess.run(
                         [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:100])
                    print('Labels:\n:', labels_value[:100])

                loss, _ = train_step(sess, train_op, sv.global_step,
                                     step / num_batches_per_epoch + 1)

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Log the final training loss and accuracy.
            logging.info('Final Loss: %s', loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Save the final weights explicitly. The export below restores
            # the latest checkpoint in LogDir, which could otherwise be
            # missing or older than the weights just trained.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    checkpoint_file = tf.train.latest_checkpoint(
        Trainer.confs["Classifier"]["LogDir"])

    # Rebuild the network for inference on a single image and freeze it.
    with tf.Graph().as_default() as graph:
        images = tf.placeholder("float", [
            1, Trainer.confs["Classifier"]["ImageSize"],
            Trainer.confs["Classifier"]["ImageSize"], 3
        ],
                                name="input")

        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images,
                num_classes=Trainer.confs["Classifier"]["NumClasses"],
                is_training=False)

        # Not referenced again in Python, but kept because it adds the
        # softmax op to the graph that is exported below.
        probabilities = tf.nn.softmax(logits)
        saver = tf.train.Saver(slim.get_variables_to_restore())

        # Setup graph def.
        input_graph_def = graph.as_graph_def()
        output_node_names = Trainer.confs["Classifier"]["OutputNode"]
        output_graph_name = Trainer.confs["Classifier"]["ALLGraph"]

        with tf.Session() as sess:
            saver.restore(sess, checkpoint_file)

            # Export the graph with variables converted to constants.
            print("Exporting graph...")
            output_graph_def = graph_util.convert_variables_to_constants(
                sess, input_graph_def, output_node_names.split(","))

            with tf.gfile.GFile(output_graph_name, "wb") as f:
                f.write(output_graph_def.SerializeToString())

    clockEnd, difference, humanEnd = Trainer.Helpers.timerEnd(clockStart)

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer ended in " + str(difference))
def train(pretrained_ckpt='/home/dsl/all_check/inception_v2.ckpt',
          logdir='/home/dsl/all_check/face_detect/coco_mask',
          max_steps=1000000000):
    """Build the box/mask training graph and run the supervised training loop.

    The graph is fed through placeholders sized from `config`; batches are
    pulled from `data_loader.q` and converted to training targets with
    `np_utils.get_loc_conf_mask`. Every time the global step is a multiple of
    10 the three loss components are printed and the merged summaries are
    written through the supervisor.

    Args:
        pretrained_ckpt: Checkpoint restored into the variables `vbs` returned
            by `iv2_mask_add.gen_box` when the supervisor initialises the
            session.
        logdir: Directory handed to `tf.train.Supervisor` for checkpoints and
            summaries.
        max_steps: Number of loop iterations (batches) to run.
    """
    img = tf.placeholder(
        shape=[config.batch_size, config.image_size, config.image_size, 3],
        dtype=tf.float32)
    # Total anchor count over the three feature-map levels.
    anchors_num = sum(
        config.Config['feature_maps'][s]**2 * config.Config['aspect_num'][s]
        for s in range(3))

    input_loc_t = tf.placeholder(
        shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    input_conf_t = tf.placeholder(
        shape=[config.batch_size, anchors_num], dtype=tf.float32)
    input_gt_mask = tf.placeholder(
        shape=[
            config.batch_size, config.mask_pool_shape * 2,
            config.mask_pool_shape * 2, 100
        ],
        dtype=tf.int32)
    input_gt_box = tf.placeholder(
        shape=[config.batch_size, 100, 4], dtype=tf.float32)
    input_mask_index = tf.placeholder(
        shape=[config.batch_size, anchors_num], dtype=tf.int32)

    # Move the last (size-100) axis of the mask placeholder to position 1.
    input_gt_mask_trans = tf.transpose(input_gt_mask, [0, 3, 1, 2])

    pred_loc, pred_confs, mask_fp, vbs = iv2_mask_add.gen_box(img, config)
    target_mask = mask_model.get_target_mask(
        input_gt_box, input_gt_mask_trans, input_mask_index, config)
    train_tensors = mask_model.get_loss(
        input_conf_t, input_loc_t, pred_loc, pred_confs, target_mask, mask_fp,
        config)

    global_step = get_or_create_global_step()
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=100000,
        decay_rate=0.7,
        staircase=True)
    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    # Feed the decayed rate to the optimizer. Previously the constant 0.001
    # was passed here, so the schedule above was logged but never applied.
    optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)

    # Only the variables in `vbs` are restored from the pretrained checkpoint.
    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, pretrained_ckpt)

    # summary_op=None: summaries are computed manually in the loop below.
    sv = tf.train.Supervisor(logdir=logdir, summary_op=None, init_fn=restore)

    with sv.managed_session() as sess:
        # The loop counter is deliberately unused: progress is tracked by the
        # global step fetched from the session, not by the Python counter.
        for _ in range(max_steps):
            (data_images, data_true_box, data_true_label,
             data_true_mask) = data_loader.q.get()

            data_loct, data_conft, data_mask_index = (
                np_utils.get_loc_conf_mask(
                    data_true_box,
                    data_true_label,
                    batch_size=config.batch_size,
                    cfg=config.Config))
            feed_dict = {
                img: data_images,
                input_loc_t: data_loct,
                input_conf_t: data_conft,
                input_gt_mask: data_true_mask,
                input_gt_box: data_true_box,
                input_mask_index: data_mask_index
            }

            t = time.time()
            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)
            print(ls)

            if step % 10 == 0:
                tt = time.time() - t
                print(data_true_label)
                print(' '.join([
                    'step:' + str(step),
                    'class_loss:' + str(ls[0]),
                    'loc_loss:' + str(ls[1]),
                    'mask_loss:' + str(ls[2]),
                    'timestp:' + str(tt),
                ]))
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
def begin(self):
    """Resolve the summary op and global step used by this object.

    When `self._replace_summary_op` is truthy, `self._summary_op` is
    overwritten with the result of `summary.merge_all()`. In every case
    `self._global_step` is set from `variables.get_or_create_global_step()`.
    """
    use_merged_summaries = self._replace_summary_op
    if use_merged_summaries:
        merged = summary.merge_all()
        self._summary_op = merged

    step_tensor = variables.get_or_create_global_step()
    self._global_step = step_tensor
def run():
    """Evaluate the checkpointed Inception-ResNet-v2 model on the validation split.

    Builds a fresh graph, restores variables from the module-level
    `checkpoint_file`, and runs `num_epochs` passes of streaming-accuracy
    evaluation. Summaries are written to `log_eval` every 10 steps. Finally
    up to ten images from one more fetched batch are plotted with their
    predicted and ground-truth label names.

    Reads the module-level names `log_eval`, `dataset_dir`, `batch_size`,
    `num_epochs` and `checkpoint_file`.
    """
    # Create log_dir for evaluation information (including missing parents).
    if not os.path.exists(log_eval):
        os.makedirs(log_eval)

    # Just construct the graph from scratch again
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        # Load one batch of validation images and labels tensors.
        # is_training=False is passed through to load_batch.
        dataset = get_split('validation', dataset_dir)
        images, raw_images, labels = load_batch(
            dataset, batch_size=batch_size, is_training=False)

        # Floor division keeps the step counts integral on Python 3 as well;
        # range() and the modulo checks below need ints.
        num_batches_per_epoch = dataset.num_samples // batch_size
        num_steps_per_epoch = num_batches_per_epoch

        # Now create the inference model but set is_training=False
        with slim.arg_scope(inception_resnet_v2_arg_scope()):
            logits, end_points = inception_resnet_v2(
                images, num_classes=dataset.num_classes, is_training=False)

        # Restore every variable slim reports as restorable.
        variables_to_restore = slim.get_variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        def restore_fn(sess):
            return saver.restore(sess, checkpoint_file)

        # Just define the metrics to track without the loss or whatsoever
        predictions = tf.argmax(end_points['Predictions'], 1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update)

        # There is no apply_gradients call during evaluation, so the global
        # step is advanced manually for monitoring.
        global_step = get_or_create_global_step()
        global_step_op = tf.assign(global_step, global_step + 1)

        def eval_step(sess, metrics_op, global_step):
            """Run one metrics update, log it, and return the accuracy value.

            The `global_step` argument is kept for interface compatibility
            but is not read; the enclosing `global_step_op` is what gets run.
            """
            start_time = time.time()
            _, global_step_count, accuracy_value = sess.run(
                [metrics_op, global_step_op, accuracy])
            time_elapsed = time.time() - start_time

            # Log some information
            logging.info(
                'Global Step %s: Streaming Accuracy: %.4f (%.2f sec/step)',
                global_step_count, accuracy_value, time_elapsed)

            return accuracy_value

        # Define some scalar quantities to monitor
        tf.summary.scalar('Validation_Accuracy', accuracy)
        my_summary_op = tf.summary.merge_all()

        # summary_op=None: summaries are computed manually in the loop below.
        sv = tf.train.Supervisor(
            logdir=log_eval, summary_op=None, saver=None, init_fn=restore_fn)

        # Now we are ready to run in one session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch * num_epochs):
                # Print vital information at the start of every epoch.
                if step % num_batches_per_epoch == 0:
                    logging.info('Epoch: %s/%s',
                                 step // num_batches_per_epoch + 1, num_epochs)
                    logging.info('Current Streaming Accuracy: %.4f',
                                 sess.run(accuracy))

                # Every step evaluates; every 10th step also writes summaries.
                eval_step(
                    sess, metrics_op=metrics_op, global_step=sv.global_step)
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # At the end of all the evaluation, show the final accuracy
            logging.info('Final Streaming Accuracy: %.4f', sess.run(accuracy))

            # Fetch images, labels and predictions together so they line up.
            # NOTE(review): this is a separate sess.run, so it presumably
            # pulls a new batch rather than the last one scored above -
            # confirm against load_batch.
            raw_images_value, labels_value, predictions_value = sess.run(
                [raw_images, labels, predictions])

            # Show at most ten samples; a batch smaller than ten used to
            # raise IndexError here.
            for i in range(min(10, len(raw_images_value))):
                image = raw_images_value[i]
                label = labels_value[i]
                prediction = predictions_value[i]
                prediction_name = dataset.labels_to_name[prediction]
                label_name = dataset.labels_to_name[label]
                text = 'Prediction: %s \n Ground Truth: %s' % (prediction_name,
                                                               label_name)
                img_plot = plt.imshow(image)

                # Set up the plot and hide axes
                plt.title(text)
                img_plot.axes.get_yaxis().set_ticks([])
                img_plot.axes.get_xaxis().set_ticks([])
                plt.show()

            logging.info(
                'Model evaluation has completed! Visit TensorBoard for more information regarding your evaluation.'
            )
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
    """Runs TF-Slim's Evaluation Loop.

    Repeatedly waits for a new checkpoint in `checkpoint_dir`, restores it
    into a managed session, and calls `evaluation` on it.

    Args:
      master: The BNS address of the TensorFlow master.
      checkpoint_dir: The directory where checkpoints are stored.
      logdir: The directory where the TensorFlow summaries are written to.
      num_evals: The number of times to run `eval_op`.
      eval_op: A operation run `num_evals` times.
      eval_op_feed_dict: The feed dictionary to use when executing the
        `eval_op`.
      final_op: An operation to execute after all of the `eval_op` executions.
      final_op_feed_dict: A feed dictionary to use when executing `final_op`.
      summary_op: The summary_op to evaluate after running TF-Slims metric
        ops. When left at its default, `logging_ops.merge_all_summaries()` is
        used.
      summary_op_feed_dict: An optional feed dictionary to use when running
        the `summary_op`.
      variables_to_restore: A list of TensorFlow variables to restore during
        evaluation. If the argument is `None` (or otherwise falsy) then
        `variables.get_variables_to_restore()` is used.
      eval_interval_secs: The minimum number of seconds between evaluations.
      max_number_of_evaluations: The max number of iterations of the
        evaluation. If the value is `None` (or otherwise falsy), the
        evaluation continues indefinitely.

    Returns:
      None. The result of each `evaluation` call is not propagated to the
      caller.
    """
    # Identity check: the sentinel must never be compared by value against a
    # tensor-like summary op.
    if summary_op is _USE_DEFAULT:
        summary_op = logging_ops.merge_all_summaries()

    global_step = variables.get_or_create_global_step()

    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())

    saver = tf_saver.Saver(
        variables_to_restore or variables.get_variables_to_restore())

    summary_writer = summary_io.SummaryWriter(logdir)

    # The writer is owned by this function, so close it on every exit path;
    # previously it was left open when the loop ended or raised.
    try:
        # The supervisor gets no summary op, writer or global step of its
        # own: summaries are written via `summary_writer` inside
        # `evaluation`.
        sv = supervisor.Supervisor(
            graph=ops.get_default_graph(),
            logdir=logdir,
            init_op=init_op,
            summary_op=None,
            summary_writer=None,
            global_step=None,
            saver=saver)

        last_checkpoint = None
        number_of_evaluations = 0
        while True:
            last_checkpoint = wait_for_new_checkpoint(checkpoint_dir,
                                                      last_checkpoint)
            start = time.time()
            logging.info('Starting evaluation at %s',
                         time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

            with sv.managed_session(
                    master, start_standard_services=False) as sess:
                sv.saver.restore(sess, last_checkpoint)
                sv.start_queue_runners(sess)
                evaluation(
                    sess,
                    num_evals=num_evals,
                    eval_op=eval_op,
                    eval_op_feed_dict=eval_op_feed_dict,
                    final_op=final_op,
                    final_op_feed_dict=final_op_feed_dict,
                    summary_op=summary_op,
                    summary_op_feed_dict=summary_op_feed_dict,
                    summary_writer=summary_writer,
                    global_step=global_step)

            logging.info('Finished evaluation at %s',
                         time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
            number_of_evaluations += 1
            if (max_number_of_evaluations and
                    number_of_evaluations >= max_number_of_evaluations):
                logging.info('Reached max_number_of_evaluations=%s. Exit',
                             max_number_of_evaluations)
                break

            # Sleep only for whatever remains of the interval after the time
            # this evaluation took.
            time_to_next_eval = start + eval_interval_secs - time.time()
            if time_to_next_eval > 0:
                time.sleep(time_to_next_eval)
    finally:
        summary_writer.close()