def testManagedEndOfInputOneQueue(self):
  # Tests that the supervisor finishes without an error when using
  # a fixed number of epochs, reading from a single queue.
  logdir = self._test_dir("managed_end_of_input_one_queue")
  os.makedirs(logdir)
  data_path = self._csv_data(logdir)
  with ops.Graph().as_default():
    # Create an input pipeline that reads the file 3 times.
    filename_queue = input_lib.string_input_producer(
        [data_path], num_epochs=3)
    reader = io_ops.TextLineReader()
    _, csv = reader.read(filename_queue)
    rec = parsing_ops.decode_csv(csv, record_defaults=[[1], [1], [1]])
    sv = supervisor.Supervisor(logdir=logdir)
    with sv.managed_session("") as sess:
      while not sv.should_stop():
        sess.run(rec)
def testManagedSessionUserError(self):
  logdir = self._test_dir("managed_user_error")
  with ops.Graph().as_default():
    my_op = constant_op.constant(1.0)
    sv = supervisor.Supervisor(logdir=logdir)
    last_step = None
    with self.assertRaisesRegex(RuntimeError, "failing here"):
      with sv.managed_session("") as sess:
        for step in range(10):
          last_step = step
          if step == 1:
            raise RuntimeError("failing here")
          else:
            self.evaluate(my_op)
    # Supervisor has been stopped.
    self.assertTrue(sv.should_stop())
    self.assertEqual(1, last_step)
def testManagedSessionIgnoreOutOfRangeError(self):
  logdir = self._test_dir("managed_out_of_range")
  with ops.Graph().as_default():
    my_op = constant_op.constant(1.0)
    sv = supervisor.Supervisor(logdir=logdir)
    last_step = None
    with sv.managed_session("") as sess:
      for step in range(10):
        last_step = step
        if step == 3:
          raise errors_impl.OutOfRangeError(my_op.op.node_def, my_op.op,
                                            "all done")
        else:
          sess.run(my_op)
    # Supervisor has been stopped.  The OutOfRangeError was swallowed by
    # managed_session instead of being re-raised.
    self.assertTrue(sv.should_stop())
    self.assertEqual(3, last_step)
def testPrepareSessionAfterStopForNonChief(self):
  logdir = self._test_dir("prepare_after_stop_nonchief")
  with ops.Graph().as_default():
    sv = supervisor.Supervisor(logdir=logdir, is_chief=False)

    # Create a first session and then stop.
    sess = sv.prepare_or_wait_for_session("")
    sv.stop()
    sess.close()
    self.assertTrue(sv.should_stop())

    # Now create a second session and check that we are no longer stopped
    # until we ask to stop again.
    sess2 = sv.prepare_or_wait_for_session("")
    self.assertFalse(sv.should_stop())
    sv.stop()
    sess2.close()
    self.assertTrue(sv.should_stop())
def testBasicTrainLoop(self):
  logdir = _test_dir("basic_train_loop")
  # Counts the number of calls to train_fn.
  num_calls = [0]

  def train_fn(unused_sess, sv, y, a):
    num_calls[0] += 1
    self.assertEqual("y", y)
    self.assertEqual("A", a)
    if num_calls[0] == 3:
      sv.request_stop()

  with ops.Graph().as_default():
    sv = supervisor.Supervisor(logdir=logdir)
    basic_loops.basic_train_loop(
        sv, train_fn, args=(sv, "y"), kwargs={"a": "A"})
    self.assertEqual(3, num_calls[0])
def testLocalInitOpForNonChief(self):
  logdir = self._test_dir("default_local_init_op_non_chief")
  with ops.Graph().as_default():
    with ops.device("/job:localhost"):
      # A local variable.
      v = variables.Variable(
          [1.0, 2.0, 3.0],
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES])
      # This shouldn't add a variable to the VARIABLES collection responsible
      # for variables that are saved/restored from checkpoints.
      self.assertEqual(len(variables.global_variables()), 0)
    # Suppress normal variable inits to make sure the local one is
    # initialized via local_init_op.
    sv = supervisor.Supervisor(logdir=logdir, init_op=None, is_chief=False)
    sess = sv.prepare_or_wait_for_session("")
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
    sv.stop()
def testTFRecordReader(self):
  with self.cached_session():
    self._tfrecord_paths = test_utils.create_tfrecord_files(
        tempfile.mkdtemp(), num_files=3)

  key, value = parallel_reader.parallel_read(
      self._tfrecord_paths,
      reader_class=io_ops.TFRecordReader,
      num_readers=3)

  sv = supervisor.Supervisor(logdir=tempfile.mkdtemp())
  with sv.prepare_or_wait_for_session() as sess:
    sv.start_queue_runners(sess)

    flowers = 0
    num_reads = 100
    for _ in range(num_reads):
      current_key, _ = sess.run([key, value])
      if 'flowers' in str(current_key):
        flowers += 1
    self.assertGreater(flowers, 0)
    self.assertEqual(flowers, num_reads)
def testTFRecordReader(self):
  with self.test_session():
    self._tfrecord_paths = test_utils.create_tfrecord_files(
        self.get_temp_dir(), num_files=3)

  key, value = parallel_reader.parallel_read(
      self._tfrecord_paths,
      reader_class=io_ops.TFRecordReader,
      num_readers=3)

  sv = supervisor.Supervisor(logdir=self.get_temp_dir())
  with sv.managed_session() as sess:
    flowers = 0
    num_reads = 100
    for _ in range(num_reads):
      current_key, _ = sess.run([key, value])
      if 'flowers' in str(current_key):
        flowers += 1
    self.assertGreater(flowers, 0)
    self.assertEqual(flowers, num_reads)
def get_session(is_chief):
  g = ops.Graph()
  with g.as_default():
    with ops.device("/job:local"):
      v = variables.Variable(
          1.0, name="ready_for_local_init_op_restore_v_" + str(uid))
      vadd = v.assign_add(1)
      w = variables.Variable(
          v,
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES],
          name="ready_for_local_init_op_restore_w_" + str(uid))
      ready_for_local_init_op = variables.report_uninitialized_variables(
          variables.global_variables())
  sv = supervisor.Supervisor(
      logdir=logdir,
      is_chief=is_chief,
      graph=g,
      recovery_wait_secs=1,
      ready_for_local_init_op=ready_for_local_init_op)
  sess = sv.prepare_or_wait_for_session(server.target)
  return sv, sess, v, vadd, w
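A minimal sketch of how a test might drive the helper above: the chief initializes the global variable, and the non-chief only gets a session back once ready_for_local_init_op reports no uninitialized globals, so its local variable (initialized from v) is safe to build. The exact sequencing here is an assumption, not taken from the original test.

# Hypothetical usage of get_session (sketch; not the original test body).
sv_chief, sess_chief, v, vadd, _ = get_session(is_chief=True)
sess_chief.run(vadd)  # The shared server now holds v == 2.0.

sv_worker, sess_worker, _, _, w = get_session(is_chief=False)
# prepare_or_wait_for_session only returned after ready_for_local_init_op
# reported no uninitialized global variables, so the local variable w could
# be initialized from v by the Supervisor's local_init_op.
print(sess_worker.run(w))

sv_worker.stop()
sv_chief.stop()
sess_worker.close()
sess_chief.close()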
def testBasicTrainLoopRetryOnAborted(self):
  logdir = _test_dir("basic_train_loop_exception_aborts")

  class AbortAndRetry:

    def __init__(self):
      self.num_calls = 0
      self.retries_left = 2

    def train_fn(self, unused_sess):
      self.num_calls += 1
      if self.num_calls % 3 == 2:
        self.retries_left -= 1
        if self.retries_left > 0:
          raise errors_impl.AbortedError(None, None, "Aborted here")
        else:
          raise RuntimeError("Failed Again")

  with ops.Graph().as_default():
    sv = supervisor.Supervisor(logdir=logdir)
    aar = AbortAndRetry()
    with self.assertRaisesRegex(RuntimeError, "Failed Again"):
      basic_loops.basic_train_loop(sv, aar.train_fn)
    self.assertEqual(0, aar.retries_left)
def _verify_all_data_sources_read(self, shared_queue):
  with self.cached_session():
    tfrecord_paths = test_utils.create_tfrecord_files(
        self.get_temp_dir(), num_files=3)

  num_readers = len(tfrecord_paths)
  p_reader = parallel_reader.ParallelReader(
      io_ops.TFRecordReader, shared_queue, num_readers=num_readers)

  data_files = parallel_reader.get_data_files(tfrecord_paths)
  filename_queue = input_lib.string_input_producer(data_files)
  key, value = p_reader.read(filename_queue)

  count0 = 0
  count1 = 0
  count2 = 0
  num_reads = 50

  sv = supervisor.Supervisor(logdir=self.get_temp_dir())
  with sv.prepare_or_wait_for_session() as sess:
    sv.start_queue_runners(sess)

    for _ in range(num_reads):
      current_key, _ = sess.run([key, value])
      if '0-of-3' in str(current_key):
        count0 += 1
      if '1-of-3' in str(current_key):
        count1 += 1
      if '2-of-3' in str(current_key):
        count2 += 1

    self.assertGreater(count0, 0)
    self.assertGreater(count1, 0)
    self.assertGreater(count2, 0)
    self.assertEqual(count0 + count1 + count2, num_reads)
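The shared_queue argument is built by the callers of this helper. ParallelReader enqueues (key, value) string pairs, so the queue needs two string components. The test-method names, capacities, and the dtypes_lib alias below are assumptions for illustration, not copied from the original tests.

# Hypothetical callers of _verify_all_data_sources_read (sketch).
def testReadViaFIFOSharedQueue(self):
  shared_queue = data_flow_ops.FIFOQueue(
      capacity=256, dtypes=[dtypes_lib.string, dtypes_lib.string])
  self._verify_all_data_sources_read(shared_queue)

def testReadViaRandomShuffleSharedQueue(self):
  shared_queue = data_flow_ops.RandomShuffleQueue(
      capacity=256,
      min_after_dequeue=128,
      dtypes=[dtypes_lib.string, dtypes_lib.string])
  self._verify_all_data_sources_read(shared_queue)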
def testManagedMainErrorTwoQueues(self):
  # Tests that the supervisor correctly raises a main loop
  # error even when using multiple queues for input.
  logdir = self._test_dir("managed_main_error_two_queues")
  os.makedirs(logdir)
  data_path = self._csv_data(logdir)
  with self.assertRaisesRegex(RuntimeError, "fail at step 3"):
    with ops.Graph().as_default():
      # Create an input pipeline that reads the file 3 times.
      filename_queue = input_lib.string_input_producer(
          [data_path], num_epochs=3)
      reader = io_ops.TextLineReader()
      _, csv = reader.read(filename_queue)
      rec = parsing_ops.decode_csv(csv, record_defaults=[[1], [1], [1]])
      shuff_rec = input_lib.shuffle_batch(rec, 1, 6, 4)
      sv = supervisor.Supervisor(logdir=logdir)
      with sv.managed_session("") as sess:
        for step in range(9):
          if sv.should_stop():
            break
          elif step == 3:
            raise RuntimeError("fail at step 3")
          else:
            sess.run(shuff_rec)
def testLocalInitOp(self):
  logdir = self._test_dir("default_local_init_op")
  with ops.Graph().as_default():
    # A local variable.
    v = variables.Variable(
        [1.0, 2.0, 3.0],
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES])

    # An entity which is initialized through a TABLE_INITIALIZER.
    w = variables.Variable([4, 5, 6], trainable=False, collections=[])
    ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, w.initializer)

    # This shouldn't add a variable to the VARIABLES collection responsible
    # for variables that are saved/restored from checkpoints.
    self.assertEqual(len(variables.global_variables()), 0)

    # Suppress normal variable inits to make sure the local one is
    # initialized via local_init_op.
    sv = supervisor.Supervisor(logdir=logdir, init_op=None)
    sess = sv.prepare_or_wait_for_session("")
    self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
    self.assertAllClose([4, 5, 6], sess.run(w))
    sv.stop()
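For reference, when local_init_op is left at its default, the Supervisor builds something roughly equivalent to the explicit group that the training loops later in this file construct themselves. This is a sketch of that default, not the Supervisor's exact internal code.

# Approximate default local_init_op (sketch): initialize local variables and
# run any registered table initializers, as a single grouped op.
local_init_op = control_flow_ops.group(
    tf_variables.local_variables_initializer(),
    data_flow_ops.tables_initializer())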
def do_training(train_op, init_fn=None, summary_op=None, lr=None): global savers graph = ops.get_default_graph() with graph.as_default(): global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(max_to_keep=0) with ops.name_scope('init_ops'): init_op = tf_variables.global_variables_initializer() ready_op = tf_variables.report_uninitialized_variables() local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), data_flow_ops.tables_initializer()) summary_writer = supervisor.Supervisor.USE_DEFAULT with ops.name_scope('train_step'): train_step_kwargs = {} if not FLAGS.max_number_of_steps is None: should_stop_op = math_ops.greater_equal( global_step, FLAGS.max_number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op if FLAGS.log_every_n_steps > 0: train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, FLAGS.log_every_n_steps), 0) prefix = "loc/net" lp = len(prefix) vdic = { "InceptionV2" + v.op.name[lp:]: v for v in tf.trainable_variables() if v.name.startswith(prefix) and v.name.find("Logits/") < 0 } _saver = tf_saver.Saver(vdic) savers.append(_saver) for i in xrange(NUM_STN): prefix = "stn%d/net" % i lp = len(prefix) vdic = { "InceptionV2" + v.op.name[lp:]: v for v in tf.trainable_variables() if v.name.startswith(prefix) and v.name.find("Logits/") < 0 } # saver = tf.train.Saver(vdic) _saver = tf_saver.Saver(vdic) savers.append(_saver) prt("savers %d" % len(savers)) is_chief = True logdir = FLAGS.train_dir sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=None, local_init_op=local_init_op, ready_for_local_init_op=None, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=FLAGS.save_summaries_secs, save_model_secs=FLAGS.save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer with sv.managed_session('', start_standard_services=False, config=None) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') try: while not sv.should_stop(): total_loss, global_step_value, should_stop = train_step( sess, train_op, global_step, lr, train_step_kwargs) current_epoch = int( math.ceil(float(global_step_value) / FLAGS.steps_in_epoch)) if global_step_value > 0 and global_step_value % FLAGS.save_every_n_steps == 0: sv.saver.save(sess, sv.save_path, global_step=sv.global_step) if should_stop: logging.info('Stopping Training.') break except errors.OutOfRangeError: # OutOfRangeError is thrown when epoch limit per # tf.train.limit_epochs is reached. logging.info('Caught OutOfRangeError. Stopping Training.') if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
def _train_internal(graph, output_dir, train_op, loss_op, global_step_tensor, init_op, init_feed_dict, init_fn, log_every_steps, supervisor_is_chief, supervisor_master, supervisor_save_model_secs, keep_checkpoint_max, supervisor_save_summaries_steps, feed_fn, steps, fail_on_nan_loss, monitors, max_steps): """See train.""" if (steps is not None) and (max_steps is not None): raise ValueError('Can not provide both steps and max_steps.') if not output_dir: raise ValueError('Output directory should be non-empty %s.' % output_dir) if train_op is None: raise ValueError('Missing train_op.') if loss_op is None: raise ValueError('Missing loss_op.') with graph.as_default(): global_step_tensor = contrib_variables.assert_or_get_global_step( graph, global_step_tensor) if global_step_tensor is None: raise ValueError('No "global_step" was provided or found in the graph.') # Get current step. try: start_step = load_variable(output_dir, global_step_tensor.name) except (errors.NotFoundError, ValueError): start_step = 0 summary_writer = (get_summary_writer(output_dir) if supervisor_is_chief else None) # Add default chief monitors if none were provided. if not monitors: monitors = monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), save_summary_steps=supervisor_save_summaries_steps, summary_writer=summary_writer) if supervisor_is_chief else [] # TODO(ipolosukhin): Replace all functionality of Supervisor # with Chief-Exclusive Monitors. if not supervisor_is_chief: # Prune list of monitor to the ones runnable on all workers. monitors = [monitor for monitor in monitors if monitor.run_on_all_workers] if max_steps is None: max_steps = (start_step + steps) if steps else None # Start monitors, can create graph parts. for monitor in monitors: monitor.begin(max_steps=max_steps) supervisor = tf_supervisor.Supervisor( graph, init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT, init_feed_dict=init_feed_dict, is_chief=supervisor_is_chief, logdir=output_dir, saver=_make_saver(graph, keep_checkpoint_max), global_step=global_step_tensor, summary_op=None, summary_writer=summary_writer, save_model_secs=supervisor_save_model_secs, init_fn=init_fn) session = supervisor.PrepareSession(master=supervisor_master, start_standard_services=True) supervisor.StartQueueRunners(session) with session: get_current_step = lambda: session.run(global_step_tensor) start_step = get_current_step() last_step = start_step last_log_step = start_step loss_value = None logging.info('Training steps [%d,%s)', last_step, 'inf' if max_steps is None else str(max_steps)) excinfo = None try: while not supervisor.ShouldStop() and ( (max_steps is None) or (last_step < max_steps)): start_time = time.time() feed_dict = feed_fn() if feed_fn is not None else None outputs, should_stop = _run_with_monitors( session, last_step + 1, [train_op, loss_op], feed_dict, monitors) loss_value = outputs[loss_op.name] if np.isnan(loss_value): failure_message = 'Model diverged with loss = NaN.' 
if fail_on_nan_loss: logging.error(failure_message) raise monitors_lib.NanLossDuringTrainingError() else: logging.warning(failure_message) if should_stop: break this_step = get_current_step() if this_step <= last_step: logging.error( 'Global step was not incremented by train op at step %s' ': new step %d', last_step, this_step) last_step = this_step is_last_step = (max_steps is not None) and (last_step >= max_steps) if is_last_step or (last_step - last_log_step >= log_every_steps): logging.info( 'training step %d, loss = %.5f (%.3f sec/batch).', last_step, loss_value, float(time.time() - start_time)) last_log_step = last_step except errors.OutOfRangeError as e: logging.warn('Got exception during tf.learn training loop possibly ' 'due to exhausted input queue %s.', e) except StopIteration: logging.info('Exhausted input iterarator.') except BaseException as e: # pylint: disable=broad-except # Hold on to any other exceptions while we try recording a final # checkpoint and summary. excinfo = sys.exc_info() finally: try: # Call supervisor.Stop() from within a try block because it re-raises # exceptions thrown by the supervised threads. supervisor.Stop(close_summary_writer=False) # Save one last checkpoint and summaries # TODO(wicke): This should be handled by Supervisor # In case we encountered an exception in the try block before we updated # last_step, update it here (again). last_step = get_current_step() if supervisor_is_chief: ckpt_path = supervisor.save_path logging.info('Saving checkpoint for step %d to checkpoint: %s.', last_step, ckpt_path) supervisor.saver.save(session, ckpt_path, global_step=last_step) # Finish monitors. for monitor in monitors: monitor.end() # catch OutOfRangeError which is thrown when queue is out of data (and for # other reasons as well). except errors.OutOfRangeError as e: logging.warn('OutOfRangeError in tf.learn final checkpoint possibly ' 'due to exhausted input queue. Note: summary_op is not ' 'expected to trigger dequeues. %s.', e) except BaseException as e: # pylint: disable=broad-except # If we don't already have an exception to re-raise, raise this one. if not excinfo: raise # Otherwise, log this one and raise the other in the finally block. logging.error('Got exception during tf.learn final checkpoint %s.', e) finally: if excinfo: reraise(*excinfo) return loss_value
def main(_): if not FLAGS.dataset_dir: raise ValueError( 'You must supply the dataset directory with --dataset_dir') if not os.path.isfile(FLAGS.checkpoint_path): FLAGS.eval_dir = os.path.join(FLAGS.checkpoint_path, 'eval') else: FLAGS.eval_dir = os.path.join(os.path.dirname(FLAGS.checkpoint_path), 'eval') try: os.makedirs(FLAGS.eval_dir) except OSError: pass tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): tf_global_step = slim.get_or_create_global_step() ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir.split(','), FLAGS.dataset_list_dir, num_samples=FLAGS.frames_per_video, modality=FLAGS.modality, split_id=FLAGS.split_id) #################### # Select the model # #################### network_fn = nets_factory.get_network_fn( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), batch_size=FLAGS.batch_size, is_training=False) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## provider = dataset_data_provider.DatasetDataProvider( dataset, shuffle=FLAGS.force_random_shuffle, common_queue_capacity=2 * FLAGS.batch_size, common_queue_min=FLAGS.batch_size, bgr_flips=FLAGS.bgr_flip) [image, label] = provider.get(['image', 'label']) label = tf.cast(tf.string_to_number(label, tf.int32), tf.int64) label.set_shape(()) label -= FLAGS.labels_offset ##################################### # Select the preprocessing function # ##################################### preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size image = image_preprocessing_fn(image, eval_image_size, eval_image_size, model_name=FLAGS.model_name, ncrops=FLAGS.ncrops, out_dim_scale=FLAGS.out_dim_scale) images, labels = tf.train.batch( [image, label], batch_size=FLAGS.batch_size, num_threads=1 if FLAGS.store_feat is not None else FLAGS.num_preprocessing_threads, capacity=5 * FLAGS.batch_size) #################### # Define the model # #################### kwargs = {} if FLAGS.conv_endpoint is not None: kwargs['conv_endpoint'] = FLAGS.conv_endpoint logits, end_points = network_fn( images, pool_type=FLAGS.pooling, classifier_type=FLAGS.classifier_type, num_channels_stream=provider.num_channels_stream, netvlad_centers=FLAGS.netvlad_initCenters.split(','), stream_pool_type=FLAGS.stream_pool_type, **kwargs) end_points['images'] = images end_points['labels'] = labels if FLAGS.moving_average_decay: variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() # print(dir(variables_to_restore)) print(type(variables_to_restore)) ignore_variables = [ 'stream0/vgg_16/fc8/weights:0', 'stream0/vgg_16/fc8/biases:0', ] new_variables_to_restore = [] for var in variables_to_restore: if (var.name not in ignore_variables): new_variables_to_restore.append(var) variables_to_restore = new_variables_to_restore for var in variables_to_restore: print(var.name) predictions = tf.argmax(logits, 1) # rgirdhar: Because of the following, 
can't use with batch_size=1 if FLAGS.batch_size > 1: labels = tf.squeeze(labels) # Define the metrics: names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 'Accuracy': slim.metrics.streaming_accuracy(predictions, labels), 'Recall@5': slim.metrics.streaming_recall_at_k(logits, labels, 5), }) # Print the summaries to screen. for name, value in names_to_values.iteritems(): summary_name = 'eval/%s' % name op = tf.scalar_summary(summary_name, value, collections=[]) op = tf.Print(op, [value], summary_name) tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) # TODO(sguada) use num_epochs=1 if FLAGS.max_num_batches: num_batches = FLAGS.max_num_batches else: # This ensures that we make a single pass over all of the data. num_batches = int( math.ceil(dataset.num_samples / float(FLAGS.batch_size))) if tf.gfile.IsDirectory(FLAGS.checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) else: checkpoint_path = FLAGS.checkpoint_path tf.logging.info('Evaluating %s' % checkpoint_path) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True if FLAGS.store_feat is not None: assert (FLAGS.store_feat_path is not None) from tensorflow.python.training import supervisor from tensorflow.python.framework import ops import h5py saver = tf.train.Saver(variables_to_restore) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=None, summary_op=None, summary_writer=None, global_step=None, saver=None) ept_names_to_store = FLAGS.store_feat.split(',') try: ept_to_store = [end_points[el] for el in ept_names_to_store] except: logging.error('Endpoint not found') logging.error('Choose from %s' % ','.join(end_points.keys())) raise KeyError() res = dict([(epname, []) for epname in ept_names_to_store]) with sv.managed_session(FLAGS.master, start_standard_services=False, config=config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) for j in range(num_batches): if j % 10 == 0: logging.info('Doing batch %d/%d' % (j, num_batches)) feats = sess.run(ept_to_store) for eid, epname in enumerate(ept_names_to_store): res[epname].append(feats[eid]) logging.info('Writing out features to %s' % FLAGS.store_feat_path) with h5py.File(FLAGS.store_feat_path, 'w') as fout: for epname in res.keys(): fout.create_dataset( epname, data=np.concatenate(res[epname], axis=0), compression='gzip', compression_opts=FLAGS.feat_store_compression_opt) else: slim.evaluation.evaluate_once( master=FLAGS.master, checkpoint_path=checkpoint_path, logdir=FLAGS.eval_dir, num_evals=num_batches, eval_op=names_to_updates.values(), variables_to_restore=variables_to_restore, session_config=config)
def main(_): if not FLAGS.dataset_dir: raise ValueError('You must supply the dataset directory with --dataset_dir') times = {} tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): start = time.time() tf_global_step = slim.get_or_create_global_step() times['global_step'] = time.time() - start ###################### # Select the dataset # start = time.time() dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir, suffix=FLAGS.dataset_name_suffix) times['get_dataset'] = time.time() - start #################### # Select the model # #################### start = time.time() network_fn = nets_factory.get_network_fn( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), is_training=False) times['select_model'] = time.time() - start ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## start = time.time() provider = slim.dataset_data_provider.DatasetDataProvider( dataset, shuffle=False, common_queue_capacity=2 * FLAGS.batch_size, common_queue_min=FLAGS.batch_size) times['get_provider'] = time.time() - start start = time.time() [image] = provider.get(['image']) times['get_image'] = time.time() - start ##################################### # Select the preprocessing function # ##################################### start = time.time() preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) times['get_preprocessing'] = time.time() - start eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size start = time.time() image = image_preprocessing_fn(image, eval_image_size, eval_image_size) times['preprocessing'] = time.time() - start start = time.time() images = tf.train.batch( [image], batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=5 * FLAGS.batch_size) times['get_batch'] = time.time() - start start = time.time() tf.image_summary('test_images', images, FLAGS.batch_size) times['image_summary'] = time.time() - start #################### # Define the model # #################### start = time.time() logits, _ = network_fn(images) times['do_network'] = time.time() - start # with tf.variable_scope('resnet_v2_152/block1/unit_1/bottleneck_v2/conv1', reuse=True): # weights = tf.get_variable('weights') # kernel_transposed = put_kernels_on_grid(weights) # scale weights to [0 1], type is still float # x_min = tf.reduce_min(weights) # x_max = tf.reduce_max(weights) # kernel_0_to_1 = (weights - x_min) / (x_max - x_min) # # # to tf.image_summary format [batch_size, height, width, channels] # kernel_transposed = tf.transpose(kernel_0_to_1, [3, 0, 1, 2]) # this will display random 3 filters from the 64 in conv1 # tf.image_summary('conv1/filters', kernel_transposed, max_images=50) if FLAGS.moving_average_decay: variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() if len(logits.get_shape()) == 4: logits = tf.reshape(logits, [int(logits.get_shape()[0]), -1]) softmax = tf.nn.softmax(logits) # predictions = tf.argmax(logits, 1) # Define the metrics: # names_to_values, names_to_updates = 
slim.metrics.aggregate_metric_map({ # 'Predictions': predictions, # 'Predictions': slim.metrics.streaming_accuracy(predictions, labels), # 'Predictions@5': slim.metrics.streaming_recall_at_k( # logits, labels, 5), # }) # Print the summaries to screen. # for name, value in names_to_values.iteritems(): # summary_name = 'eval/%s' % name # op = tf.scalar_summary(summary_name, value, collections=[]) # op = tf.Print(op, [value], summary_name) # tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) # TODO(sguada) use num_epochs=1 if FLAGS.max_num_batches: num_batches = FLAGS.max_num_batches else: # This ensures that we make a single pass over all of the data. num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size)) start = time.time() if tf.gfile.IsDirectory(FLAGS.checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) else: checkpoint_path = FLAGS.checkpoint_path times['load_checkpoint'] = time.time() - start tf.logging.info('Evaluating %s' % checkpoint_path) # evaluate_loop from tensorflow.contrib.framework.python.ops import variables from tensorflow.core.protobuf import saver_pb2 from tensorflow.python.training import saver as tf_saver from tensorflow.python.framework import ops from tensorflow.python.training import supervisor saver = tf_saver.Saver( variables_to_restore or variables.get_variables_to_restore(), write_version=saver_pb2.SaverDef.V1) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=FLAGS.eval_dir, summary_op=None, summary_writer=None, global_step=None, saver=None) # init = tf.initialize_all_variables() # sess = tf.Session() with sv.managed_session(FLAGS.master, start_standard_services=False) as sess: # sess.run(init) saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) start = time.time() final_op_value = sess.run(logits) # final_op_value = slim.evaluation.evaluate_once( # master=FLAGS.master, # checkpoint_path=checkpoint_path, # logdir=FLAGS.eval_dir, # num_evals=num_batches, # final_op=[softmax, logits], # # eval_op=names_to_updates.values(), # variables_to_restore=variables_to_restore) times['exec'] = time.time() - start print(final_op_value[1].shape) result_predict = np.reshape(final_op_value[1], (FLAGS.batch_size, final_op_value[1].shape[-1])) # print(final_op_value) print(result_predict) print(np.argsort(result_predict[:, 1])[-5:]) print(times)
def testStandardServicesWithGlobalStep(self):
  logdir = self._test_dir("standard_services_with_global_step")
  # Create a checkpoint.
  with ops.Graph().as_default():
    v = variables.VariableV1([123], name="global_step")
    sv = supervisor.Supervisor(logdir=logdir)
    meta_graph_def = meta_graph.create_meta_graph_def(
        saver_def=sv.saver.saver_def)
    sess = sv.prepare_or_wait_for_session("")
    # This is where the checkpoint will appear, with step number 123.
    save_path = "%s-123" % sv.save_path
    self._wait_for_glob(save_path, 3.0)
    self._wait_for_glob(
        os.path.join(logdir, "*events*"), 3.0, for_checkpoint=False)
    # Wait to make sure everything is written to file before stopping.
    time.sleep(1)
    sv.stop()

  # There should be an event file with a version number.
  rr = _summary_iterator(logdir)
  ev = next(rr)
  self.assertEqual("brain.Event:2", ev.file_version)
  ev = next(rr)
  ev_graph = graph_pb2.GraphDef()
  ev_graph.ParseFromString(ev.graph_def)
  self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True), ev_graph)
  ev = next(rr)
  ev_meta_graph = meta_graph_pb2.MetaGraphDef()
  ev_meta_graph.ParseFromString(ev.meta_graph_def)
  self.assertProtoEquals(meta_graph_def, ev_meta_graph)
  self.assertProtoEquals(
      sess.graph.as_graph_def(add_shapes=True), ev_meta_graph.graph_def)
  ev = next(rr)
  # It is nondeterministic whether SessionLog.START gets written before
  # the summary or the checkpoint, but this works when run 10000 times.
  self.assertEqual(123, ev.step)
  self.assertEqual(event_pb2.SessionLog.START, ev.session_log.status)

  first = next(rr)
  second = next(rr)
  # It is nondeterministic whether the value gets written before the
  # checkpoint since they are on separate threads, so we check for both
  # orderings.
  if first.HasField("summary"):
    self.assertProtoEquals(
        """value { tag: 'global_step/sec' simple_value: 0.0 }""",
        first.summary)
    self.assertEqual(123, second.step)
    self.assertEqual(event_pb2.SessionLog.CHECKPOINT,
                     second.session_log.status)
  else:
    self.assertEqual(123, first.step)
    self.assertEqual(event_pb2.SessionLog.CHECKPOINT,
                     first.session_log.status)
    self.assertProtoEquals(
        """value { tag: 'global_step/sec' simple_value: 0.0 }""",
        second.summary)

  ev = next(rr)
  self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status)
  self.assertRaises(StopIteration, lambda: next(rr))

  # There should be a checkpoint file with the variable "global_step".
  with ops.Graph().as_default(), self.cached_session() as sess:
    v = variables.VariableV1([-12], name="global_step")
    sav = saver_lib.Saver([v])
    sav.restore(sess, save_path)
    self.assertEqual(123, self.evaluate(v)[0])
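The _summary_iterator helper used above is not shown in this file. A plausible definition is below; it assumes glob and tensorflow.python.summary's summary_iterator module are imported, and that the Supervisor wrote a single events file to the test directory.

# Plausible definition of _summary_iterator (sketch, assumptions noted above).
def _summary_iterator(test_dir):
  """Returns an iterator over the records of the events file in test_dir."""
  event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
  return summary_iterator.summary_iterator(event_paths[-1])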
def testNoQueueRunners(self):
  with ops.Graph().as_default(), self.cached_session() as sess:
    sv = supervisor.Supervisor(logdir=self._test_dir("no_queue_runners"))
    self.assertEqual(0, len(sv.start_queue_runners(sess)))
    sv.stop()
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: The max number of iterations of the evaluation.
      If the value is left as `None`, the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)
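A hypothetical call site for the loop above. The directories are placeholders, and names_to_updates stands for the metric update ops produced by slim.metrics.aggregate_metric_map elsewhere in this file.

# Hypothetical usage of evaluation_loop (sketch; paths and metric names are
# placeholders, not from the original code).
evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_dir',          # where training writes checkpoints
    logdir='/tmp/eval_dir',                   # where evaluation summaries go
    num_evals=100,                            # batches per evaluation pass
    eval_op=list(names_to_updates.values()),  # streaming metric update ops
    eval_interval_secs=60,
    max_number_of_evaluations=None)           # keep evaluating new checkpoints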
def _start_standard_services():
  with ops.Graph().as_default():
    sv = supervisor.Supervisor(is_chief=False)
    sess = sv.prepare_or_wait_for_session("")
    sv.start_standard_services(sess)
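Presumably the surrounding test checks that a non-chief Supervisor refuses to run standard services. The test-method name and the exact exception type below are assumptions.

# Hypothetical assertion around the helper above (sketch, not the original).
def testNonChiefCannotStartStandardServices(self):
  # Assumption: start_standard_services raises for a non-chief Supervisor.
  self.assertRaises(RuntimeError, _start_standard_services)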
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=None, init_fn=None, summary_op=_USE_DEFAULT, save_summaries_secs=600, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronous. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. train_step_fn: The function to call in order to execute a single gradient step. The function must have take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step and logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The BNS name of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.initialize_all_variables()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If None, then the session is initialized by calling `tf.initialize_local_variables()` and `tf.initialize_all_tables()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If none, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, or if `number_of_steps` is negative. """ if train_op is None: raise ValueError('train_op cannot be None.') if sync_optimizer and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.' 
) if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() if init_op == _USE_DEFAULT: init_op = tf_variables.initialize_all_variables() if summary_op == _USE_DEFAULT: summary_op = logging_ops.merge_all_summaries() cleanup_op = None if is_chief and sync_optimizer: if not isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer') # Need to create these BEFORE the supervisor finalizes the graph: with ops.control_dependencies([init_op]): init_tokens_op = sync_optimizer.get_init_tokens_op() init_op = init_tokens_op chief_queue_runner = sync_optimizer.get_chief_queue_runner() cleanup_op = sync_optimizer.get_clean_up_op() if train_step_kwargs == _USE_DEFAULT: train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal(global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, summary_op=summary_op, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) with sv.managed_session(master, start_standard_services=False) as sess: if is_chief: sv.start_standard_services(sess) elif not is_chief and startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) if is_chief and sync_optimizer: sv.start_queue_runners(sess, [chief_queue_runner]) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: break finally: if sv.is_chief and cleanup_op is not None: sess.run(cleanup_op) # This waits for service threads to finish. sv.Stop() if sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) return total_loss
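A hypothetical call site for the train() loop defined above. The input and model functions are placeholders, and the slim helper names (create_train_op, losses.softmax_cross_entropy) are assumptions from the same era of the library rather than part of this file.

# Hypothetical usage of train() (sketch; my_input_fn and my_model_fn are
# placeholder functions, not defined in this file).
def main(_):
  with tf.Graph().as_default():
    images, labels = my_input_fn()          # placeholder input pipeline
    predictions = my_model_fn(images)       # placeholder model
    loss = slim.losses.softmax_cross_entropy(predictions, labels)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    # create_train_op returns a tensor that applies gradients and yields the
    # loss value, matching the train_op contract in the docstring above.
    train_op = slim.learning.create_train_op(loss, optimizer)
    final_loss = train(
        train_op,
        logdir='/tmp/train_logs',
        number_of_steps=1000,
        save_summaries_secs=300,
        save_interval_secs=600)
    print('Finished training. Final loss: %f' % final_loss)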
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: A operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slims metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as 'None', the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=saver) number_of_evaluations = 0 for checkpoint_path in checkpoints_iterator(checkpoint_dir, eval_interval_secs, timeout): logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: sv.saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation( sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) number_of_evaluations += 1 if (max_number_of_evaluations and number_of_evaluations >= max_number_of_evaluations): logging.info('Reached max_number_of_evaluations=%s. Exit', max_number_of_evaluations) return final_op_value logging.info( 'Timed-out waiting for new checkpoint file. Exiting evaluation loop.') return final_op_value
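A hypothetical call site for this newer evaluation_loop, showing the extra session_config and timeout knobs it accepts; paths and the metric dictionaries are placeholders.

# Hypothetical usage of the newer evaluation_loop (sketch; placeholders only).
config = tf.ConfigProto(allow_soft_placement=True)
final_metric_values = evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_dir',
    logdir='/tmp/eval_dir',
    num_evals=50,
    eval_op=list(names_to_updates.values()),
    final_op=list(names_to_values.values()),  # value returned by the loop
    eval_interval_secs=600,
    session_config=config,
    timeout=3600)                             # stop waiting after an idle hour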
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, trace_every_n_steps=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronous. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must have take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step and logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.tables_initializer()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. 
trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.' ) if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = tf_variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = tf_variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), data_flow_ops.tables_initializer()) if sync_optimizer is not None and isinstance( sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): with ops.control_dependencies( [local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = sync_optimizer.chief_init_op else: local_init_op = sync_optimizer.local_step_init_op ready_for_local_init_op = sync_optimizer.ready_for_local_init_op else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT if is_chief and sync_optimizer is not None: if not isinstance(sync_optimizer, (sync_replicas_optimizer.SyncReplicasOptimizer)): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.' 
) # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = sync_optimizer.get_init_tokens_op() chief_queue_runner = sync_optimizer.get_chief_queue_runner() if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal( global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer should_retry = True while should_retry: try: should_retry = False with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) sess.run(init_tokens_op) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') break except errors.OutOfRangeError: # OutOfRangeError is thrown when epoch limit per # tf.train.limit_epochs is reached. logging.info('Caught OutOfRangeError. Stopping Training.') if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
def train(graph, output_dir, train_op, loss_op, global_step_tensor=None, init_op=None, init_feed_dict=None, init_fn=None, log_every_steps=10, supervisor_is_chief=True, supervisor_master='', supervisor_save_model_secs=600, supervisor_save_summaries_steps=100, feed_fn=None, steps=None, fail_on_nan_loss=True, monitors=None): """Train a model. Given `graph`, a directory to write outputs to (`output_dir`), and some ops, run a training loop. The given `train_op` performs one step of training on the model. The `loss_op` represents the objective function of the training. It is expected to increment the `global_step_tensor`, a scalar integer tensor counting training steps. This function uses `Supervisor` to initialize the graph (from a checkpoint if one is available in `output_dir`), write summaries defined in the graph, and write regular checkpoints as defined by `supervisor_save_model_secs`. Training continues until `global_step_tensor` evaluates to `max_steps`, or, if `fail_on_nan_loss`, until `loss_op` evaluates to `NaN`. In that case the program is terminated with exit code 1. Args: graph: A graph to train. It is expected that this graph is not in use elsewhere. output_dir: A directory to write outputs to. train_op: An op that performs one training step when run. loss_op: A scalar loss tensor. global_step_tensor: A tensor representing the global step. If none is given, one is extracted from the graph using the same logic as in `Supervisor`. init_op: An op that initializes the graph. If `None`, use `Supervisor`'s default. init_feed_dict: A dictionary that maps `Tensor` objects to feed values. This feed dictionary will be used when `init_op` is evaluated. init_fn: Optional callable passed to Supervisor to initialize the model. log_every_steps: Output logs regularly. The logs contain timing data and the current loss. supervisor_is_chief: Whether the current process is the chief supervisor in charge of restoring the model and running standard services. supervisor_master: The master string to use when preparing the session. supervisor_save_model_secs: Save a checkpoint every `supervisor_save_model_secs` seconds when training. supervisor_save_summaries_steps: Save summaries every `supervisor_save_summaries_steps` seconds when training. feed_fn: A function that is called every iteration to produce a `feed_dict` passed to `session.run` calls. Optional. steps: Trains for this many steps (e.g. current global step + `steps`). fail_on_nan_loss: If true, raise `NanLossDuringTrainingError` if `loss_op` evaluates to `NaN`. If false, continue training as if nothing happened. monitors: List of `BaseMonitor` subclass instances. Used for callbacks inside the training loop. Returns: The final loss value. Raises: ValueError: If `global_step_tensor` is not provided. See `tf.contrib.framework.get_global_step` for how we look it up if not provided explicitly. NanLossDuringTrainingError: If `fail_on_nan_loss` is `True`, and loss ever evaluates to `NaN`. """ if not output_dir: raise ValueError('Output directory should be non-empty.') with graph.as_default(): global_step_tensor = contrib_variables.assert_or_get_global_step( graph, global_step_tensor) if global_step_tensor is None: raise ValueError( 'No "global_step" was provided or found in the graph.') # Get current step. 
try: start_step = checkpoints.load_variable(output_dir, global_step_tensor.name) except (errors.NotFoundError, ValueError): start_step = 0 summary_writer = (get_summary_writer(output_dir) if supervisor_is_chief else None) # TODO(ipolosukhin): Replace all functionality of Supervisor with Monitors. if not supervisor_is_chief: # monitors should run only on the chief. monitors = [] elif not monitors: monitors = monitors_lib.get_default_monitors( loss_op=loss_op, summary_op=logging_ops.get_summary_op(), save_summary_steps=supervisor_save_summaries_steps, summary_writer=summary_writer) # Start monitors, can create graph parts. for monitor in monitors: monitor.begin(max_steps=start_step + steps) supervisor = tf_supervisor.Supervisor( graph, init_op=init_op or tf_supervisor.Supervisor.USE_DEFAULT, init_feed_dict=init_feed_dict, is_chief=supervisor_is_chief, logdir=output_dir, saver=_make_saver(graph), global_step=global_step_tensor, summary_op=None, summary_writer=summary_writer, save_model_secs=supervisor_save_model_secs, init_fn=init_fn) session = supervisor.PrepareSession(master=supervisor_master, start_standard_services=True) supervisor.StartQueueRunners(session) with session: get_current_step = lambda: session.run(global_step_tensor) start_step = get_current_step() max_steps = start_step + steps last_step = start_step last_log_step = start_step loss_value = None logging.info('Training steps [%d,%s)', last_step, 'inf' if max_steps is None else str(max_steps)) excinfo = None try: while not supervisor.ShouldStop() and ((max_steps is None) or (last_step < max_steps)): start_time = time.time() feed_dict = feed_fn() if feed_fn is not None else None outputs, should_stop = _run_with_monitors( session, last_step + 1, [train_op, loss_op], feed_dict, monitors) loss_value = outputs[loss_op.name] if np.isnan(loss_value): failure_message = 'Model diverged with loss = NaN.' if fail_on_nan_loss: logging.error(failure_message) raise NanLossDuringTrainingError() else: logging.warning(failure_message) if should_stop: break this_step = get_current_step() if this_step <= last_step: logging.error( 'Global step was not incremented by train op at step %s' ': new step %d', last_step, this_step) last_step = this_step is_last_step = (max_steps is not None) and (last_step >= max_steps) if is_last_step or (last_step - last_log_step >= log_every_steps): logging.info( 'training step %d, loss = %.5f (%.3f sec/batch).', last_step, loss_value, float(time.time() - start_time)) last_log_step = last_step except errors.OutOfRangeError as e: logging.warn( 'Got exception during tf.learn training loop possibly ' 'due to exhausted input queue %s.', e) except BaseException as e: # pylint: disable=broad-except # Hold on to any other exceptions while we try recording a final # checkpoint and summary. excinfo = sys.exc_info() finally: try: # Call supervisor.Stop() from within a try block because it re-raises # exceptions thrown by the supervised threads. supervisor.Stop(close_summary_writer=False) # Save one last checkpoint and summaries # TODO(wicke): This should be handled by Supervisor # In case we encountered an exception in the try block before we updated # last_step, update it here (again). last_step = get_current_step() if supervisor_is_chief: ckpt_path = supervisor.save_path logging.info( 'Saving checkpoint for step %d to checkpoint: %s.', last_step, ckpt_path) supervisor.saver.save(session, ckpt_path, global_step=last_step) # Finish monitors. 
for monitor in monitors: monitor.end() # catch OutOfRangeError which is thrown when queue is out of data (and for # other reasons as well). except errors.OutOfRangeError as e: logging.warn( 'OutOfRangeError in tf.learn final checkpoint possibly ' 'due to exhausted input queue. Note: summary_op is not ' 'expected to trigger dequeues. %s.', e) except BaseException as e: # pylint: disable=broad-except # If we don't already have an exception to re-raise, raise this one. if not excinfo: raise # Otherwise, log this one and raise the other in the finally block. logging.error( 'Got exception during tf.learn final checkpoint %s.', e) finally: if excinfo: reraise(*excinfo) return loss_value
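For reference, a minimal sketch of how a train() wrapper with the signature above is typically driven, assuming TensorFlow 1.x and that `train` is importable from the surrounding module; the toy regression model, the output directory, and the step counts are illustrative assumptions only, not part of the code above.

import numpy as np
import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    global_step = tf.train.get_or_create_global_step()
    # Toy regression problem standing in for a real input pipeline.
    x = tf.constant(np.random.rand(32, 4), dtype=tf.float32)
    y = tf.constant(np.random.rand(32, 1), dtype=tf.float32)
    w = tf.get_variable('w', shape=[4, 1])
    loss_op = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))
    # minimize() increments the global step, which the training loop above
    # relies on to detect that the train op made progress.
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
        loss_op, global_step=global_step)

final_loss = train(graph,
                   output_dir='/tmp/train_example',  # illustrative path
                   train_op=train_op,
                   loss_op=loss_op,
                   steps=200,
                   log_every_steps=50)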
network_fn = nets_factory.get_network_fn(model_name) end_points = network_fn(img, is_training=False) print (end_points) task1 = tf.to_int32(tf.argmax(end_points['Logits'], 1)) training_accuracy1 = slim.metrics.accuracy(task1, tf.to_int32(lb)) variables_to_restore = slim.get_variables_to_restore() checkpoint_path = latest_checkpoint(train_dir) saver = Saver(variables_to_restore) config = ConfigProto() config.gpu_options.allow_growth=True sess = Session(config=config) sv = supervisor.Supervisor(logdir=checkpoint_path, summary_op=None, summary_writer=None, global_step=None, saver=None) correct = 0 predict = 0 with sv.managed_session(master='', start_standard_services=False, config=config) as sess: saver.restore(sess, checkpoint_path) optim_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) layer = {} name = ['conv1w','conv1b', 'conv2w','conv2b', 'conv3w','conv3b', 'conv4w','conv4b', 'conv5w','conv5b', 'conv6w','conv6b', 'conv7w','conv7b',
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, session_wrapper=None, trace_every_n_steps=None, batch_size=1, num_examples=None, config_summary_list=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronously. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function is called with the current session, a `train_op` `Tensor`, the global step `Tensor`, the `train_step_kwargs` dictionary, the batch size, the number of steps per epoch, and a logging function. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, at which the loss and global step are logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training, as measured by 'global_step': training will stop if global_step is greater than 'number_of_steps'. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.tables_initializer()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of them. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous.
session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. session_wrapper: A function that takes a `tf.Session` object as the only argument and returns a wrapped session object that has the same methods that the original object has, or `None`. Iff not `None`, the wrapped object will be used for training. trace_every_n_steps: Produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps` steps. If None, no trace information will be produced or saved. batch_size: The number of examples per training batch. num_examples: The number of examples in the training dataset; used together with `batch_size` to compute the number of steps per epoch. config_summary_list: Optional list of config summary tensors; each one is evaluated once and written to the summary writer before training starts. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if not isinstance(train_op, list): train_op = [train_op] # Allocate a log function to each step. log_fn_list = [log.info, log.infov] def _iter_log_fn(): for log_fn in log_fn_list: yield log_fn it = itertools.cycle(_iter_log_fn()) current_log_fn = next(it) if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): sync_optimizer = [sync_optimizer] if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.' ) if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = training_util.get_or_create_global_step() saver = saver or tf_saver.Saver() if sync_optimizer is not None: for opt in sync_optimizer: if not isinstance( opt, sync_replicas_optimizer.SyncReplicasOptimizer): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
) with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( variables.local_variables_initializer(), lookup_ops.tables_initializer()) if sync_optimizer is not None and isinstance(sync_optimizer, list): with ops.control_dependencies( [local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = control_flow_ops.group( *[opt.chief_init_op for opt in sync_optimizer]) else: local_init_op = control_flow_ops.group( * [opt.local_step_init_op for opt in sync_optimizer]) ready_for_local_init_op = control_flow_ops.group( *[opt.ready_for_local_init_op for opt in sync_optimizer]) else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT if is_chief and sync_optimizer is not None: # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = [ opt.get_init_tokens_op() for opt in sync_optimizer ] chief_queue_runner = [ opt.get_chief_queue_runner() for opt in sync_optimizer ] if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal( global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op if log_every_n_steps > 0: train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer steps_in_epoch = int(num_examples / batch_size) total_loss = 0.0 should_retry = True while should_retry: try: should_retry = False with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: log.infov('Starting Session.') if session_wrapper is not None: log.info('Wrapping session with wrapper function: %s', session_wrapper) sess = session_wrapper(sess) if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) threads = sv.start_queue_runners(sess) log.infov('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, chief_queue_runner) sess.run(init_tokens_op) sess.graph.finalize() # try: if config_summary_list is not None: for config_summary in config_summary_list: sv.summary_writer.add_summary( config_summary.eval(session=sess)) while not sv.should_stop(): for _train_op in train_op: total_loss, should_stop, np_global_step = train_step_fn( sess, _train_op, global_step, train_step_kwargs, batch_size, steps_in_epoch, current_log_fn) if should_stop: log.infov('Stopping Training.') sv.request_stop() 
break # except errors.OutOfRangeError: # # OutOfRangeError is thrown when epoch limit per # # tf.train.limit_epochs is reached. # log.warn('Caught OutOfRangeError. Stopping Training.') if logdir and sv.is_chief: log.warn('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) sv.stop(threads, close_summary_writer=True) def _last_checkpoint_path(sv_save_path, additional_dir_name='last'): dir_list = sv_save_path.split('/') dir_list.insert(-1, additional_dir_name) last_checkpoint_dir_path = '/'.join(dir_list[:-1]) last_checkpoint_path = '/'.join(dir_list) return last_checkpoint_dir_path, last_checkpoint_path # Save the last checkpoint again to a 'last' directory for the next training run with # a different configuration. last_checkpoint_dir_path, last_checkpoint_path = _last_checkpoint_path( sv.save_path, 'last') if os.path.exists(last_checkpoint_dir_path): shutil.rmtree(last_checkpoint_dir_path) os.makedirs(last_checkpoint_dir_path) sv.saver.save(sess, last_checkpoint_path, global_step=sv.global_step) except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed TensorFlow servers. log.warn('Retrying training!') should_retry = True return total_loss
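The wrapper above couples the Supervisor loop to repo-specific helpers (the `log` module, the custom `train_step_fn`, config summaries). Stripped down, the core pattern it relies on, retrying the whole managed_session on AbortedError, running until should_stop(), and saving a final checkpoint from the chief, can be written stand-alone as below; this is a sketch assuming TensorFlow 1.x, and `build_model` is a hypothetical helper, not part of the code above.

import tensorflow as tf

def supervised_train(logdir, number_of_steps=1000, master='', is_chief=True):
    graph = tf.Graph()
    with graph.as_default():
        global_step = tf.train.get_or_create_global_step()
        loss, train_op = build_model(global_step)  # hypothetical model builder
        sv = tf.train.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir,
                                 global_step=global_step, save_model_secs=600)
        loss_value = None
        should_retry = True
        while should_retry:
            try:
                should_retry = False
                with sv.managed_session(master) as sess:
                    while not sv.should_stop():
                        _, loss_value, step = sess.run(
                            [train_op, loss, global_step])
                        if step >= number_of_steps:
                            sv.request_stop()
                    if is_chief and logdir:
                        # Save a final checkpoint before leaving the session.
                        sv.saver.save(sess, sv.save_path,
                                      global_step=global_step)
            except tf.errors.AbortedError:
                # AbortedError signals that a distributed worker restarted;
                # re-enter managed_session and resume from the last checkpoint.
                should_retry = True
        return loss_value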
def main(): args, cfg = parse_args() train_dir = get_output_dir( 'default' if args.cfg_file is None else args.cfg_file) os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu print('Using Config:') pprint.pprint(cfg) tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): tf_global_step = tf.train.get_or_create_global_step() ###################### # Select the dataset # ###################### kwargs = {} if cfg.TEST.VIDEO_FRAMES_PER_VIDEO > 1: kwargs['num_samples'] = cfg.TEST.VIDEO_FRAMES_PER_VIDEO kwargs['modality'] = cfg.INPUT.VIDEO.MODALITY kwargs['split_id'] = cfg.INPUT.SPLIT_ID if args.dataset_list_dir is not None: kwargs['dataset_list_dir'] = args.dataset_list_dir elif cfg.DATASET_LIST_DIR != '': kwargs['dataset_list_dir'] = cfg.DATASET_LIST_DIR if cfg.INPUT_FILE_STYLE_LABEL != '': kwargs['input_file_style_label'] = cfg.INPUT_FILE_STYLE_LABEL dataset, num_pose_keypoints = dataset_factory.get_dataset( cfg.DATASET_NAME, cfg.TEST.DATASET_SPLIT_NAME, cfg.DATASET_DIR, **kwargs) #################### # Select the model # #################### network_fn = nets_factory.get_network_fn( cfg.MODEL_NAME, num_classes=dataset.num_classes, num_pose_keypoints=num_pose_keypoints, is_training=False, cfg=cfg) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## provider = slim.dataset_data_provider.DatasetDataProvider( dataset, shuffle=False, num_epochs=1, common_queue_capacity=2 * cfg.TEST.BATCH_SIZE, common_queue_min=cfg.TEST.BATCH_SIZE) [image, action_label] = get_input(provider, cfg, ['image', 'action_label']) # label -= FLAGS.labels_offset ##################################### # Select the preprocessing function # ##################################### preprocessing_name = cfg.MODEL_NAME image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) eval_image_size = cfg.TRAIN.IMAGE_SIZE or network_fn.default_image_size image = image_preprocessing_fn(image, eval_image_size, eval_image_size, resize_side_min=cfg.TRAIN.RESIZE_SIDE, resize_side_max=cfg.TRAIN.RESIZE_SIDE) # additional preprocessing as required if 'flips' in args.preprocs: tf.logging.info('Flipping all images while testing!') image = tf.stack( [tf.image.flip_left_right(el) for el in tf.unstack(image)]) images, action_labels = tf.train.batch( [image, action_label], batch_size=cfg.TEST.BATCH_SIZE, # following is because if there are more, the order of batch can be # different due to different speed... so avoid that # http://stackoverflow.com/questions/35001027/does-batching-queue-tf-train-batch-not-preserve-order#comment57731040_35001027 # num_threads=1 if args.save else cfg.NUM_PREPROCESSING_THREADS, num_threads= 1, # The above was too unsafe as sometimes I forgot --save # and it would just randomize the whole thing. # This is very important so # shifting to this by default. Better safe than sorry. 
allow_smaller_final_batch=True if cfg.TEST.VIDEO_FRAMES_PER_VIDEO == 1 else False, # because otherwise we need to # average logits over the frames, # and that needs first dimensions # to be fully defined capacity=5 * cfg.TEST.BATCH_SIZE) #################### # Define the model # #################### logits, end_points = network_fn(images) end_points['images'] = images if cfg.TEST.MOVING_AVERAGE_DECAY: variable_averages = tf.train.ExponentialMovingAverage( cfg.TEST.MOVING_AVERAGE_DECAY, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() predictions = tf.argmax(logits, 1) if cfg.TRAIN.LOSS_FN_ACTION.startswith('multi-label'): logits = tf.sigmoid(logits) else: logits = tf.nn.softmax(logits, -1) labels = tf.squeeze(action_labels) end_points['labels'] = labels # Define the metrics: names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 'Accuracy': slim.metrics.streaming_accuracy(predictions, labels), # 'Recall@5': slim.metrics.streaming_recall_at_k( # logits, labels, 5), }) # Print the summaries to screen. for name, value in names_to_values.iteritems(): summary_name = 'eval/%s' % name op = tf.summary.scalar(summary_name, value, collections=[]) op = tf.Print(op, [value], summary_name) tf.add_to_collection(tf.GraphKeys.SUMMARIES, op) # TODO(sguada) use num_epochs=1 if cfg.TEST.MAX_NUM_BATCHES: num_batches = cfg.TEST.MAX_NUM_BATCHES else: # This ensures that we make a single pass over all of the data. num_batches = math.ceil(dataset.num_samples / float(cfg.TEST.BATCH_SIZE)) # just test the latest trained model checkpoint_path = cfg.TEST.CHECKPOINT_PATH or train_dir if tf.gfile.IsDirectory(checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(checkpoint_path) else: checkpoint_path = checkpoint_path checkpoint_step = int(checkpoint_path.split('-')[-1]) tf.logging.info('Evaluating %s' % checkpoint_path) config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True summary_writer = tf.summary.FileWriter(logdir=train_dir) if cfg.TEST.EVAL_METRIC == 'mAP' or args.save or args.ept: from tensorflow.python.training import supervisor from tensorflow.python.framework import ops import h5py saver = tf.train.Saver(variables_to_restore) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=None, summary_op=None, summary_writer=summary_writer, global_step=None, saver=None) all_labels = [] end_points['logits'] = logits end_points_to_save = args.ept + ['logits'] end_points_to_save = list(set(end_points_to_save)) all_feats = dict([(ename, []) for ename in end_points_to_save]) start_time = time.time() with sv.managed_session('', start_standard_services=False, config=config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) for j in tqdm(range(int(math.ceil(num_batches)))): feats = sess.run([ action_labels, [end_points[ename] for ename in end_points_to_save] ]) all_labels.append(feats[0]) for ept_id, ename in enumerate(end_points_to_save): all_feats[ename].append(feats[1][ept_id]) print(time.time() - start_time) APs = [] all_labels = np.concatenate(all_labels) if args.save or args.ept: res_outdir = os.path.join(train_dir, 'Features/') mkdir_p(res_outdir) outfpath = args.outfpath or os.path.join( res_outdir, 'features_ckpt_{}_{}.h5'.format( cfg.TEST.DATASET_SPLIT_NAME, checkpoint_step)) print( 'Saving the features/logits/labels to 
{}'.format(outfpath)) with h5py.File(outfpath, 'a') as fout: for ename in end_points_to_save: if ename in fout: tf.logging.warning( 'Deleting {} from output HDF5 to write the ' 'new features.'.format(ename)) del fout[ename] if ename == 'labels': feat_to_save = np.array(all_feats[ename]) else: feat_to_save = np.concatenate(all_feats[ename]) try: fout.create_dataset(ename, data=feat_to_save, compression='gzip', compression_opts=9) except: pdb.set_trace( ) # manually deal with it and continue if 'labels' in fout: del fout['labels'] fout.create_dataset('labels', data=all_labels, compression='gzip', compression_opts=9) if args.ept: tf.logging.info( 'Evaluation had --ept passed in. ' 'This indicates script was used for feature ' 'extraction. Hence, not performing any evaluation.') return # Evaluation code all_logits = np.concatenate(all_feats['logits']) acc = np.mean(all_logits.argmax(axis=1) == all_labels) mAP = compute_map(all_logits, all_labels)[0] print('Mean AP: {}'.format(mAP)) print('Accuracy: {}'.format(acc)) summary_writer.add_summary(tf.Summary(value=[ tf.Summary.Value(tag='mAP/{}'.format( cfg.TEST.DATASET_SPLIT_NAME), simple_value=mAP) ]), global_step=checkpoint_step) summary_writer.add_summary(tf.Summary(value=[ tf.Summary.Value(tag='Accuracy/{}'.format( cfg.TEST.DATASET_SPLIT_NAME), simple_value=acc) ]), global_step=checkpoint_step) else: slim.evaluation.evaluate_once( master='', checkpoint_path=checkpoint_path, logdir=train_dir, num_evals=num_batches, eval_op=names_to_updates.values(), variables_to_restore=variables_to_restore, session_config=config)
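compute_map is not defined in this snippet. A plausible stand-in, assuming integer class labels in all_labels and per-class scores in all_logits, computes one-vs-rest average precision per class with scikit-learn and returns the mean together with the per-class values:

import numpy as np
from sklearn.metrics import average_precision_score

def compute_map(scores, labels):
    """scores: [N, num_classes] class scores; labels: [N] integer class ids."""
    aps = []
    for c in range(scores.shape[1]):
        positives = (labels == c).astype(np.int32)
        if positives.sum() == 0:
            # Skip classes that never appear in this split.
            continue
        aps.append(average_precision_score(positives, scores[:, c]))
    return float(np.mean(aps)), aps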
def main(_): if not FLAGS.dataset_dir: raise ValueError( 'You must supply the dataset directory with --dataset_dir') print("START!") tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): tf_global_step = slim.get_or_create_global_step() ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset(FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir) #################### # Select the model # #################### network_fn = nets_factory.get_network_fn( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), is_training=False) #print(dataset.num_classes) #print(dir(dataset)) #print(dataset.num_samples) #print(dataset.get_shape()) ############################################################## # Create a dataset provider that loads data from the dataset # ############################################################## provider = slim.dataset_data_provider.DatasetDataProvider( dataset, shuffle=False, common_queue_capacity=2 * FLAGS.batch_size, common_queue_min=FLAGS.batch_size) files = True if files: [image, label, filename] = provider.get(['image', 'label', 'filename']) else: [image, label] = provider.get(['image', 'label']) label -= FLAGS.labels_offset ##################################### # Select the preprocessing function # ##################################### preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name image_preprocessing_fn = preprocessing_factory.get_preprocessing( preprocessing_name, is_training=False) eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size image = image_preprocessing_fn(image, eval_image_size, eval_image_size) if files: images, labels, filenames = tf.train.batch( [image, label, filename], batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=FLAGS.batch_size, allow_smaller_final_batch=True) else: images, labels = tf.train.batch( [image, label], batch_size=FLAGS.batch_size, num_threads=FLAGS.num_preprocessing_threads, capacity=FLAGS.batch_size, allow_smaller_final_batch=True) #################### # Define the model # #################### logits, endpoints = network_fn(images) if FLAGS.moving_average_decay: variable_averages = tf.train.ExponentialMovingAverage( FLAGS.moving_average_decay, tf_global_step) variables_to_restore = variable_averages.variables_to_restore( slim.get_model_variables()) variables_to_restore[tf_global_step.op.name] = tf_global_step else: variables_to_restore = slim.get_variables_to_restore() probabilities = tf.nn.softmax(logits) # TODO(sguada) use num_epochs=1 if FLAGS.max_num_batches: num_batches = FLAGS.max_num_batches else: # This ensures that we make a single pass over all of the data. 
num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size)) if tf.gfile.IsDirectory(FLAGS.checkpoint_path): checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path) else: checkpoint_path = FLAGS.checkpoint_path init_fn = slim.assign_from_checkpoint_fn(checkpoint_path, variables_to_restore) tf.logging.info('Evaluating %s' % checkpoint_path) ### import time from tensorflow.contrib.framework.python.ops import variables from tensorflow.python.framework import ops from tensorflow.python.ops import logging_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import saver as tf_saver from tensorflow.python.training import summary_io from tensorflow.python.training import supervisor from tensorflow.python.training import training_util saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore()) #summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=FLAGS.eval_dir, summary_op=None, summary_writer=None, global_step=None, saver=None) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) import collections with sv.managed_session(FLAGS.master, start_standard_services=False, config=None) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) if FLAGS.result_type == "classify": ##export classification classifications = {"classifications": {}} filenamelist = [] for i in xrange(int(num_batches) + 1): np_probabilities, np_labels, np_filenames, np_endpoints = sess.run( [probabilities, labels, filenames, endpoints]) #print({i:endpoints[i].get_shape() for i in endpoints.keys()}) #return -1 for j in xrange(FLAGS.batch_size): if not np_filenames[j] in filenamelist: filenamelist.append(np_filenames[j]) tmpprob = [] for l in np.argsort( np_probabilities[j, :] )[::-1][:5]: #iterate over best 5 probs tmpprob.append([ str(dataset.labels_to_names[l]).rstrip( "\r"), "{0:.2f}".format( np_probabilities[j, l] * 100) ]) tmp = {np_filenames[j]: tmpprob} classifications["classifications"].update(tmp) else: pass print(i) print(len(classifications["classifications"])) #print(filenamelist) sortedclass = collections.OrderedDict() for k in sorted(classifications["classifications"]): sortedclass.update( {k: classifications["classifications"][k]}) classifications["classifications"] = sortedclass jsonecoded = json.dumps(classifications) loadconf = open( os.path.join(FLAGS.result_path, FLAGS.result_name + ".json"), 'wb') loadconf.write(jsonecoded) loadconf.close() if FLAGS.result_type == "stats": np_probabilities, np_labels, np_filenames, np_endpoints = sess.run( [probabilities, labels, filenames, endpoints]) print({i: endpoints[i].get_shape() for i in endpoints.keys()}) #layer shapes allparams = 0 for variable in tf.trainable_variables(): #iterate over vars shape = variable.get_shape() currpar = 1 for dim in shape: #iterate over shape of var currpar *= dim.value allparams += currpar #add print(allparams) return -1 #kill if FLAGS.result_type == "decaf": ##extract DeCAFs features = [] filenamelist = [] layerdefinition = { "alexnet_v2": "alexnet_v2/fc7/Relu:0", "inception_v1": "MaxPool_0a_7x7", "inception_v3": "AvgPool_1a_{}x{}", "inception_resnet_v2": "AvgPool_1a_8x8", "vgg_16": "vgg_16/fc7/Relu:0", "resnet_v1_152": "pool5" } for i in xrange(int(num_batches)): np_probabilities, np_labels, np_filenames, np_endpoints = sess.run( [probabilities, labels, filenames, endpoints]) for j in xrange(FLAGS.batch_size): if not 
np_filenames[j] in filenamelist: filenamelist.append(np_filenames[j]) tmp_descr = (np_endpoints[layerdefinition[ FLAGS.model_name]][j][0][0]).tolist() tmp_descr.insert(0, (np_filenames[j]).replace( ".jpg", "")) features.append(tmp_descr) print(i) toARFF( features, FLAGS.result_name, os.path.join(FLAGS.result_path, FLAGS.result_name + ".arff"))
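toARFF is also not shown in this snippet. A minimal stand-in that writes the collected filename-plus-feature rows in Weka's ARFF format might look like the following; the attribute names and number formatting are assumptions.

def toARFF(features, relation_name, out_path):
    # Each row is [filename, feat_0, feat_1, ...]; write one string attribute
    # for the filename and one numeric attribute per feature dimension.
    num_dims = len(features[0]) - 1
    with open(out_path, 'w') as f:
        f.write('@RELATION {}\n\n'.format(relation_name))
        f.write('@ATTRIBUTE filename STRING\n')
        for d in range(num_dims):
            f.write('@ATTRIBUTE feat_{} NUMERIC\n'.format(d))
        f.write('\n@DATA\n')
        for row in features:
            f.write(','.join([str(row[0])] +
                             ['{:.6f}'.format(v) for v in row[1:]]) + '\n')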
def evaluate_once(master, checkpoint_path, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, session_config=None): """Evaluates the model at the given checkpoint path. Args: master: The BNS address of the TensorFlow master. checkpoint_path: The path to a checkpoint to use for evaluation. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: An operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slim's metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.get_variables_to_restore() is used. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() global_step = variables.get_or_create_global_step() saver = tf_saver.Saver(variables_to_restore or variables.get_variables_to_restore(), write_version=saver_pb2.SaverDef.V1) summary_writer = summary_io.SummaryWriter(logdir) sv = supervisor.Supervisor(graph=ops.get_default_graph(), logdir=logdir, summary_op=None, summary_writer=None, global_step=None, saver=None) logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: saver.restore(sess, checkpoint_path) sv.start_queue_runners(sess) final_op_value = evaluation(sess, num_evals=num_evals, initial_op=initial_op, initial_op_feed_dict=initial_op_feed_dict, eval_op=eval_op, eval_op_feed_dict=eval_op_feed_dict, final_op=final_op, final_op_feed_dict=final_op_feed_dict, summary_op=summary_op, summary_op_feed_dict=summary_op_feed_dict, summary_writer=summary_writer, global_step=global_step) logging.info('Finished evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime())) return final_op_value
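A minimal sketch of how evaluate_once() is usually called with TF-Slim streaming metrics, assuming TensorFlow 1.x; the tiny model, the label tensor, and the checkpoint and log paths are placeholders for illustration only.

import tensorflow as tf
slim = tf.contrib.slim

images = tf.random_uniform([4, 8])
weights = tf.get_variable('weights', shape=[8, 3])
logits = tf.matmul(images, weights)
predictions = tf.argmax(logits, 1)
labels = tf.constant([0, 1, 2, 0], dtype=tf.int64)

# Streaming metrics accumulate their state across the num_evals batches.
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
    'eval/Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
})

metric_values = evaluate_once(
    master='',
    checkpoint_path='/tmp/model/model.ckpt-10000',  # placeholder path
    logdir='/tmp/model/eval',
    num_evals=100,
    eval_op=list(names_to_updates.values()),
    final_op=list(names_to_values.values()))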