def main(args):
  if not os.path.exists(FLAGS.checkpoint):
    tf.logging.fatal(
        'Checkpoint %s does not exist. Have you downloaded it? See tools/download_data.sh',
        FLAGS.checkpoint)
  g = tf.Graph()
  with g.as_default():
    input_image = PreprocessImage(FLAGS.image_path[0])

    with slim.arg_scope(inception.inception_v3_arg_scope()):
      logits, end_points = inception.inception_v3(
          input_image, num_classes=FLAGS.num_classes, is_training=False)

    bottleneck = end_points['PreLogits']
    init_op = control_flow_ops.group(
        variables.initialize_all_variables(),
        variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())
    saver = tf_saver.Saver()
    sess = tf.Session()
    saver.restore(sess, FLAGS.checkpoint)

    # Run the evaluation on the image
    bottleneck_eval = np.squeeze(sess.run(bottleneck))
    first = True
    for val in bottleneck_eval:
      if not first:
        sys.stdout.write(",")
      first = False
      sys.stdout.write('{:.3f}'.format(val))
    sys.stdout.write('\n')

def prep_graph():
  global predictions
  global labelmap
  global label_dict
  global sess
  global input_image
  global food_list
  food_list = []
  with open(food_names) as f:
    for x in f:
      food_list.append(x.rstrip())
  g = tf.Graph()
  with g.as_default():
    input_image = tf.placeholder(tf.string)
    processed_image = PreprocessImage(input_image)
    with slim.arg_scope(inception.inception_v3_arg_scope()):
      logits, end_points = inception.inception_v3(
          processed_image, num_classes=6012, is_training=False)
    predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
        logits, name='multi_predictions')
    init_op = control_flow_ops.group(
        variables.initialize_all_variables(),
        variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())
    saver = tf_saver.Saver()
    sess = tf.Session()
    saver.restore(sess, checkpoint)
  labelmap, label_dict = LoadLabelMaps(6012, labelmap_file, label_dict_file)

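# Hypothetical usage sketch for prep_graph() above, which populates the
# module-level globals (sess, predictions, input_image, food_list) and assumes
# `checkpoint`, `food_names`, `labelmap_file` and `label_dict_file` are defined
# at module level. The image path here is an illustrative assumption.
prep_graph()
img_data = tf.gfile.FastGFile('/tmp/example.jpg').read()
scores = np.squeeze(sess.run(predictions, {input_image: img_data}))
print(len(food_list), scores.shape)
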
def _get_local_init_op():
  local_init_op = _get_first_op_from_collection(ops.GraphKeys.LOCAL_INIT_OP)
  if local_init_op is None:
    op_list = [variables.initialize_local_variables(),
               data_flow_ops.initialize_all_tables()]
    if op_list:
      local_init_op = control_flow_ops.group(*op_list)
      ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
  return local_init_op

def _export_graph(graph, saver, checkpoint_path, export_dir,
                  default_graph_signature, named_graph_signatures,
                  exports_to_keep):
  """Exports graph via session_bundle, by creating a Session."""
  with graph.as_default():
    with tf_session.Session('') as session:
      variables.initialize_local_variables()
      data_flow_ops.initialize_all_tables()
      saver.restore(session, checkpoint_path)

      export = exporter.Exporter(saver)
      export.init(init_op=control_flow_ops.group(
                      variables.initialize_local_variables(),
                      data_flow_ops.initialize_all_tables()),
                  default_graph_signature=default_graph_signature,
                  named_graph_signatures=named_graph_signatures)
      export.export(export_dir, contrib_variables.get_global_step(), session,
                    exports_to_keep=exports_to_keep)

def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                    model_fn_ops):
  test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(),
                              places=4)
  for k in six.iterkeys(expected_eval_metrics):
    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
  variables.initialize_local_variables().run()
  for key, expected_value in six.iteritems(expected_eval_metrics):
    value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
    update = update_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value, update, places=4,
        msg="%s: update, expected %s, got %s." % (key, expected_value, update))
    value = value_tensor.eval()
    test_case.assertAlmostEqual(
        expected_value, value, places=4,
        msg="%s: value, expected %s, got %s." % (key, expected_value, value))

def run(self, num_batches=None, graph=None, session=None, start_queues=True,
        initialize_variables=True, **kwargs):
  """Builds and runs the columns of the `DataFrame` and yields batches.

  This is a generator that yields a dictionary mapping column names to
  evaluated columns.

  Args:
    num_batches: the maximum number of batches to produce. If none specified,
      the returned value will iterate through infinite batches.
    graph: the `Graph` in which the `DataFrame` should be built.
    session: the `Session` in which to run the columns of the `DataFrame`.
    start_queues: if true, queues will be started before running and halted
      after producing `num_batches` batches.
    initialize_variables: if true, variables will be initialized.
    **kwargs: Additional keyword arguments, e.g. `num_epochs`.

  Yields:
    A dictionary, mapping column names to the values resulting from running
    each column for a single batch.
  """
  if graph is None:
    graph = ops.get_default_graph()
  with graph.as_default():
    if session is None:
      session = sess.Session()
    self_built = self.build(**kwargs)
    keys = list(self_built.keys())
    cols = list(self_built.values())
    if initialize_variables:
      if variables.local_variables():
        session.run(variables.initialize_local_variables())
      if variables.all_variables():
        session.run(variables.initialize_all_variables())
    if start_queues:
      coord = coordinator.Coordinator()
      threads = qr.start_queue_runners(sess=session, coord=coord)
    i = 0
    while num_batches is None or i < num_batches:
      i += 1
      try:
        values = session.run(cols)
        yield collections.OrderedDict(zip(keys, values))
      except errors.OutOfRangeError:
        break
    if start_queues:
      coord.request_stop()
      coord.join(threads)

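# Hypothetical usage sketch for the generator above: `df` is assumed to be an
# already-constructed contrib.learn DataFrame whose columns read from a queue.
for batch in df.run(num_batches=5):
  print(sorted(batch.keys()))
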
def label(image_path,
          checkpoint="openimages_dataset/data/2016_08/model.ckpt",
          num_classes=6012,
          labelmap_path="openimages_dataset/data/2016_08/labelmap.txt",
          dict_path="openimages_dataset/dict.csv",
          threshold=0.5,
          rounding_digits=1):
  if not os.path.exists(checkpoint):
    tf.logging.fatal(
        'Checkpoint %s does not exist. Have you downloaded it? See tools/download_data.sh',
        checkpoint)
  g = tf.Graph()
  with g.as_default():
    input_image = PreprocessImage(image_path)

    with slim.arg_scope(inception.inception_v3_arg_scope()):
      logits, end_points = inception.inception_v3(
          input_image, num_classes=num_classes, is_training=False)

    predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
        logits, name='multi_predictions')
    init_op = control_flow_ops.group(
        variables.initialize_all_variables(),
        variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())
    saver = tf_saver.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    saver.restore(sess, checkpoint)

    # Run the evaluation on the image
    predictions_eval = np.squeeze(sess.run(predictions))

    # Look up display names for the scores, sorted from highest to lowest.
    labelmap, label_dict = LoadLabelMaps(num_classes, labelmap_path, dict_path)
    top_k = predictions_eval.argsort()[::-1]
    returned_labels = []
    for idx in top_k:
      mid = labelmap[idx]
      display_name = label_dict.get(mid, 'unknown')
      score = predictions_eval[idx]
      if score < threshold:
        if returned_labels:
          break
        else:
          # Relax the threshold until at least one label clears it.
          threshold -= 0.1
          if threshold < 0.1:
            break
      returned_labels.append((display_name, score))
    return returned_labels

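# Hypothetical usage sketch for label() above; the image path is illustrative
# and the checkpoint/labelmap defaults are assumed to be in place.
for display_name, score in label('/tmp/example.jpg', threshold=0.6):
  print('{}: {:.2f}'.format(display_name, score))
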
def main_op():
  """Returns a main op to init variables and tables.

  Returns the main op including the group of ops that initializes all
  variables, initializes local variables and initializes all tables.

  Returns:
    The set of ops to be run as part of the main op upon the load operation.
  """
  init = tf_variables.initialize_all_variables()
  init_local = tf_variables.initialize_local_variables()
  init_tables = tf_data_flow_ops.initialize_all_tables()
  return tf.group(init, init_local, init_tables)

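# Hypothetical usage sketch: attaching the op returned by main_op() to a
# SavedModel so variables and tables are initialized at load time. Assumes a
# live `sess` holding the model's variables; the export path is illustrative.
builder = tf.saved_model.builder.SavedModelBuilder('/tmp/export')
builder.add_meta_graph_and_variables(
    sess, [tf.saved_model.tag_constants.SERVING], main_op=main_op())
builder.save()
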
def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
  """Run `output_dict` tensors with each input in `feed_dicts`.

  If `restore_checkpoint_path` is supplied, restore from checkpoint.
  Otherwise, init all variables.

  Args:
    output_dict: A `dict` mapping string names to `Tensor` objects to run.
      Tensors must all be from the same graph.
    feed_dicts: Iterable of `dict` objects of input values to feed.
    restore_checkpoint_path: A string containing the path to a checkpoint to
      restore.

  Yields:
    A sequence of dicts of values read from `output_dict` tensors, one item
    yielded for each item in `feed_dicts`. Keys are the same as `output_dict`,
    values are the results read from the corresponding `Tensor` in
    `output_dict`.

  Raises:
    ValueError: if `output_dict` or `feed_dicts` is None or empty.
  """
  if not output_dict:
    raise ValueError('output_dict is invalid: %s.' % output_dict)
  if not feed_dicts:
    raise ValueError('feed_dicts is invalid: %s.' % feed_dicts)

  graph = contrib_ops.get_graph_from_inputs(output_dict.values())
  with graph.as_default() as g:
    with tf_session.Session('') as session:
      session.run(
          resources.initialize_resources(resources.shared_resources() +
                                         resources.local_resources()))
      if restore_checkpoint_path:
        _restore_from_checkpoint(session, g, restore_checkpoint_path)
      else:
        session.run(variables.initialize_all_variables())
      session.run(variables.initialize_local_variables())
      session.run(data_flow_ops.initialize_all_tables())
      coord = coordinator.Coordinator()
      threads = None
      try:
        threads = queue_runner.start_queue_runners(session, coord=coord)
        for f in feed_dicts:
          yield session.run(output_dict, f)
      finally:
        coord.request_stop()
        if threads:
          coord.join(threads, stop_grace_period_secs=120)

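# Hypothetical usage sketch for run_feeds_iter(): feed a placeholder twice and
# collect the squared results. The tensor names here are illustrative.
x = tf.placeholder(tf.float32, name='x')
outputs = {'square': tf.square(x)}
for result in run_feeds_iter(outputs, [{x: 2.0}, {x: 3.0}]):
  print(result['square'])
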
def main(args):
  if not os.path.exists(FLAGS.checkpoint):
    tf.logging.fatal(
        'Checkpoint %s does not exist. Have you downloaded it? See tools/download_data.sh',
        FLAGS.checkpoint)
  g = tf.Graph()
  with g.as_default():
    input_image = tf.placeholder(tf.string)
    processed_image = PreprocessImage(input_image)

    with slim.arg_scope(inception.inception_v3_arg_scope()):
      logits, end_points = inception.inception_v3(
          processed_image, num_classes=FLAGS.num_classes, is_training=False)

    predictions = end_points['multi_predictions'] = tf.nn.sigmoid(
        logits, name='multi_predictions')
    init_op = control_flow_ops.group(
        variables.initialize_all_variables(),
        variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())
    saver = tf_saver.Saver()
    sess = tf.Session()
    saver.restore(sess, FLAGS.checkpoint)

    # Run the evaluation on the images
    for image_path in FLAGS.image_path:
      if not os.path.exists(image_path):
        tf.logging.fatal('Input image does not exist %s', image_path)
      img_data = tf.gfile.FastGFile(image_path).read()
      print(image_path)
      predictions_eval = np.squeeze(
          sess.run(predictions, {input_image: img_data}))

      # Print top(n) results
      labelmap, label_dict = LoadLabelMaps(FLAGS.num_classes, FLAGS.labelmap,
                                           FLAGS.dict)
      top_k = predictions_eval.argsort()[-FLAGS.n:][::-1]
      for idx in top_k:
        mid = labelmap[idx]
        display_name = label_dict.get(mid, 'unknown')
        score = predictions_eval[idx]
        print('{}: {} - {} (score = {:.2f})'.format(
            idx, mid, display_name, score))
      print()

def _init_local_init_op(self, local_init_op=USE_DEFAULT):
  """Initializes local_init_op.

  Args:
    local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT, create an op based on the `LOCAL_INITIALIZERS` graph
      collection.
  """
  if local_init_op is Supervisor.USE_DEFAULT:
    local_init_op = self._get_first_op_from_collection(
        ops.GraphKeys.LOCAL_INIT_OP)
    if local_init_op is None:
      op_list = [variables.initialize_local_variables(),
                 data_flow_ops.initialize_all_tables()]
      if op_list:
        local_init_op = control_flow_ops.group(*op_list)
        ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
  self._local_init_op = local_init_op

def run_feeds_iter(output_dict, feed_dicts, restore_checkpoint_path=None):
  """Run `output_dict` tensors with each input in `feed_dicts`.

  If `restore_checkpoint_path` is supplied, restore from checkpoint.
  Otherwise, init all variables.

  Args:
    output_dict: A `dict` mapping string names to `Tensor` objects to run.
      Tensors must all be from the same graph.
    feed_dicts: Iterable of `dict` objects of input values to feed.
    restore_checkpoint_path: A string containing the path to a checkpoint to
      restore.

  Yields:
    A sequence of dicts of values read from `output_dict` tensors, one item
    yielded for each item in `feed_dicts`. Keys are the same as `output_dict`,
    values are the results read from the corresponding `Tensor` in
    `output_dict`.

  Raises:
    ValueError: if `output_dict` or `feed_dicts` is None or empty.
  """
  if not output_dict:
    raise ValueError('output_dict is invalid: %s.' % output_dict)
  if not feed_dicts:
    raise ValueError('feed_dicts is invalid: %s.' % feed_dicts)

  graph = contrib_ops.get_graph_from_inputs(output_dict.values())
  with graph.as_default() as g:
    with tf_session.Session('') as session:
      if restore_checkpoint_path:
        _restore_from_checkpoint(session, g, restore_checkpoint_path)
      else:
        session.run(variables.initialize_all_variables())
      session.run(variables.initialize_local_variables())
      session.run(data_flow_ops.initialize_all_tables())
      coord = coordinator.Coordinator()
      threads = None
      try:
        threads = queue_runner.start_queue_runners(session, coord=coord)
        for f in feed_dicts:
          yield session.run(output_dict, f)
      finally:
        coord.request_stop()
        if threads:
          coord.join(threads, stop_grace_period_secs=120)

def _init_local_init_op(self, local_init_op=USE_DEFAULT):
  """Initializes local_init_op.

  Args:
    local_init_op: `Operation` run for every new supervisor instance. If set
      to USE_DEFAULT, use the first op from the GraphKeys.LOCAL_INIT_OP
      collection. If the collection is empty, create an op that initializes
      all local variables and all tables.
  """
  if local_init_op is Supervisor.USE_DEFAULT:
    local_init_op = self._get_first_op_from_collection(
        ops.GraphKeys.LOCAL_INIT_OP)
    if local_init_op is None:
      op_list = [variables.initialize_local_variables(),
                 data_flow_ops.initialize_all_tables()]
      if op_list:
        local_init_op = control_flow_ops.group(*op_list)
        ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, local_init_op)
  self._local_init_op = local_init_op

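# Hypothetical usage sketch: code that wants a custom local init op can
# register it in the collection consulted above, before the Supervisor is
# constructed, so _init_local_init_op() picks it up instead of building the
# default group.
custom_local_init_op = control_flow_ops.group(
    variables.initialize_local_variables(),
    data_flow_ops.initialize_all_tables())
ops.add_to_collection(ops.GraphKeys.LOCAL_INIT_OP, custom_local_init_op)
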
def train(train_op,
          logdir,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          init_fn=None,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: the directory where training logs are written to.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, or if `number_of_steps` is negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')
  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')
  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  # Note: the original compared `init_op is None` against an _USE_DEFAULT
  # sentinel default and built the tables initializer from tf_variables, which
  # has no such function; both are corrected here.
  if init_op == _USE_DEFAULT:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    # Need to subtract 1 since the check for greater/equality is done
    # concurrently with the increment of global_step.
    # TODO(nsilberman): add a dependency to ensure the order of operations.
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps - 1)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(
      math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif not is_chief and startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.Stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

  return total_loss

def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: the max number of iterations of the evaluation.
      If the value is left as `None`, the evaluation continues indefinitely.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=saver)

  last_checkpoint = None
  number_of_evaluations = 0
  while True:
    last_checkpoint = wait_for_new_checkpoint(checkpoint_dir, last_checkpoint)
    start = time.time()
    logging.info('Starting evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    with sv.managed_session(master, start_standard_services=False) as sess:
      sv.saver.restore(sess, last_checkpoint)
      sv.start_queue_runners(sess)
      evaluation(sess,
                 num_evals=num_evals,
                 eval_op=eval_op,
                 eval_op_feed_dict=eval_op_feed_dict,
                 final_op=final_op,
                 final_op_feed_dict=final_op_feed_dict,
                 summary_op=summary_op,
                 summary_op_feed_dict=summary_op_feed_dict,
                 summary_writer=summary_writer,
                 global_step=global_step)

    logging.info('Finished evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
    number_of_evaluations += 1
    if (max_number_of_evaluations and
        number_of_evaluations >= max_number_of_evaluations):
      logging.info('Reached max_number_of_evaluations=%s. Exit',
                   max_number_of_evaluations)
      break

    time_to_next_eval = start + eval_interval_secs - time.time()
    if time_to_next_eval > 0:
      time.sleep(time_to_next_eval)

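# Hypothetical usage sketch for evaluation_loop() above. `predictions` and
# `labels` are assumed tensors from an existing input pipeline, `slim` is
# assumed to be tf.contrib.slim, and the paths are illustrative.
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
    'eval/accuracy': slim.metrics.streaming_accuracy(predictions, labels),
})
evaluation_loop(master='',
                checkpoint_dir='/tmp/train_logs',
                logdir='/tmp/eval_logs',
                num_evals=100,
                eval_op=list(names_to_updates.values()))
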
def train(train_op,
          logdir,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          init_fn=None,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: the directory where training logs are written to.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, or if `number_of_steps` is negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')
  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')
  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  if global_step is None:
    global_step = variables.get_or_create_global_step()
  saver = saver or tf_saver.Saver()

  # Note: as in the earlier variant, the `is None` check and the
  # tf_variables tables initializer are corrected here.
  if init_op == _USE_DEFAULT:
    init_op = control_flow_ops.group(
        tf_variables.initialize_all_variables(),
        tf_variables.initialize_local_variables(),
        data_flow_ops.initialize_all_tables())

  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  local_init_op = None
  cleanup_op = None

  if is_chief and sync_optimizer:
    if not isinstance(sync_optimizer,
                      sync_replicas_optimizer.SyncReplicasOptimizer):
      raise ValueError(
          '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

    # Need to create these BEFORE the supervisor finalizes the graph:
    local_init_op = sync_optimizer.get_init_tokens_op()
    chief_queue_runner = sync_optimizer.get_chief_queue_runner()
    cleanup_op = sync_optimizer.get_clean_up_op()

  if number_of_steps:
    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
  else:
    should_stop_op = constant_op.constant(False)

  should_log_op = math_ops.equal(
      math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      summary_op=summary_op,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  with sv.managed_session(master, start_standard_services=False) as sess:
    if is_chief:
      sv.start_standard_services(sess)
    elif not is_chief and startup_delay_steps > 0:
      _wait_for_step(sess, global_step,
                     min(startup_delay_steps, number_of_steps or sys.maxint))
    sv.start_queue_runners(sess)
    if is_chief and sync_optimizer:
      sv.start_queue_runners(sess, [chief_queue_runner])

    total_loss = train_loop(
        sv, sess, train_op, should_stop_op, should_log_op, global_step,
        cleanup_op)

    # This waits for service threads to finish.
    sv.Stop()

    if sv.is_chief:
      logging.info('Finished training! Saving model to disk.')
      sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

  return total_loss

def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a
      dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no
      summaries should be written. If unset, we create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no
      trace information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')
    if trace_every_n_steps is not None:
      raise ValueError('Cannot provide trace_every_n_steps because '
                       'logdir=None')

  if sync_optimizer is not None and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    with ops.name_scope('init_ops'):
      if init_op == _USE_DEFAULT:
        init_op = tf_variables.initialize_all_variables()

      if ready_op == _USE_DEFAULT:
        ready_op = tf_variables.report_uninitialized_variables()

      if local_init_op == _USE_DEFAULT:
        local_init_op = control_flow_ops.group(
            tf_variables.initialize_local_variables(),
            data_flow_ops.initialize_all_tables())

    if summary_op == _USE_DEFAULT:
      summary_op = logging_ops.merge_all_summaries()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    cleanup_op = None

    if is_chief and sync_optimizer is not None:
      if not isinstance(sync_optimizer,
                        (sync_replicas_optimizer.SyncReplicasOptimizer,
                         sync_replicas_optimizer.SyncReplicasOptimizerV2)):
        raise ValueError(
            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer or '
            'tf.train.SyncReplicasOptimizerV2.')

      # Need to create these BEFORE the supervisor finalizes the graph:
      with ops.control_dependencies([init_op]):
        init_tokens_op = sync_optimizer.get_init_tokens_op()
      init_op = init_tokens_op
      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
      if isinstance(sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
        cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
      with ops.name_scope('train_step'):
        train_step_kwargs = {}
        if number_of_steps:
          should_stop_op = math_ops.greater_equal(global_step,
                                                  number_of_steps)
        else:
          should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        train_step_kwargs['should_log'] = math_ops.equal(
            math_ops.mod(global_step, log_every_n_steps), 0)
        if is_chief and trace_every_n_steps is not None:
          train_step_kwargs['should_trace'] = math_ops.equal(
              math_ops.mod(global_step, trace_every_n_steps), 0)
          train_step_kwargs['logdir'] = logdir

  sv = supervisor.Supervisor(graph=graph,
                             is_chief=is_chief,
                             logdir=logdir,
                             init_op=init_op,
                             init_feed_dict=init_feed_dict,
                             local_init_op=local_init_op,
                             ready_op=ready_op,
                             summary_op=summary_op,
                             summary_writer=summary_writer,
                             global_step=global_step,
                             saver=saver,
                             save_summaries_secs=save_summaries_secs,
                             save_model_secs=save_interval_secs,
                             init_fn=init_fn)

  if summary_writer is not None:
    train_step_kwargs['summary_writer'] = sv.summary_writer

  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(master,
                              start_standard_services=False,
                              config=session_config) as sess:
        logging.info('Starting Session.')
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          _wait_for_step(sess, global_step,
                         min(startup_delay_steps,
                             number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer is not None:
          sv.start_queue_runners(sess, [chief_queue_runner])
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              break
          if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
        except:
          if sv.is_chief and cleanup_op is not None:
            logging.info('About to execute sync_clean_up_op!')
            sess.run(cleanup_op)
          raise
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss

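# Hypothetical usage sketch for train() above: a tiny least-squares model.
# create_train_op is assumed to come from the same slim learning module; the
# logdir and step count are illustrative.
inputs = tf.random_normal([16, 4])
targets = tf.reduce_sum(inputs, 1, keep_dims=True)
predictions = tf.contrib.layers.fully_connected(inputs, 1)
loss = tf.reduce_mean(tf.square(predictions - targets))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train_op = create_train_op(loss, optimizer)
final_loss = train(train_op, logdir='/tmp/train_logs', number_of_steps=1000)
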
def _default_local_init_op():
  return control_flow_ops.group(variables.initialize_local_variables(),
                                data_flow_ops.initialize_all_tables())

def evaluate_once(checkpoint_path,
                  logdir,
                  master='',
                  num_evals=1,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    master: The BNS address of the TensorFlow master.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' +
               time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' +
               time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
  return final_op_value

def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a
      dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The BNS name of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.initialize_all_variables()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.initialize_local_variables()` and `tf.initialize_all_tables()`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no
      summaries should be written. If unset, we create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, or if `number_of_steps` is negative.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')

  if sync_optimizer and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = variables.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if init_op == _USE_DEFAULT:
      init_op = tf_variables.initialize_all_variables()

    if ready_op == _USE_DEFAULT:
      ready_op = tf_variables.report_uninitialized_variables()

    if summary_op == _USE_DEFAULT:
      summary_op = logging_ops.merge_all_summaries()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if local_init_op == _USE_DEFAULT:
      local_init_op = control_flow_ops.group(
          tf_variables.initialize_local_variables(),
          data_flow_ops.initialize_all_tables())

    cleanup_op = None

    if is_chief and sync_optimizer:
      if not isinstance(sync_optimizer,
                        sync_replicas_optimizer.SyncReplicasOptimizer):
        raise ValueError(
            '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer')

      # Need to create these BEFORE the supervisor finalizes the graph:
      with ops.control_dependencies([init_op]):
        init_tokens_op = sync_optimizer.get_init_tokens_op()
      init_op = init_tokens_op
      chief_queue_runner = sync_optimizer.get_chief_queue_runner()
      cleanup_op = sync_optimizer.get_clean_up_op()

    if train_step_kwargs == _USE_DEFAULT:
      train_step_kwargs = {}
      if number_of_steps:
        should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
      else:
        should_stop_op = constant_op.constant(False)
      train_step_kwargs['should_stop'] = should_stop_op
      train_step_kwargs['should_log'] = math_ops.equal(
          math_ops.mod(global_step, log_every_n_steps), 0)

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      ready_op=ready_op,
      summary_op=summary_op,
      summary_writer=summary_writer,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False,
          config=session_config) as sess:
        logging.info('Starting Session.')
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          _wait_for_step(sess, global_step,
                         min(startup_delay_steps,
                             number_of_steps or sys.maxint))
        sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer:
          sv.start_queue_runners(sess, [chief_queue_runner])
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              break
          if logdir and sv.is_chief:
            logging.info('Finished training! Saving model to disk.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
        except:
          if sv.is_chief and cleanup_op is not None:
            logging.info('About to execute sync_clean_up_op!')
            sess.run(cleanup_op)
          raise
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss

def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric ops.
      By default the summary_op is set to tf.merge_all_summaries().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = logging_ops.merge_all_summaries()

  global_step = variables.get_or_create_global_step()

  init_op = control_flow_ops.group(tf_variables.initialize_all_variables(),
                                   tf_variables.initialize_local_variables(),
                                   data_flow_ops.initialize_all_tables())

  saver = tf_saver.Saver(variables_to_restore or
                         variables.get_variables_to_restore())

  summary_writer = summary_io.SummaryWriter(logdir)

  sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                             logdir=logdir,
                             init_op=init_op,
                             summary_op=None,
                             summary_writer=None,
                             global_step=None,
                             saver=None)

  logging.info('Starting evaluation at ' +
               time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
  with sv.managed_session(
      master, start_standard_services=False, config=session_config) as sess:
    saver.restore(sess, checkpoint_path)
    sv.start_queue_runners(sess)
    final_op_value = evaluation(sess,
                                num_evals=num_evals,
                                eval_op=eval_op,
                                eval_op_feed_dict=eval_op_feed_dict,
                                final_op=final_op,
                                final_op_feed_dict=final_op_feed_dict,
                                summary_op=summary_op,
                                summary_op_feed_dict=summary_op_feed_dict,
                                summary_writer=summary_writer,
                                global_step=global_step)

  logging.info('Finished evaluation at ' +
               time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))
  return final_op_value

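# Hypothetical usage sketch for evaluate_once() above. `predictions` and
# `labels` are assumed tensors from an input pipeline, `slim` is assumed to be
# tf.contrib.slim, and paths are illustrative. Keyword arguments sidestep the
# differing positional order of `master` and `checkpoint_path` in the two
# variants shown in this section.
accuracy_value, accuracy_update = slim.metrics.streaming_accuracy(
    predictions, labels)
mean_accuracy = evaluate_once(master='',
                              checkpoint_path='/tmp/train_logs/model.ckpt-1000',
                              logdir='/tmp/eval_logs',
                              num_evals=50,
                              eval_op=accuracy_update,
                              final_op=accuracy_value)
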