def testAddImageComparisonSummaries(self):
  summaries.add_image_comparison_summaries(
      get_gan_model(), display_diffs=True)
  self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    summary.merge_all().eval()
def testAddGanModelSummaries(self):
  summaries.add_gan_model_summaries(get_gan_model())
  self.assertEquals(3, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    variables.global_variables_initializer().run()
    summary.merge_all().eval()
def _test_add_regularization_loss_summaries_impl(self, get_model_fn,
                                                 expected_num_summary_ops):
  summaries.add_regularization_loss_summaries(get_model_fn())
  self.assertEquals(expected_num_summary_ops,
                    len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    summary.merge_all().eval()
def _test_add_image_comparison_summaries_impl(self, get_model_fn,
                                              expected_num_summary_ops):
  summaries.add_image_comparison_summaries(get_model_fn(), display_diffs=True)
  self.assertEquals(expected_num_summary_ops,
                    len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    summary.merge_all().eval()
def test_add_image_comparison_summaries_for_stargan(self):
  summaries.add_stargan_image_summaries(get_stargan_model())
  self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True) as sess:
    sess.run(variables.global_variables_initializer())
    summary.merge_all().eval()
def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                       expected_num_summary_ops):
  summaries.add_gan_model_summaries(get_model_fn())
  self.assertEquals(expected_num_summary_ops,
                    len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    variables.global_variables_initializer().run()
    summary.merge_all().eval()
def testClassicSummaryOpsErrorOut(self):
  x = constant_op.constant(42)
  x_summary = summary.scalar('x', x)
  y = constant_op.constant([1, 3, 3, 7])
  y_summary = summary.histogram('hist', y)

  with self.assertRaisesRegexp(
      RuntimeError,
      r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
    summary.merge([x_summary, y_summary])

  with self.assertRaisesRegexp(
      RuntimeError,
      r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
    summary.merge_all()
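# A hedged companion sketch (not part of the original suite): the same merge
# calls succeed when a graph is the default, which is the behavior the test
# above verifies is unavailable under eager execution. `tf` is assumed to be
# the TF 1.x-style API surface (tensorflow.compat.v1).
import tensorflow.compat.v1 as tf

def _graph_mode_merge_works():
  with tf.Graph().as_default():
    x = tf.constant(42.0)
    tf.summary.scalar('x', x)
    merged = tf.summary.merge_all()  # A MergeSummary op, not a RuntimeError.
    with tf.Session() as sess:
      return sess.run(merged)  # Serialized Summary protobuf bytes.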
def testTFSummaryScalar(self):
  """Verify processing of tf.summary.scalar."""
  event_sink = _EventGenerator(self, zero_out_timestamps=True)
  writer = SummaryToEventTransformer(event_sink)
  with self.test_session() as sess:
    ipt = array_ops.placeholder(dtypes.float32)
    summary_lib.scalar('scalar1', ipt)
    summary_lib.scalar('scalar2', ipt * ipt)
    merged = summary_lib.merge_all()
    writer.add_graph(sess.graph)
    for i in xrange(10):
      summ = sess.run(merged, feed_dict={ipt: i})
      writer.add_summary(summ, global_step=i)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  seq1 = [ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)]
  seq2 = [
      ea.ScalarEvent(wall_time=0, step=i, value=i * i) for i in xrange(10)
  ]

  self.assertTagsEqual(accumulator.Tags(), {
      ea.SCALARS: ['scalar1', 'scalar2'],
      ea.GRAPH: True,
      ea.META_GRAPH: False,
  })

  self.assertEqual(accumulator.Scalars('scalar1'), seq1)
  self.assertEqual(accumulator.Scalars('scalar2'), seq2)
  first_value = accumulator.Scalars('scalar1')[0].value
  self.assertTrue(isinstance(first_value, float))
def testTFSummaryTensor(self):
  """Verify processing of tf.summary.tensor."""
  event_sink = _EventGenerator(self, zero_out_timestamps=True)
  writer = SummaryToEventTransformer(event_sink)
  with self.test_session() as sess:
    summary_lib.tensor_summary('scalar', constant_op.constant(1.0))
    summary_lib.tensor_summary('vector',
                               constant_op.constant([1.0, 2.0, 3.0]))
    summary_lib.tensor_summary('string',
                               constant_op.constant(six.b('foobar')))
    merged = summary_lib.merge_all()
    summ = sess.run(merged)
    writer.add_summary(summ, 0)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  self.assertTagsEqual(accumulator.Tags(), {
      ea.TENSORS: ['scalar', 'vector', 'string'],
  })

  scalar_proto = accumulator.Tensors('scalar')[0].tensor_proto
  scalar = tensor_util.MakeNdarray(scalar_proto)
  vector_proto = accumulator.Tensors('vector')[0].tensor_proto
  vector = tensor_util.MakeNdarray(vector_proto)
  string_proto = accumulator.Tensors('string')[0].tensor_proto
  string = tensor_util.MakeNdarray(string_proto)

  self.assertTrue(np.array_equal(scalar, 1.0))
  self.assertTrue(np.array_equal(vector, [1.0, 2.0, 3.0]))
  self.assertTrue(np.array_equal(string, six.b('foobar')))
def set_model(self, model):
  self.model = model
  self.sess = K.get_session()
  if self.histogram_freq and self.merged is None:
    for layer in self.model.layers:
      for weight in layer.weights:
        tf_summary.histogram(weight.name, weight)
        if self.write_images:
          w_img = array_ops.squeeze(weight)
          shape = w_img.get_shape()
          if len(shape) > 1 and shape[0] > shape[1]:
            w_img = array_ops.transpose(w_img)
          if len(shape) == 1:
            w_img = array_ops.expand_dims(w_img, 0)
          w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
          tf_summary.image(weight.name, w_img)
      if hasattr(layer, 'output'):
        tf_summary.histogram('{}_out'.format(layer.name), layer.output)
  self.merged = tf_summary.merge_all()
  if self.write_graph:
    self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
  else:
    self.writer = tf_summary.FileWriter(self.log_dir)
def testMergeAllSummaries(self):
  with ops.Graph().as_default():
    const = constant_op.constant(10.0)
    summ1 = summary.histogram("h", const)
    summ2 = summary.scalar("o", const, collections=["foo_key"])
    summ3 = summary.scalar("c", const)

    merge = summary.merge_all()
    self.assertEqual("MergeSummary", merge.op.type)
    self.assertEqual(2, len(merge.op.inputs))
    self.assertEqual(summ1, merge.op.inputs[0])
    self.assertEqual(summ3, merge.op.inputs[1])

    merge = summary.merge_all("foo_key")
    self.assertEqual("MergeSummary", merge.op.type)
    self.assertEqual(1, len(merge.op.inputs))
    self.assertEqual(summ2, merge.op.inputs[0])

    self.assertTrue(summary.merge_all("bar_key") is None)
def testTFSummaryImage(self):
  """Verify processing of tf.summary.image."""
  event_sink = _EventGenerator(self, zero_out_timestamps=True)
  writer = SummaryToEventTransformer(event_sink)
  with self.test_session() as sess:
    ipt = array_ops.ones([10, 4, 4, 3], dtypes.uint8)
    # This is an interesting example, because the old tf.image_summary op
    # would throw an error here, because it would be tag reuse.
    # Using the tf node name instead allows argument re-use to the image
    # summary.
    with ops.name_scope('1'):
      summary_lib.image('images', ipt, max_outputs=1)
    with ops.name_scope('2'):
      summary_lib.image('images', ipt, max_outputs=2)
    with ops.name_scope('3'):
      summary_lib.image('images', ipt, max_outputs=3)
    merged = summary_lib.merge_all()
    writer.add_graph(sess.graph)
    for i in xrange(10):
      summ = sess.run(merged)
      writer.add_summary(summ, global_step=i)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  tags = [
      u'1/images/image', u'2/images/image/0', u'2/images/image/1',
      u'3/images/image/0', u'3/images/image/1', u'3/images/image/2'
  ]
  self.assertTagsEqual(accumulator.Tags(), {
      ea.IMAGES: tags,
      ea.GRAPH: True,
      ea.META_GRAPH: False,
  })
def _summary_computed():
  with ops.Graph().as_default():
    sv = supervisor.Supervisor(is_chief=False)
    sess = sv.prepare_or_wait_for_session("")
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summ = summary.merge_all()
    sv.summary_computed(sess, sess.run(summ))
def set_model(self, model): """Sets Keras model and creates summary ops.""" self.model = model self.sess = K.get_session() # only make histogram summary op if it hasn't already been made if self.histogram_freq and self.merged is None: for layer in self.model.layers: for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) if len(shape) == 2: # dense layer kernel case if shape[0] > shape[1]: w_img = array_ops.transpose(w_img) shape = K.int_shape(w_img) w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1]) elif len(shape) == 3: # convnet case if K.image_data_format() == 'channels_last': # switch to channels_first to display # every kernel as a separate image w_img = array_ops.transpose(w_img, perm=[2, 0, 1]) shape = K.int_shape(w_img) w_img = array_ops.reshape(w_img, [shape[0], shape[1], shape[2], 1]) elif len(shape) == 1: # bias case w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1]) else: # not possible to handle 3D convnets etc. continue shape = K.int_shape(w_img) assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) if self.write_grads: for weight in layer.trainable_weights: mapped_weight_name = weight.name.replace(':', '_') grads = model.optimizer.get_gradients(model.total_loss, weight) def is_indexed_slices(grad): return type(grad).__name__ == 'IndexedSlices' grads = [grad.values if is_indexed_slices(grad) else grad for grad in grads] tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads) if hasattr(layer, 'output'): tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() if self.write_graph: self.writer = self._writer_class(self.log_dir, self.sess.graph) else: self.writer = self._writer_class(self.log_dir)
def testNoLogdirButWantSummary(self):
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir="", summary_op=None)
    sess = sv.prepare_or_wait_for_session("")
    with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
      sv.summary_computed(sess, sess.run(summ))
def testChiefCanWriteEvents(self):
  logdir = _test_dir("can_write")
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(is_chief=True, logdir=logdir, summary_op=None)
    meta_graph_def = meta_graph.create_meta_graph_def()
    sess = sv.prepare_or_wait_for_session("")
    sv.summary_computed(sess, sess.run(summ))
    sess.close()
    # Wait to make sure everything is written to file before stopping.
    time.sleep(1)
    sv.stop()

  rr = _summary_iterator(logdir)

  # The first event should list the file_version.
  ev = next(rr)
  self.assertEquals("brain.Event:2", ev.file_version)

  # The next one has the graph.
  ev = next(rr)
  ev_graph = graph_pb2.GraphDef()
  ev_graph.ParseFromString(ev.graph_def)
  self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True), ev_graph)

  # Stored MetaGraphDef
  ev = next(rr)
  ev_meta_graph = meta_graph_pb2.MetaGraphDef()
  ev_meta_graph.ParseFromString(ev.meta_graph_def)
  self.assertProtoEquals(meta_graph_def, ev_meta_graph)
  self.assertProtoEquals(
      sess.graph.as_graph_def(add_shapes=True), ev_meta_graph.graph_def)

  # The next one should have the values from the summary.
  ev = next(rr)
  self.assertProtoEquals("""
    value { tag: 'c1' simple_value: 1.0 }
    value { tag: 'c2' simple_value: 2.0 }
    value { tag: 'c3' simple_value: 3.0 }
    """, ev.summary)

  # The next one should be a stop message if we closed cleanly.
  ev = next(rr)
  self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

  # We should be done.
  self.assertRaises(StopIteration, lambda: next(rr))
def testManagedSessionKeepSummaryWriter(self):
  logdir = self._test_dir("managed_keep_summary_writer")
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir=logdir)
    with sv.managed_session(
        "", close_summary_writer=False,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    with sv.managed_session(
        "", close_summary_writer=False,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    # Now close the summary writer to flush the events.
    sv.summary_writer.close()
    # The summary iterator should report the summary twice as we reused
    # the same summary writer across the 2 sessions.
    rr = _summary_iterator(logdir)

    # The first event should list the file_version.
    ev = next(rr)
    self.assertEquals("brain.Event:2", ev.file_version)

    # The next one has the graph.
    ev = next(rr)
    self.assertTrue(ev.graph_def)

    ev = next(rr)
    self.assertTrue(ev.meta_graph_def)

    # The next one should have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # The next one should also have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # We should be done.
    self.assertRaises(StopIteration, lambda: next(rr))
def testNoLogdirButExplicitSummaryWriter(self):
  logdir = self._test_dir("explicit_summary_writer")
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sw = writer.FileWriter(logdir)
    sv = supervisor.Supervisor(logdir="", summary_op=None, summary_writer=sw)
    meta_graph_def = meta_graph.create_meta_graph_def()
    sess = sv.prepare_or_wait_for_session("")
    sv.summary_computed(sess, sess.run(summ))
    sess.close()
    # Wait to make sure everything is written to file before stopping.
    time.sleep(1)
    sv.stop()

  # Check the summary was written to 'logdir'
  rr = _summary_iterator(logdir)

  # The first event should list the file_version.
  ev = next(rr)
  self.assertEquals("brain.Event:2", ev.file_version)

  # The next one has the graph.
  ev = next(rr)
  ev_graph = graph_pb2.GraphDef()
  ev_graph.ParseFromString(ev.graph_def)
  self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True), ev_graph)

  # Stored MetaGraphDef
  ev = next(rr)
  ev_meta_graph = meta_graph_pb2.MetaGraphDef()
  ev_meta_graph.ParseFromString(ev.meta_graph_def)
  self.assertProtoEquals(meta_graph_def, ev_meta_graph)
  self.assertProtoEquals(
      sess.graph.as_graph_def(add_shapes=True), ev_meta_graph.graph_def)

  # The next one should have the values from the summary.
  ev = next(rr)
  self.assertProtoEquals("""
    value { tag: 'c1' simple_value: 1.0 }
    value { tag: 'c2' simple_value: 2.0 }
    value { tag: 'c3' simple_value: 3.0 }
    """, ev.summary)

  # The next one should be a stop message if we closed cleanly.
  ev = next(rr)
  self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

  # We should be done.
  self.assertRaises(StopIteration, lambda: next(rr))
def testManagedSessionDoNotKeepSummaryWriter(self):
  logdir = self._test_dir("managed_not_keep_summary_writer")
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir=logdir, summary_op=None)
    with sv.managed_session(
        "", close_summary_writer=True,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    # Sleep 1.2s to make sure that the next event file has a different name
    # than the current one.
    time.sleep(1.2)
    with sv.managed_session(
        "", close_summary_writer=True,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    event_paths = sorted(glob.glob(os.path.join(logdir, "event*")))
    self.assertEquals(2, len(event_paths))
    # The two event files should have the same contents.
    for path in event_paths:
      # The summary iterator should report the summary once as we closed the
      # summary writer across the 2 sessions.
      rr = summary_iterator.summary_iterator(path)
      # The first event should list the file_version.
      ev = next(rr)
      self.assertEquals("brain.Event:2", ev.file_version)

      # The next one has the graph and metagraph.
      ev = next(rr)
      self.assertTrue(ev.graph_def)

      ev = next(rr)
      self.assertTrue(ev.meta_graph_def)

      # The next one should have the values from the summary.
      # But only once.
      ev = next(rr)
      self.assertProtoEquals("""
        value { tag: 'c1' simple_value: 1.0 }
        value { tag: 'c2' simple_value: 2.0 }
        value { tag: 'c3' simple_value: 3.0 }
        """, ev.summary)

      # The next one should be a stop message if we closed cleanly.
      ev = next(rr)
      self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

      # We should be done.
      with self.assertRaises(StopIteration):
        next(rr)
def set_model(self, model):
  self.model = model
  self.sess = K.get_session()
  if self.histogram_freq and self.merged is None:
    for layer in self.model.layers:
      for weight in layer.weights:
        mapped_weight_name = weight.name.replace(':', '_')
        tf_summary.histogram(mapped_weight_name, weight)
        if self.write_grads:
          grads = model.optimizer.get_gradients(model.total_loss, weight)

          def is_indexed_slices(grad):
            return type(grad).__name__ == 'IndexedSlices'

          grads = [
              grad.values if is_indexed_slices(grad) else grad
              for grad in grads
          ]
          tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
        if self.write_images:
          w_img = array_ops.squeeze(weight)
          shape = K.int_shape(w_img)
          if len(shape) == 2:  # dense layer kernel case
            if shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
          elif len(shape) == 3:  # convnet case
            if K.image_data_format() == 'channels_last':
              # switch to channels_first to display
              # every kernel as a separate image
              w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img,
                                      [shape[0], shape[1], shape[2], 1])
          elif len(shape) == 1:  # bias case
            w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
          else:
            # not possible to handle 3D convnets etc.
            continue

          shape = K.int_shape(w_img)
          assert len(shape) == 4 and shape[-1] in [1, 3, 4]
          tf_summary.image(mapped_weight_name, w_img)

      if hasattr(layer, 'output'):
        tf_summary.histogram('{}_out'.format(layer.name), layer.output)
  self.merged = tf_summary.merge_all()
  if self.write_graph:
    self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
  else:
    self.writer = tf_summary.FileWriter(self.log_dir)
def _init_summary_op(self, summary_op=USE_DEFAULT):
  """Initializes summary_op.

  Args:
    summary_op: An Operation that returns a Summary for the event logs. If
      set to USE_DEFAULT, create an op that merges all the summaries.
  """
  if summary_op is Supervisor.USE_DEFAULT:
    summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP)
    if summary_op is None:
      summary_op = _summary.merge_all()
      if summary_op is not None:
        ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, summary_op)
  self._summary_op = summary_op
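# Hedged sketch of how the USE_DEFAULT resolution above plays out: an op that
# is already registered under GraphKeys.SUMMARY_OP takes precedence over a
# fresh merge_all(). The imports below are assumed to mirror the ones this
# Supervisor module uses.
from tensorflow.python.framework import ops
from tensorflow.python.summary import summary as _summary

merged = _summary.merge_all()
if merged is not None:
  ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, merged)
# A Supervisor constructed with the default summary_op will now reuse
# `merged` rather than building a second MergeSummary op via merge_all().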
def testManagedSessionKeepSummaryWriter(self):
  logdir = self._test_dir("managed_keep_summary_writer")
  with ops.Graph().as_default():
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir=logdir)
    with sv.managed_session(
        "", close_summary_writer=False,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    with sv.managed_session(
        "", close_summary_writer=False,
        start_standard_services=False) as sess:
      sv.summary_computed(sess, sess.run(summ))
    # Now close the summary writer to flush the events.
    sv.summary_writer.close()
    # The summary iterator should report the summary twice as we reused
    # the same summary writer across the 2 sessions.
    rr = _summary_iterator(logdir)

    # The first event should list the file_version.
    ev = next(rr)
    self.assertEquals("brain.Event:2", ev.file_version)

    # The next one has the graph.
    ev = next(rr)
    self.assertTrue(ev.graph_def)

    ev = next(rr)
    self.assertTrue(ev.meta_graph_def)

    # The next one should have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # The next one should also have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # We should be done.
    self.assertRaises(StopIteration, lambda: next(rr))
def testLogdirButExplicitlyNoSummaryWriter(self):
  logdir = self._test_dir("explicit_no_summary_writer")
  with ops.Graph().as_default():
    variables.VariableV1([1.0], name="foo")
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir=logdir, summary_writer=None)
    sess = sv.prepare_or_wait_for_session("")
    # Check that a checkpoint is still generated.
    self._wait_for_glob(sv.save_path, 3.0)
    # Check that we cannot write a summary
    with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
      sv.summary_computed(sess, sess.run(summ))
def testLogdirButExplicitlyNoSummaryWriter(self):
  logdir = self._test_dir("explicit_no_summary_writer")
  with ops.Graph().as_default():
    variables.Variable([1.0], name="foo")
    summary.scalar("c1", constant_op.constant(1))
    summary.scalar("c2", constant_op.constant(2))
    summary.scalar("c3", constant_op.constant(3))
    summ = summary.merge_all()
    sv = supervisor.Supervisor(logdir=logdir, summary_writer=None)
    sess = sv.prepare_or_wait_for_session("")
    # Check that a checkpoint is still generated.
    self._wait_for_glob(sv.save_path, 3.0)
    # Check that we cannot write a summary
    with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
      sv.summary_computed(sess, sess.run(summ))
def set_model(self, model): """ Overwrite set_model method """ super().set_model(model) if self.write_input: input_imgs = self.model.input assert len( K.int_shape(input_imgs) ) == 4, 'Should be the 4-D images tensor [batch_size,height,width,channels]' tf.summary.image('Input_Image', input_imgs, max_outputs=self.max_result_display) self.merged = tf_summary.merge_all()
def verify_scalar_summary_is_written(self, print_summary):
  value = 3
  tensor = array_ops.ones([]) * value
  name = 'my_score'
  prefix = 'eval'
  summaries.add_scalar_summary(tensor, name, prefix, print_summary)

  output_dir = tempfile.mkdtemp('scalar_summary_no_print_test')
  summary_op = summary.merge_all()

  summary_writer = summary.FileWriter(output_dir)
  with self.cached_session() as sess:
    new_summary = sess.run(summary_op)
    summary_writer.add_summary(new_summary, 1)
    summary_writer.flush()

  self.assert_scalar_summary(output_dir, {'%s/%s' % (prefix, name): value})
def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self):
  with ops.Graph().as_default():
    random_seed.set_random_seed(0)
    tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
    tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

    tf_predictions = LogisticClassifier(tf_inputs)
    loss_ops.log_loss(tf_predictions, tf_labels)
    total_loss = loss_ops.get_total_loss()
    summary.scalar('total_loss', total_loss)

    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
    train_op = learning.create_train_op(total_loss, optimizer)
    summary_op = summary.merge_all()

    with self.assertRaises(ValueError):
      learning.train(
          train_op, None, number_of_steps=300, summary_op=summary_op)
def testTFSummaryImage(self):
  """Verify processing of tf.summary.image."""
  event_sink = _EventGenerator(zero_out_timestamps=True)
  writer = SummaryToEventTransformer(event_sink)
  with self.test_session() as sess:
    ipt = array_ops.ones([10, 4, 4, 3], dtypes.uint8)
    # This is an interesting example, because the old tf.image_summary op
    # would throw an error here, because it would be tag reuse.
    # Using the tf node name instead allows argument re-use to the image
    # summary.
    with ops.name_scope('1'):
      summary_lib.image('images', ipt, max_outputs=1)
    with ops.name_scope('2'):
      summary_lib.image('images', ipt, max_outputs=2)
    with ops.name_scope('3'):
      summary_lib.image('images', ipt, max_outputs=3)
    merged = summary_lib.merge_all()
    writer.add_graph(sess.graph)
    for i in xrange(10):
      summ = sess.run(merged)
      writer.add_summary(summ, global_step=i)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  tags = [
      u'1/images/image', u'2/images/image/0', u'2/images/image/1',
      u'3/images/image/0', u'3/images/image/1', u'3/images/image/2'
  ]
  self.assertTagsEqual(
      accumulator.Tags(), {
          ea.IMAGES: tags,
          ea.AUDIO: [],
          ea.SCALARS: [],
          ea.HISTOGRAMS: [],
          ea.COMPRESSED_HISTOGRAMS: [],
          ea.GRAPH: True,
          ea.META_GRAPH: False,
          ea.RUN_METADATA: []
      })
def testTFSummaryScalar(self):
  """Verify processing of tf.summary.scalar, which uses TensorSummary op."""
  event_sink = _EventGenerator(zero_out_timestamps=True)
  writer = SummaryToEventTransformer(event_sink)
  with self.test_session() as sess:
    ipt = array_ops.placeholder(dtypes.float32)
    summary_lib.scalar('scalar1', ipt)
    summary_lib.scalar('scalar2', ipt * ipt)
    merged = summary_lib.merge_all()
    writer.add_graph(sess.graph)
    for i in xrange(10):
      summ = sess.run(merged, feed_dict={ipt: i})
      writer.add_summary(summ, global_step=i)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  seq1 = [
      ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)
  ]
  seq2 = [
      ea.ScalarEvent(wall_time=0, step=i, value=i * i) for i in xrange(10)
  ]

  self.assertTagsEqual(
      accumulator.Tags(), {
          ea.IMAGES: [],
          ea.AUDIO: [],
          ea.SCALARS: ['scalar1', 'scalar2'],
          ea.HISTOGRAMS: [],
          ea.COMPRESSED_HISTOGRAMS: [],
          ea.GRAPH: True,
          ea.META_GRAPH: False,
          ea.RUN_METADATA: []
      })

  self.assertEqual(accumulator.Scalars('scalar1'), seq1)
  self.assertEqual(accumulator.Scalars('scalar2'), seq2)
  first_value = accumulator.Scalars('scalar1')[0].value
  self.assertTrue(isinstance(first_value, float))
def verify_scalar_summary_is_written(self, print_summary):
  value = 3
  tensor = array_ops.ones([]) * value
  name = 'my_score'
  prefix = 'eval'
  summaries.add_scalar_summary(tensor, name, prefix, print_summary)

  output_dir = os.path.join(self.get_temp_dir(),
                            'scalar_summary_no_print_test')
  self.safe_create(output_dir)

  summary_op = summary.merge_all()
  summary_writer = summary.FileWriter(output_dir)
  with self.cached_session() as sess:
    new_summary = sess.run(summary_op)
    summary_writer.add_summary(new_summary, 1)
    summary_writer.flush()

  self.assert_scalar_summary(output_dir, {
      '%s/%s' % (prefix, name): value
  })
def testTFSummaryScalar(self):
  """Verify processing of summary_lib.scalar."""
  event_sink = _EventGenerator(self, zero_out_timestamps=True)
  writer = FileWriter(self.get_temp_dir())
  writer.event_writer = event_sink
  with self.test_session() as sess:
    ipt = array_ops.placeholder(dtypes.float32)
    summary_lib.scalar('scalar1', ipt)
    summary_lib.scalar('scalar2', ipt * ipt)
    merged = summary_lib.merge_all()
    writer.add_graph(sess.graph)
    for i in xrange(10):
      summ = sess.run(merged, feed_dict={ipt: i})
      writer.add_summary(summ, global_step=i)

  accumulator = ea.EventAccumulator(event_sink)
  accumulator.Reload()

  seq1 = [
      ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)
  ]
  seq2 = [
      ea.ScalarEvent(wall_time=0, step=i, value=i * i) for i in xrange(10)
  ]

  self.assertTagsEqual(
      accumulator.Tags(), {
          ea.SCALARS: ['scalar1', 'scalar2'],
          ea.GRAPH: True,
          ea.META_GRAPH: False,
      })

  self.assertEqual(accumulator.Scalars('scalar1'), seq1)
  self.assertEqual(accumulator.Scalars('scalar2'), seq2)
  first_value = accumulator.Scalars('scalar1')[0].value
  self.assertTrue(isinstance(first_value, float))
def generate_run(self, run_name):
  if run_name == self._RUN_WITH_HISTOGRAM:
    (use_histogram, use_scalars) = (True, False)
  elif run_name == self._RUN_WITH_SCALARS:
    (use_histogram, use_scalars) = (False, True)
  else:
    assert False, 'Invalid run name: %r' % run_name
  ops.reset_default_graph()
  sess = session.Session()
  placeholder = array_ops.placeholder(dtypes.float32, shape=[3])
  if use_histogram:
    summary.histogram(self._HISTOGRAM_TAG, placeholder)
  if use_scalars:
    summary.scalar(self._SCALAR_TAG, math_ops.reduce_mean(placeholder))
  summ = summary.merge_all()

  subdir = os.path.join(self.logdir, run_name)
  writer = summary.FileWriter(subdir)
  writer.add_graph(sess.graph)
  for step in xrange(self._STEPS):
    feed_dict = {placeholder: [1 + step, 2 + step, 3 + step]}
    s = sess.run(summ, feed_dict=feed_dict)
    writer.add_summary(s, global_step=step)
  writer.close()
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          session_wrapper=None,
          trace_every_n_steps=None,
          batch_size=1,
          num_examples=None,
          config_summary_list=None):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronously.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a
      dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, at which the
      loss and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the
      primary replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop if global_step is
      greater than 'number_of_steps'. If the value is left as None, training
      proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling
      `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no
      summaries should be written. If unset, we create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning.
      Note that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list
      of them. If the argument is supplied, gradient updates will be
      synchronous. If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    session_wrapper: A function that takes a `tf.Session` object as the only
      argument and returns a wrapped session object that has the same methods
      that the original object has, or `None`. Iff not `None`, the wrapped
      object will be used for training.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no
      trace information will be produced or saved.
    batch_size: The batch size.
    num_examples: The number of examples in the dataset used for training.
    config_summary_list: An optional list of config summaries that are added
      to the summary writer once training starts.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero
      when `sync_optimizer` is supplied, if `number_of_steps` is negative, or
      if `trace_every_n_steps` is not `None` and no `logdir` is provided.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')
  if not isinstance(train_op, list):
    train_op = [train_op]

  # Alternate the log function used at each step.
  log_fn_list = [log.info, log.infov]

  def _iter_log_fn():
    for log_fn in log_fn_list:
      yield log_fn

  it = itertools.cycle(_iter_log_fn())
  current_log_fn = next(it)

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')
    if trace_every_n_steps is not None:
      raise ValueError('Cannot provide trace_every_n_steps because '
                       'logdir=None')

  if isinstance(sync_optimizer,
                sync_replicas_optimizer.SyncReplicasOptimizer):
    sync_optimizer = [sync_optimizer]
  if sync_optimizer is not None and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = training_util.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if sync_optimizer is not None:
      for opt in sync_optimizer:
        if not isinstance(opt,
                          sync_replicas_optimizer.SyncReplicasOptimizer):
          raise ValueError(
              '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')

    with ops.name_scope('init_ops'):
      if init_op == _USE_DEFAULT:
        init_op = variables.global_variables_initializer()

      if ready_op == _USE_DEFAULT:
        ready_op = variables.report_uninitialized_variables()

      if local_init_op == _USE_DEFAULT:
        local_init_op = control_flow_ops.group(
            variables.local_variables_initializer(),
            lookup_ops.tables_initializer())

      if sync_optimizer is not None and isinstance(sync_optimizer, list):
        with ops.control_dependencies(
            [local_init_op] if local_init_op is not None else []):
          if is_chief:
            local_init_op = control_flow_ops.group(
                *[opt.chief_init_op for opt in sync_optimizer])
          else:
            local_init_op = control_flow_ops.group(
                *[opt.local_step_init_op for opt in sync_optimizer])
        ready_for_local_init_op = control_flow_ops.group(
            *[opt.ready_for_local_init_op for opt in sync_optimizer])
      else:
        ready_for_local_init_op = None

    if summary_op == _USE_DEFAULT:
      summary_op = summary.merge_all()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if is_chief and sync_optimizer is not None:
      # Need to create these BEFORE the supervisor finalizes the graph:
      init_tokens_op = [opt.get_init_tokens_op() for opt in sync_optimizer]
      chief_queue_runner = [
          opt.get_chief_queue_runner() for opt in sync_optimizer
      ]

    if train_step_kwargs == _USE_DEFAULT:
      with ops.name_scope('train_step'):
        train_step_kwargs = {}

        if number_of_steps:
          should_stop_op = math_ops.greater_equal(global_step,
                                                  number_of_steps)
        else:
          should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        if log_every_n_steps > 0:
          train_step_kwargs['should_log'] = math_ops.equal(
              math_ops.mod(global_step, log_every_n_steps), 0)
        if is_chief and trace_every_n_steps is not None:
          train_step_kwargs['should_trace'] = math_ops.equal(
              math_ops.mod(global_step, trace_every_n_steps), 0)
          train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(
        graph=graph,
        is_chief=is_chief,
        logdir=logdir,
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        local_init_op=local_init_op,
        ready_for_local_init_op=ready_for_local_init_op,
        ready_op=ready_op,
        summary_op=summary_op,
        summary_writer=summary_writer,
        global_step=global_step,
        saver=saver,
        save_summaries_secs=save_summaries_secs,
        save_model_secs=save_interval_secs,
        init_fn=init_fn)

    if summary_writer is not None:
      train_step_kwargs['summary_writer'] = sv.summary_writer

  steps_in_epoch = int(num_examples / batch_size)

  total_loss = 0.0
  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False,
          config=session_config) as sess:
        log.infov('Starting Session.')
        if session_wrapper is not None:
          log.info('Wrapping session with wrapper function: %s',
                   session_wrapper)
          sess = session_wrapper(sess)
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
          _wait_for_step(
              sess, global_step,
              min(startup_delay_steps, number_of_steps or sys.maxsize))
        threads = sv.start_queue_runners(sess)
        log.infov('Starting Queues.')
        if is_chief and sync_optimizer is not None:
          sv.start_queue_runners(sess, chief_queue_runner)
          sess.run(init_tokens_op)
        sess.graph.finalize()

        # try:
        if config_summary_list is not None:
          for config_summary in config_summary_list:
            sv.summary_writer.add_summary(
                config_summary.eval(session=sess))

        while not sv.should_stop():
          for _train_op in train_op:
            total_loss, should_stop, np_global_step = train_step_fn(
                sess, _train_op, global_step, train_step_kwargs, batch_size,
                steps_in_epoch, current_log_fn)
            if should_stop:
              log.infov('Stopping Training.')
              sv.request_stop()
              break
        # except errors.OutOfRangeError:
        #   # OutOfRangeError is thrown when epoch limit per
        #   # tf.train.limit_epochs is reached.
        #   log.warn('Caught OutOfRangeError. Stopping Training.')

        if logdir and sv.is_chief:
          log.warn('Finished training! Saving model to disk.')
          sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
          sv.stop(threads, close_summary_writer=True)

        def _last_checkpoint_path(sv_save_path, additional_dir_name='last'):
          dir_list = sv_save_path.split('/')
          dir_list.insert(-1, additional_dir_name)
          last_checkpoint_dir_path = '/'.join(dir_list[:-1])
          last_checkpoint_path = '/'.join(dir_list)
          return last_checkpoint_dir_path, last_checkpoint_path

        # Save the last checkpoint again to a 'last' directory for the next
        # training run with a different configuration.
        last_checkpoint_dir_path, last_checkpoint_path = (
            _last_checkpoint_path(sv.save_path, 'last'))
        if os.path.exists(last_checkpoint_dir_path):
          shutil.rmtree(last_checkpoint_dir_path)
        os.makedirs(last_checkpoint_dir_path)
        sv.saver.save(sess, last_checkpoint_path,
                      global_step=sv.global_step)

    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      log.warn('Retrying training!')
      should_retry = True

  return total_loss
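# Hedged usage sketch for the train() wrapper above. The loss construction is
# a hypothetical placeholder; `gradient_descent` and `learning` mirror the
# names used elsewhere in this collection (see the slim-style test above).
# Note that num_examples must be provided, since steps_in_epoch is computed
# as int(num_examples / batch_size) before the training loop starts.
total_loss = loss_ops.get_total_loss()  # assumes losses were registered
optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
train_op = learning.create_train_op(total_loss, optimizer)
final_loss = train(
    train_op,
    logdir='/tmp/train_logs',
    number_of_steps=1000,
    batch_size=32,
    num_examples=50000)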
def set_model(self, model):
  self.model = model
  self.sess = K.get_session()
  if self.histogram_freq and self.merged is None:
    for layer in self.model.layers:
      for weight in layer.weights:
        mapped_weight_name = weight.name.replace(':', '_')
        tf_summary.histogram(mapped_weight_name, weight)
        if self.write_grads:
          grads = model.optimizer.get_gradients(model.total_loss, weight)
          tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
        if self.write_images:
          w_img = array_ops.squeeze(weight)
          shape = K.int_shape(w_img)
          if len(shape) == 2:  # dense layer kernel case
            if shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
          elif len(shape) == 3:  # convnet case
            if K.image_data_format() == 'channels_last':
              # switch to channels_first to display
              # every kernel as a separate image
              w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
              shape = K.int_shape(w_img)
            w_img = array_ops.reshape(w_img,
                                      [shape[0], shape[1], shape[2], 1])
          elif len(shape) == 1:  # bias case
            w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
          else:
            # not possible to handle 3D convnets etc.
            continue

          shape = K.int_shape(w_img)
          assert len(shape) == 4 and shape[-1] in [1, 3, 4]
          tf_summary.image(mapped_weight_name, w_img)

      if hasattr(layer, 'output'):
        tf_summary.histogram('{}_out'.format(layer.name), layer.output)
  self.merged = tf_summary.merge_all()

  if self.write_graph:
    self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
  else:
    self.writer = tf_summary.FileWriter(self.log_dir)

  if self.embeddings_freq:
    embeddings_layer_names = self.embeddings_layer_names
    if not embeddings_layer_names:
      embeddings_layer_names = [
          layer.name for layer in self.model.layers
          if type(layer).__name__ == 'Embedding'
      ]
    embeddings = {
        layer.name: layer.weights[0]
        for layer in self.model.layers
        if layer.name in embeddings_layer_names
    }
    self.saver = saver_lib.Saver(list(embeddings.values()))

    embeddings_metadata = {}
    if not isinstance(self.embeddings_metadata, str):
      embeddings_metadata = self.embeddings_metadata
    else:
      embeddings_metadata = {
          layer_name: self.embeddings_metadata
          for layer_name in embeddings.keys()
      }

    config = projector.ProjectorConfig()
    self.embeddings_ckpt_path = os.path.join(self.log_dir,
                                             'keras_embedding.ckpt')
    for layer_name, tensor in embeddings.items():
      embedding = config.embeddings.add()
      embedding.tensor_name = tensor.name
      if layer_name in embeddings_metadata:
        embedding.metadata_path = embeddings_metadata[layer_name]
    projector.visualize_embeddings(self.writer, config)
def begin(self):
  self._summary_op = summary_lib.merge_all()
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  initial_op=None,
                  initial_op_feed_dict=None,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None,
                  hooks=None):
  """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing
      `initial_op`.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the
      `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric
      ops. By default the summary_op is set to
      tf.compat.v1.summary.merge_all().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.compat.v1.ConfigProto` that will be
      used to configure the `Session`. If left as `None`, the default will be
      used.
    hooks: A list of additional `SessionRunHook` objects to pass during the
      evaluation.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = summary.merge_all()

  all_hooks = [
      evaluation.StopAfterNEvalsHook(num_evals),
  ]

  if summary_op is not None:
    all_hooks.append(
        evaluation.SummaryAtEndHook(
            log_dir=logdir,
            summary_op=summary_op,
            feed_dict=summary_op_feed_dict))

  if hooks is not None:
    all_hooks.extend(hooks)

  saver = None
  if variables_to_restore is not None:
    saver = tf_saver.Saver(variables_to_restore)

  return evaluation.evaluate_once(
      checkpoint_path,
      master=master,
      scaffold=monitored_session.Scaffold(
          init_op=initial_op,
          init_feed_dict=initial_op_feed_dict,
          saver=saver),
      eval_ops=eval_op,
      feed_dict=eval_op_feed_dict,
      final_ops=final_op,
      final_ops_feed_dict=final_op_feed_dict,
      hooks=all_hooks,
      config=session_config)
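# Hedged usage sketch for evaluate_once(). The metric construction is a
# hypothetical placeholder (`predictions`/`labels` built elsewhere), and the
# `slim.metrics` names are assumed from the TF-Slim metrics API.
names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
    'accuracy': slim.metrics.streaming_accuracy(predictions, labels),
})
for name, value in names_to_values.items():
  summary.scalar(name, value)  # picked up by the default merge_all() above

eval_result = evaluate_once(
    master='',
    checkpoint_path='/tmp/model.ckpt-1000',
    logdir='/tmp/eval_logs',
    num_evals=100,
    eval_op=list(names_to_updates.values()))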
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    init_fn=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None,
                    timeout_fn=None,
                    hooks=None):
  """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing
      `initial_op`.
    init_fn: An optional callable to be executed after `init_op` is called.
      The callable must accept one argument, the session being initialized.
    eval_op: An operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the
      `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions.
      The value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slim's metric
      ops. By default the summary_op is set to
      tf.compat.v1.summary.merge_all().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: the max number of iterations of the
      evaluation. If the value is left as 'None', the evaluation continues
      indefinitely.
    session_config: An instance of `tf.compat.v1.ConfigProto` that will be
      used to configure the `Session`. If left as `None`, the default will be
      used.
    timeout: The maximum amount of time to wait between checkpoints. If left
      as `None`, then the process will wait indefinitely.
    timeout_fn: Optional function to call after a timeout. If the function
      returns True, then it means that no new checkpoints will be generated
      and the iterator will exit. The function is called with no arguments.
    hooks: A list of additional `SessionRunHook` objects to pass during
      repeated evaluations.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
  if summary_op == _USE_DEFAULT:
    summary_op = summary.merge_all()

  all_hooks = [
      evaluation.StopAfterNEvalsHook(num_evals),
  ]

  if summary_op is not None:
    all_hooks.append(
        evaluation.SummaryAtEndHook(
            log_dir=logdir,
            summary_op=summary_op,
            feed_dict=summary_op_feed_dict))

  if hooks is not None:
    # Add custom hooks if provided.
    all_hooks.extend(hooks)

  saver = None
  if variables_to_restore is not None:
    saver = tf_saver.Saver(variables_to_restore)

  return evaluation.evaluate_repeatedly(
      checkpoint_dir,
      master=master,
      scaffold=monitored_session.Scaffold(
          init_op=initial_op,
          init_feed_dict=initial_op_feed_dict,
          init_fn=init_fn,
          saver=saver),
      eval_ops=eval_op,
      feed_dict=eval_op_feed_dict,
      final_ops=final_op,
      final_ops_feed_dict=final_op_feed_dict,
      eval_interval_secs=eval_interval_secs,
      hooks=all_hooks,
      config=session_config,
      max_number_of_evaluations=max_number_of_evaluations,
      timeout=timeout,
      timeout_fn=timeout_fn)
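# Hedged usage sketch for evaluation_loop(): like evaluate_once() above, but
# re-evaluating as new checkpoints appear. The paths, the reuse of
# names_to_updates from the previous sketch, and the 600-second timeout are
# illustrative placeholders.
evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_logs',
    logdir='/tmp/eval_logs',
    num_evals=100,
    eval_op=list(names_to_updates.values()),
    eval_interval_secs=60,
    timeout=600,            # give up if no new checkpoint for 10 minutes
    timeout_fn=lambda: True)  # exit the loop after the first timeout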
def begin(self):
  if self._replace_summary_op:
    self._summary_op = summary.merge_all()
  self._global_step = variables.get_or_create_global_step()
def test_add_image_comparison_summaries_for_cyclegan(self):
  summaries.add_cyclegan_image_summaries(get_cyclegan_model())
  self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
  with self.test_session(use_gpu=True):
    summary.merge_all().eval()
def set_model(self, model): """Sets Keras model and creates summary ops.""" self.model = model self.sess = K.get_session() # only make histogram summary op if it hasn't already been made if self.histogram_freq and self.merged is None: for layer in self.model.layers: for weight in layer.weights: mapped_weight_name = weight.name.replace(':', '_') tf_summary.histogram(mapped_weight_name, weight) if self.write_images: w_img = array_ops.squeeze(weight) shape = K.int_shape(w_img) if len(shape) == 2: # dense layer kernel case if shape[0] > shape[1]: w_img = array_ops.transpose(w_img) shape = K.int_shape(w_img) w_img = array_ops.reshape( w_img, [1, shape[0], shape[1], 1]) elif len(shape) == 3: # convnet case if K.image_data_format() == 'channels_last': # switch to channels_first to display # every kernel as a separate image w_img = array_ops.transpose(w_img, perm=[2, 0, 1]) shape = K.int_shape(w_img) w_img = array_ops.reshape( w_img, [shape[0], shape[1], shape[2], 1]) elif len(shape) == 1: # bias case w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1]) else: # not possible to handle 3D convnets etc. continue shape = K.int_shape(w_img) assert len(shape) == 4 and shape[-1] in [1, 3, 4] tf_summary.image(mapped_weight_name, w_img) if self.write_grads: for weight in layer.trainable_weights: mapped_weight_name = weight.name.replace(':', '_') grads = model.optimizer.get_gradients( model.total_loss, weight) def is_indexed_slices(grad): return type(grad).__name__ == 'IndexedSlices' grads = [ grad.values if is_indexed_slices(grad) else grad for grad in grads ] tf_summary.histogram( '{}_grad'.format(mapped_weight_name), grads) if hasattr(layer, 'output'): if isinstance(layer.output, list): for i, output in enumerate(layer.output): tf_summary.histogram( '{}_out_{}'.format(layer.name, i), output) else: tf_summary.histogram('{}_out'.format(layer.name), layer.output) self.merged = tf_summary.merge_all() if self.write_graph: self.writer = self._writer_class(self.log_dir, self.sess.graph) else: self.writer = self._writer_class(self.log_dir) # If both embedding_freq and embeddings_data are available, we will # visualize embeddings. if self.embeddings_freq and self.embeddings_data is not None: self.embeddings_data = standardize_input_data( self.embeddings_data, model.input_names) # If embedding_layer_names are not provided, get all of the embedding # layers from the model. 
embeddings_layer_names = self.embeddings_layer_names if not embeddings_layer_names: embeddings_layer_names = [ layer.name for layer in self.model.layers if type(layer).__name__ == 'Embedding' ] self.assign_embeddings = [] embeddings_vars = {} self.batch_id = batch_id = array_ops.placeholder(dtypes.int32) self.step = step = array_ops.placeholder(dtypes.int32) for layer in self.model.layers: if layer.name in embeddings_layer_names: embedding_input = self.model.get_layer(layer.name).output embedding_size = np.prod(embedding_input.shape[1:]) embedding_input = array_ops.reshape( embedding_input, (step, int(embedding_size))) shape = (self.embeddings_data[0].shape[0], int(embedding_size)) embedding = variables.Variable(array_ops.zeros(shape), name=layer.name + '_embedding') embeddings_vars[layer.name] = embedding batch = state_ops.assign( embedding[batch_id:batch_id + step], embedding_input) self.assign_embeddings.append(batch) self.saver = saver.Saver(list(embeddings_vars.values())) # Create embeddings_metadata dictionary if isinstance(self.embeddings_metadata, str): embeddings_metadata = { layer_name: self.embeddings_metadata for layer_name in embeddings_vars.keys() } else: # If embedding_metadata is already a dictionary embeddings_metadata = self.embeddings_metadata try: from tensorboard.plugins import projector except ImportError: raise ImportError( 'Failed to import TensorBoard. Please make sure that ' 'TensorBoard integration is complete."') # TODO(psv): Add integration tests to test embedding visualization # with TensorBoard callback. We are unable to write a unit test for this # because TensorBoard dependency assumes TensorFlow package is installed. config = projector.ProjectorConfig() for layer_name, tensor in embeddings_vars.items(): embedding = config.embeddings.add() embedding.tensor_name = tensor.name if (embeddings_metadata is not None and layer_name in embeddings_metadata): embedding.metadata_path = embeddings_metadata[layer_name] projector.visualize_embeddings(self.writer, config)
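# Hedged usage sketch of the TensorBoard callback whose set_model() is shown
# above; the model and data names are hypothetical placeholders. The
# constructor arguments mirror the attributes the method reads
# (histogram_freq, write_images, embeddings_freq, embeddings_data).
callback = TensorBoard(
    log_dir='/tmp/keras_logs',
    histogram_freq=1,        # triggers the histogram/image ops built above
    write_graph=True,
    write_images=True,
    embeddings_freq=1,
    embeddings_data=x_test)  # required for the embedding branch above
model.fit(x_train, y_train,
          epochs=5,
          validation_data=(x_test, y_test),
          callbacks=[callback])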
def begin(self):
  if self._summary_op is None:
    self._summary_op = summary.merge_all()
def set_model(self, model):
  self.model = model
  self.sess = K.get_session()
  if self.histogram_freq and self.merged is None:
    for layer in self.model.layers:
      for weight in layer.weights:
        tf_summary.histogram(weight.name, weight)
        if self.write_images:
          w_img = array_ops.squeeze(weight)
          shape = w_img.get_shape()
          if len(shape) > 1 and shape[0] > shape[1]:
            w_img = array_ops.transpose(w_img)
          if len(shape) == 1:
            w_img = array_ops.expand_dims(w_img, 0)
          w_img = array_ops.expand_dims(
              array_ops.expand_dims(w_img, 0), -1)
          tf_summary.image(weight.name, w_img)
      if hasattr(layer, 'output'):
        tf_summary.histogram('{}_out'.format(layer.name), layer.output)
  self.merged = tf_summary.merge_all()
  if self.write_graph:
    self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
  else:
    self.writer = tf_summary.FileWriter(self.log_dir)
  if self.embeddings_freq:
    self.saver = saver_lib.Saver()
    embeddings_layer_names = self.embeddings_layer_names
    if not embeddings_layer_names:
      embeddings_layer_names = [
          layer.name for layer in self.model.layers
          if type(layer).__name__ == 'Embedding'
      ]
    embeddings = {
        layer.name: layer.weights[0]
        for layer in self.model.layers
        if layer.name in embeddings_layer_names
    }
    embeddings_metadata = {}
    if not isinstance(self.embeddings_metadata, str):
      embeddings_metadata = self.embeddings_metadata
    else:
      embeddings_metadata = {
          layer_name: self.embeddings_metadata
          for layer_name in embeddings.keys()
      }
    config = projector.ProjectorConfig()
    self.embeddings_logs = []
    for layer_name, tensor in embeddings.items():
      embedding = config.embeddings.add()
      embedding.tensor_name = tensor.name
      self.embeddings_logs.append(
          os.path.join(self.log_dir, layer_name + '.ckpt'))
      if layer_name in embeddings_metadata:
        embedding.metadata_path = embeddings_metadata[layer_name]
    projector.visualize_embeddings(self.writer, config)
def begin(self):
  if self._replace_summary_op:
    # This can still remain None if there are no summaries.
    self._summary_op = summary.merge_all()
  self._global_step = training_util.get_or_create_global_step()
def build_controller(self): """RL optimization interface. Returns: ops: A dictionary holding handles of the model used for training. """ self._global_step = training_util.get_or_create_global_step() ops = {} ops["loss"] = 0 failing_signal = self.compute_reward(self.hparams.failing_signal) ctr = {} with tf_ops.name_scope("controller_{}".format(self.ctrl_id)): with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)): ctr["reward"] = {"value": [], "ph": [], "update": []} ctr["ready"] = {"value": [], "ph": [], "update": []} ctr["best_reward"] = {"value": [], "update": []} for i in range(self.hparams.num_children): reward_value = variable_scope.get_local_variable( "reward_{}".format(i), initializer=0.0, dtype=dtypes.float32, trainable=False) reward_ph = array_ops.placeholder( dtypes.float32, shape=(), name="reward_ph_{}".format(i)) reward_update = state_ops.assign( reward_value, reward_ph, use_locking=True) ctr["reward"]["value"].append(reward_value) ctr["reward"]["ph"].append(reward_ph) ctr["reward"]["update"].append(reward_update) best_reward = variable_scope.get_local_variable( "best_reward_{}".format(i), initializer=failing_signal, dtype=dtypes.float32, trainable=False) ctr["best_reward"]["value"].append(best_reward) ctr["best_reward"]["update"].append( state_ops.assign(best_reward, math_ops.minimum(best_reward, reward_update))) ready_value = variable_scope.get_local_variable( "ready_{}".format(i), initializer=True, dtype=dtypes.bool, trainable=False) ready_ph = array_ops.placeholder( dtypes.bool, shape=(), name="ready_ph_{}".format(i)) ready_update = state_ops.assign( ready_value, ready_ph, use_locking=True) ctr["ready"]["value"].append(ready_value) ctr["ready"]["ph"].append(ready_ph) ctr["ready"]["update"].append(ready_update) ctr["grouping_y_preds"], ctr["grouping_log_probs"] = self.get_groupings() summary.histogram( "grouping_actions", array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0], [1, array_ops.shape(self.op_embeddings)[0]])) with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)): ctr["baseline"] = variable_scope.get_local_variable( "baseline", initializer=failing_signal if self.hparams.start_with_failing_signal else 0.0, dtype=dtypes.float32, trainable=False) new_baseline = self.hparams.bl_dec * ctr["baseline"] + ( 1 - self.hparams.bl_dec) * math_ops.reduce_mean( ctr["reward"]["value"]) if not self.hparams.always_update_baseline: baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal) selected_reward = array_ops.boolean_mask(ctr["reward"]["value"], baseline_mask) selected_baseline = control_flow_ops.cond( math_ops.reduce_any(baseline_mask), lambda: math_ops.reduce_mean(selected_reward), lambda: constant_op.constant(0, dtype=dtypes.float32)) ctr["pos_reward"] = selected_baseline pos_ = math_ops.less( constant_op.constant(0, dtype=dtypes.float32), selected_baseline) selected_baseline = self.hparams.bl_dec * ctr["baseline"] + ( 1 - self.hparams.bl_dec) * selected_baseline selected_baseline = control_flow_ops.cond( pos_, lambda: selected_baseline, lambda: ctr["baseline"]) new_baseline = control_flow_ops.cond( math_ops.less(self.global_step, self.hparams.stop_updating_after_steps), lambda: new_baseline, lambda: selected_baseline) ctr["baseline_update"] = state_ops.assign( ctr["baseline"], new_baseline, use_locking=True) ctr["y_preds"], ctr["log_probs"] = self.get_placements() summary.histogram("actions", ctr["y_preds"]["sample"]) mask = math_ops.less(ctr["reward"]["value"], failing_signal) ctr["loss"] = ctr["reward"]["value"] - 
ctr["baseline"] ctr["loss"] *= ( ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"]) selected_loss = array_ops.boolean_mask(ctr["loss"], mask) selected_loss = control_flow_ops.cond( math_ops.reduce_any(mask), lambda: math_ops.reduce_mean(-selected_loss), lambda: constant_op.constant(0, dtype=dtypes.float32)) ctr["loss"] = control_flow_ops.cond( math_ops.less(self.global_step, self.hparams.stop_updating_after_steps), lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss) ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"]) summary.scalar("loss", ctr["loss"]) summary.scalar("avg_reward", ctr["reward_s"]) summary.scalar("best_reward_so_far", best_reward) summary.scalar( "advantage", math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"])) with variable_scope.variable_scope( "optimizer", reuse=variable_scope.AUTO_REUSE): (ctr["train_op"], ctr["lr"], ctr["grad_norm"], ctr["grad_norms"]) = self._get_train_ops( ctr["loss"], tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES), self.global_step, grad_bound=self.hparams.grad_bound, lr_init=self.hparams.lr, lr_dec=self.hparams.lr_dec, start_decay_step=self.hparams.start_decay_step, decay_steps=self.hparams.decay_steps, optimizer_type=self.hparams.optimizer_type) summary.scalar("gradnorm", ctr["grad_norm"]) summary.scalar("lr", ctr["lr"]) ctr["summary"] = summary.merge_all() ops["controller"] = ctr self.ops = ops return ops
def testAddRegularizationLossSummaries(self): summaries.add_regularization_loss_summaries(get_gan_model()) self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES))) with self.test_session(use_gpu=True): summary.merge_all().eval()
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, session_wrapper=None, trace_every_n_steps=None, ignore_live_threads=False): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronously. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step are logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then training_util.get_or_create_global_step(), that is, tf.contrib.framework.global_step() is used. number_of_steps: The max number of gradient steps to take during training, as measured by 'global_step': training will stop if global_step is greater than 'number_of_steps'. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.tables_initializer()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of them. If the argument is supplied, gradient updates will be synchronous. 
If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. session_wrapper: A function that takes a `tf.Session` object as the only argument and returns a wrapped session object that has the same methods that the original object has, or `None`. Iff not `None`, the wrapped object will be used for training. trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. ignore_live_threads: If `True` ignores threads that remain running after a grace period when stopping the supervisor, instead of raising a RuntimeError. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): sync_optimizer = [sync_optimizer] if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.') if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = training_util.get_or_create_global_step() saver = saver or tf_saver.Saver() if sync_optimizer is not None: for opt in sync_optimizer: if not isinstance(opt, sync_replicas_optimizer.SyncReplicasOptimizer): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.') with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( variables.local_variables_initializer(), lookup_ops.tables_initializer()) if sync_optimizer is not None and isinstance(sync_optimizer, list): with ops.control_dependencies([local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = control_flow_ops.group( *[opt.chief_init_op for opt in sync_optimizer]) else: local_init_op = control_flow_ops.group( *[opt.local_step_init_op for opt in sync_optimizer]) ready_for_local_init_op = control_flow_ops.group( *[opt.ready_for_local_init_op for opt in sync_optimizer]) else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT if is_chief and sync_optimizer is not None: # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = [opt.get_init_tokens_op() for opt in sync_optimizer] chief_queue_runner = [ opt.get_chief_queue_runner() for opt in sync_optimizer] if train_step_kwargs == _USE_DEFAULT: with 
ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal(global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op if log_every_n_steps > 0: train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor( graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer total_loss = None should_retry = True while should_retry: try: should_retry = False with sv.managed_session( master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if session_wrapper is not None: logging.info( 'Wrapping session with wrapper function: %s', session_wrapper) sess = session_wrapper(sess) if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: # (use sys.maxsize because sys.maxint doesn't exist in Python 3) _wait_for_step(sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxsize)) threads = sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, chief_queue_runner) sess.run(init_tokens_op) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') sv.request_stop() break except errors.OutOfRangeError as e: # OutOfRangeError is thrown when epoch limit per # tf.train.limit_epochs is reached. logging.info('Caught OutOfRangeError. Stopping Training. %s', e) if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) sv.stop( threads, close_summary_writer=True, ignore_live_threads=ignore_live_threads) except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
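# A minimal usage sketch for the training loop above, assuming it is exposed as slim.learning.train (TF 1.x contrib-era API); the regression graph and logdir are made up for the example.
import tensorflow as tf
slim = tf.contrib.slim

x = tf.random_normal([32, 1])
y = 3.0 * x + 1.0
w = tf.get_variable('w', [1], initializer=tf.zeros_initializer())
b = tf.get_variable('b', [1], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.square(y - (w * x + b)))
tf.summary.scalar('loss', loss)  # picked up by the default merge_all()

optimizer = tf.train.GradientDescentOptimizer(0.1)
# create_train_op wires the loss and optimizer into a single train_op.
train_op = slim.learning.create_train_op(loss, optimizer)

final_loss = slim.learning.train(
    train_op,
    logdir='/tmp/train_logs',  # example path
    number_of_steps=100,
    save_summaries_secs=60)
print('final loss:', final_loss)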
def evaluation_loop(master, checkpoint_dir, logdir, num_evals=1, initial_op=None, initial_op_feed_dict=None, eval_op=None, eval_op_feed_dict=None, final_op=None, final_op_feed_dict=None, summary_op=_USE_DEFAULT, summary_op_feed_dict=None, variables_to_restore=None, eval_interval_secs=60, max_number_of_evaluations=None, session_config=None, timeout=None, hooks=None): """Runs TF-Slim's Evaluation Loop. Args: master: The BNS address of the TensorFlow master. checkpoint_dir: The directory where checkpoints are stored. logdir: The directory where the TensorFlow summaries are written to. num_evals: The number of times to run `eval_op`. initial_op: An operation run at the beginning of evaluation. initial_op_feed_dict: A feed dictionary to use when executing `initial_op`. eval_op: An operation run `num_evals` times. eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`. final_op: An operation to execute after all of the `eval_op` executions. The value of `final_op` is returned. final_op_feed_dict: A feed dictionary to use when executing `final_op`. summary_op: The summary_op to evaluate after running TF-Slim's metric ops. By default the summary_op is set to tf.summary.merge_all(). summary_op_feed_dict: An optional feed dictionary to use when running the `summary_op`. variables_to_restore: A list of TensorFlow variables to restore during evaluation. If the argument is left as `None` then slim.variables.GetVariablesToRestore() is used. eval_interval_secs: The minimum number of seconds between evaluations. max_number_of_evaluations: the max number of iterations of the evaluation. If the value is left as `None`, the evaluation continues indefinitely. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. timeout: The maximum amount of time to wait between checkpoints. If left as `None`, then the process will wait indefinitely. hooks: Hooks to use while evaluating. Returns: The value of `final_op` or `None` if `final_op` is `None`. """ if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if hooks is None: hooks = [] hooks.append(evaluation.StopAfterNEvalsHook(num_evals)) if summary_op is not None: hooks.append(evaluation.SummaryAtEndHook( log_dir=logdir, summary_op=summary_op, feed_dict=summary_op_feed_dict)) saver = None if variables_to_restore is not None: saver = tf_saver.Saver(variables_to_restore) return evaluation.evaluate_repeatedly( checkpoint_dir, master=master, scaffold=monitored_session.Scaffold( init_op=initial_op, init_feed_dict=initial_op_feed_dict, saver=saver), eval_ops=eval_op, feed_dict=eval_op_feed_dict, final_ops=final_op, final_ops_feed_dict=final_op_feed_dict, eval_interval_secs=eval_interval_secs, hooks=hooks, config=session_config, max_number_of_evaluations=max_number_of_evaluations, timeout=timeout)
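# Companion sketch for evaluation_loop() above (TF 1.x slim-style); the tiny model, paths, and metric are illustrative.
import tensorflow as tf
slim = tf.contrib.slim

images = tf.random_normal([8, 10])
labels = tf.random_uniform([8], maxval=5, dtype=tf.int64)
logits = tf.layers.dense(images, 5)
predictions = tf.argmax(logits, 1)

# tf.metrics.accuracy returns (value_op, update_op); update_op is run
# num_evals times, and the summary is written once at the end by the
# SummaryAtEndHook that evaluation_loop appends.
accuracy, update_op = tf.metrics.accuracy(labels, predictions)
tf.summary.scalar('eval/accuracy', accuracy)

slim.evaluation.evaluation_loop(
    master='',
    checkpoint_dir='/tmp/train_logs',  # where the training loop checkpoints
    logdir='/tmp/eval_logs',           # example path
    num_evals=10,
    eval_op=update_op,
    max_number_of_evaluations=1)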
def set_model(self, model): """Sets Keras model and creates summary ops.""" self.model = model self._init_writer(model) # histogram summaries only enabled in graph mode if not context.executing_eagerly(): self._make_histogram_ops(model) self.merged = tf_summary.merge_all() # If both embedding_freq and embeddings_data are available, we will # visualize embeddings. if self.embeddings_freq and self.embeddings_data is not None: # Avoid circular dependency. from tensorflow.python.keras.engine import training_utils # pylint: disable=g-import-not-at-top self.embeddings_data = training_utils.standardize_input_data( self.embeddings_data, model.input_names) # If embedding_layer_names are not provided, get all of the embedding # layers from the model. embeddings_layer_names = self.embeddings_layer_names if not embeddings_layer_names: embeddings_layer_names = [ layer.name for layer in self.model.layers if type(layer).__name__ == 'Embedding' ] self.assign_embeddings = [] embeddings_vars = {} self.batch_id = batch_id = array_ops.placeholder(dtypes.int32) self.step = step = array_ops.placeholder(dtypes.int32) for layer in self.model.layers: if layer.name in embeddings_layer_names: embedding_input = self.model.get_layer(layer.name).output embedding_size = np.prod(embedding_input.shape[1:]) embedding_input = array_ops.reshape( embedding_input, (step, int(embedding_size))) shape = (self.embeddings_data[0].shape[0], int(embedding_size)) embedding = variables.Variable(array_ops.zeros(shape), name=layer.name + '_embedding') embeddings_vars[layer.name] = embedding batch = state_ops.assign( embedding[batch_id:batch_id + step], embedding_input) self.assign_embeddings.append(batch) self.saver = saver.Saver(list(embeddings_vars.values())) # Create embeddings_metadata dictionary if isinstance(self.embeddings_metadata, str): embeddings_metadata = { layer_name: self.embeddings_metadata for layer_name in embeddings_vars.keys() } else: # If embedding_metadata is already a dictionary embeddings_metadata = self.embeddings_metadata try: from tensorboard.plugins import projector except ImportError: raise ImportError( 'Failed to import TensorBoard. Please make sure that ' 'TensorBoard integration is complete.') # TODO(psv): Add integration tests to test embedding visualization # with TensorBoard callback. We are unable to write a unit test for this # because TensorBoard dependency assumes TensorFlow package is installed. config = projector.ProjectorConfig() for layer_name, tensor in embeddings_vars.items(): embedding = config.embeddings.add() embedding.tensor_name = tensor.name if (embeddings_metadata is not None and layer_name in embeddings_metadata): embedding.metadata_path = embeddings_metadata[layer_name] projector.visualize_embeddings(self.writer, config)
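# set_model() above belongs to the TF 1.x Keras TensorBoard callback and is invoked for you by Model.fit(). A sketch of how the embedding branch gets exercised (model, data, and log_dir are illustrative; embeddings_freq and embeddings_data are arguments of the TF 1.x callback).
import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=100, output_dim=8, input_length=4),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')

x = np.random.randint(0, 100, size=(64, 4))
y = np.random.randint(0, 2, size=(64, 1))

callback = tf.keras.callbacks.TensorBoard(
    log_dir='/tmp/keras_logs',
    histogram_freq=1,    # enables the _make_histogram_ops branch
    embeddings_freq=1,   # with embeddings_data, enables the projector branch
    embeddings_data=x)
model.fit(x, y, epochs=2, callbacks=[callback])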
def train(train_op, logdir, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_steps=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, trace_every_n_steps=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronously. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step are logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_steps: The max number of gradient steps to take during training. If the value is left as None, training proceeds indefinitely. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.tables_initializer()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. 
trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.' ) if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = tf_variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = tf_variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), data_flow_ops.tables_initializer()) if sync_optimizer is not None and isinstance( sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): with ops.control_dependencies( [local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = sync_optimizer.chief_init_op else: local_init_op = sync_optimizer.local_step_init_op ready_for_local_init_op = sync_optimizer.ready_for_local_init_op else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT if is_chief and sync_optimizer is not None: if not isinstance(sync_optimizer, (sync_replicas_optimizer.SyncReplicasOptimizer)): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.' 
) # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = sync_optimizer.get_init_tokens_op() chief_queue_runner = sync_optimizer.get_chief_queue_runner() if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal( global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir sv = supervisor.Supervisor(graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer should_retry = True while should_retry: try: should_retry = False with sv.managed_session(master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step( sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) sess.run(init_tokens_op) try: while not sv.should_stop(): total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') break except errors.OutOfRangeError: # OutOfRangeError is thrown when epoch limit per # tf.train.limit_epochs is reached. logging.info('Caught OutOfRangeError. Stopping Training.') if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
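# This older train() variant accepts a single sync_optimizer rather than a list. A hedged sketch of constructing one (TF 1.x API); the replica counts, learning rate, and toy loss are example values.
import tensorflow as tf

w = tf.get_variable('w', [], initializer=tf.zeros_initializer())
loss = tf.square(w - 1.0)
global_step = tf.train.get_or_create_global_step()

base_opt = tf.train.GradientDescentOptimizer(0.1)
sync_opt = tf.train.SyncReplicasOptimizer(
    base_opt,
    replicas_to_aggregate=4,  # average gradients from 4 workers per step
    total_num_replicas=4)
train_op = sync_opt.minimize(loss, global_step=global_step)

# The loop above then pulls chief_init_op, local_step_init_op,
# get_init_tokens_op(), and get_chief_queue_runner() off this object.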
def train(train_op, logdir, metric_op=None, metric_collection_name=None, train_step_fn=train_step, train_step_kwargs=_USE_DEFAULT, log_every_n_steps=1, graph=None, master='', is_chief=True, global_step=None, number_of_samples=None, number_of_steps=None, number_of_epochs=None, batch_size=None, init_op=_USE_DEFAULT, init_feed_dict=None, local_init_op=_USE_DEFAULT, init_fn=None, ready_op=_USE_DEFAULT, summary_op=_USE_DEFAULT, save_summaries_secs=600, summary_writer=_USE_DEFAULT, startup_delay_steps=0, saver=None, save_interval_secs=600, sync_optimizer=None, session_config=None, trace_every_n_steps=None): """Runs a training loop using a TensorFlow supervisor. When the sync_optimizer is supplied, gradient updates are applied synchronously. Otherwise, gradient updates are applied asynchronously. Args: train_op: A `Tensor` that, when executed, will apply the gradients and return the loss value. metric_op: A `Tensor` that, when executed, will update the streaming_metrics ops. metric_collection_name: The name associated with the metric_op. logdir: The directory where training logs are written to. If None, model checkpoints and summaries will not be written. train_step_fn: The function to call in order to execute a single gradient step. The function must take exactly four arguments: the current session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary. train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By default, two `Boolean`, scalar ops called "should_stop" and "should_log" are provided. log_every_n_steps: The frequency, in terms of global steps, that the loss and global step are logged. graph: The graph to pass to the supervisor. If no graph is supplied the default graph is used. master: The address of the tensorflow master. is_chief: Specifies whether or not the training is being run by the primary replica during replica training. global_step: The `Tensor` representing the global step. If left as `None`, then slim.variables.get_or_create_global_step() is used. number_of_samples: The total number of samples in the training set. number_of_steps: The max number of gradient steps to take during training, as measured by 'global_step': training will stop if global_step is greater than 'number_of_steps'. If the value is left as None, training proceeds indefinitely. number_of_epochs: The total number of epochs per training. batch_size: The number of samples in each batch. init_op: The initialization operation. If left to its default value, then the session is initialized by calling `tf.global_variables_initializer()`. init_feed_dict: A feed dictionary to use when executing the `init_op`. local_init_op: The local initialization operation. If left to its default value, then the session is initialized by calling `tf.local_variables_initializer()` and `tf.tables_initializer()`. init_fn: An optional callable to be executed after `init_op` is called. The callable must accept one argument, the session being initialized. ready_op: Operation to check if the model is ready to use. If left to its default value, then the session checks for readiness by calling `tf.report_uninitialized_variables()`. summary_op: The summary operation. save_summaries_secs: How often, in seconds, to save summaries. summary_writer: `SummaryWriter` to use. Can be `None` to indicate that no summaries should be written. If unset, we create a SummaryWriter. startup_delay_steps: The number of steps to wait for before beginning. Note that this must be 0 if a sync_optimizer is supplied. saver: Saver to save checkpoints. If None, a default one will be created and used. 
save_interval_secs: How often, in seconds, to save the model to `logdir`. sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the argument is supplied, gradient updates will be synchronous. If left as `None`, gradient updates will be asynchronous. session_config: An instance of `tf.ConfigProto` that will be used to configure the `Session`. If left as `None`, the default will be used. trace_every_n_steps: produce and save a `Timeline` in Chrome trace format and add it to the summaries every `trace_every_n_steps`. If None, no trace information will be produced or saved. Returns: the value of the loss function after training. Raises: ValueError: if `train_op` is empty or if `startup_delay_steps` is non-zero when `sync_optimizer` is supplied, if `number_of_steps` is negative, or if `trace_every_n_steps` is not `None` and no `logdir` is provided. """ # Check if the calculation of some metrics is desired. if metric_op is not None: # Check the necessary requirements if metric_collection_name is None: raise ValueError('metric_collection_name must be fed and cannot be None') if number_of_samples is None: raise ValueError('number_of_samples must be fed and cannot be None') if number_of_steps is None: raise ValueError('number_of_steps must be fed and cannot be None') if number_of_epochs is None: raise ValueError('number_of_epochs must be fed and cannot be None') if batch_size is None: raise ValueError('batch_size must be fed and cannot be None') if train_op is None: raise ValueError('train_op cannot be None.') if logdir is None: if summary_op != _USE_DEFAULT: raise ValueError('Cannot provide summary_op because logdir=None') if saver is not None: raise ValueError('Cannot provide saver because logdir=None') if trace_every_n_steps is not None: raise ValueError('Cannot provide trace_every_n_steps because ' 'logdir=None') if sync_optimizer is not None and startup_delay_steps > 0: raise ValueError( 'startup_delay_steps must be zero when sync_optimizer is supplied.') if number_of_steps is not None and number_of_steps <= 0: raise ValueError( '`number_of_steps` must be either None or a positive number.') graph = graph or ops.get_default_graph() with graph.as_default(): if global_step is None: global_step = variables.get_or_create_global_step() saver = saver or tf_saver.Saver() with ops.name_scope('init_ops'): if init_op == _USE_DEFAULT: init_op = tf_variables.global_variables_initializer() if ready_op == _USE_DEFAULT: ready_op = tf_variables.report_uninitialized_variables() if local_init_op == _USE_DEFAULT: local_init_op = control_flow_ops.group( tf_variables.local_variables_initializer(), lookup_ops.tables_initializer()) if sync_optimizer is not None and isinstance( sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer): with ops.control_dependencies([local_init_op] if local_init_op is not None else []): if is_chief: local_init_op = sync_optimizer.chief_init_op else: local_init_op = sync_optimizer.local_step_init_op ready_for_local_init_op = sync_optimizer.ready_for_local_init_op else: ready_for_local_init_op = None if summary_op == _USE_DEFAULT: summary_op = summary.merge_all() if summary_writer == _USE_DEFAULT: summary_writer = supervisor.Supervisor.USE_DEFAULT if is_chief and sync_optimizer is not None: if not isinstance(sync_optimizer, (sync_replicas_optimizer.SyncReplicasOptimizer)): raise ValueError( '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.') # Need to create these BEFORE the supervisor finalizes the graph: init_tokens_op = sync_optimizer.get_init_tokens_op() 
chief_queue_runner = sync_optimizer.get_chief_queue_runner() if train_step_kwargs == _USE_DEFAULT: with ops.name_scope('train_step'): train_step_kwargs = {} if number_of_steps: should_stop_op = math_ops.greater_equal(global_step, number_of_steps) else: should_stop_op = constant_op.constant(False) train_step_kwargs['should_stop'] = should_stop_op train_step_kwargs['should_log'] = math_ops.equal( math_ops.mod(global_step, log_every_n_steps), 0) if is_chief and trace_every_n_steps is not None: train_step_kwargs['should_trace'] = math_ops.equal( math_ops.mod(global_step, trace_every_n_steps), 0) train_step_kwargs['logdir'] = logdir if number_of_samples is not None and batch_size is not None: train_step_kwargs['num_batches_per_epoch'] = int(number_of_samples / float(batch_size)) if number_of_steps is not None and number_of_epochs is not None: train_step_kwargs['num_steps_per_epoch'] = int(number_of_steps / float(number_of_epochs)) # If metric calculation is desired. if metric_op is not None: # The reset_op reinitializes the streaming variables (streaming_accuracy, ...). # It must be defined here because the supervisor finalizes the graph, after which no new ops can be added. # Calling reset_op in the train_step function sets the total & count variables back to zero after each epoch, # which yields an averaged accuracy per epoch and makes it easy to see whether training is still improving. stream_vars = [i for i in tf.local_variables() if i.name.split('/')[1] == metric_collection_name] reset_op = tf.variables_initializer(stream_vars) sv = supervisor.Supervisor( graph=graph, is_chief=is_chief, logdir=logdir, init_op=init_op, init_feed_dict=init_feed_dict, local_init_op=local_init_op, ready_for_local_init_op=ready_for_local_init_op, ready_op=ready_op, summary_op=summary_op, summary_writer=summary_writer, global_step=global_step, saver=saver, save_summaries_secs=save_summaries_secs, save_model_secs=save_interval_secs, init_fn=init_fn) if summary_writer is not None: train_step_kwargs['summary_writer'] = sv.summary_writer should_retry = True while should_retry: try: should_retry = False with sv.managed_session( master, start_standard_services=False, config=session_config) as sess: logging.info('Starting Session.') if is_chief: if logdir: sv.start_standard_services(sess) elif startup_delay_steps > 0: _wait_for_step(sess, global_step, min(startup_delay_steps, number_of_steps or sys.maxint)) sv.start_queue_runners(sess) logging.info('Starting Queues.') if is_chief and sync_optimizer is not None: sv.start_queue_runners(sess, [chief_queue_runner]) sess.run(init_tokens_op) try: while not sv.should_stop(): if metric_op is not None: total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs, metric_op, reset_op) else: total_loss, should_stop = train_step_fn( sess, train_op, global_step, train_step_kwargs) if should_stop: logging.info('Stopping Training.') break except errors.OutOfRangeError: # OutOfRangeError is thrown when epoch limit per # tf.train.limit_epochs is reached. logging.info('Caught OutOfRangeError. Stopping Training.') if logdir and sv.is_chief: logging.info('Finished training! Saving model to disk.') sv.saver.save(sess, sv.save_path, global_step=sv.global_step) except errors.AbortedError: # Always re-run on AbortedError as it indicates a restart of one of the # distributed tensorflow servers. logging.info('Retrying training!') should_retry = True return total_loss
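# The stream_vars/reset_op trick above works because tf.metrics.* create their running total/count as *local* variables under the metric's name scope. A standalone sketch of that pattern (TF 1.x graph mode; tensor values and the 'my_metrics' scope are illustrative).
import tensorflow as tf

labels = tf.constant([1, 0, 1, 1])
predictions = tf.constant([1, 1, 1, 0])

# Creates local variables my_metrics/total and my_metrics/count.
accuracy, update_op = tf.metrics.accuracy(labels, predictions,
                                          name='my_metrics')

# Collect the metric's local variables by scope and build a reset op,
# mirroring the stream_vars filter in the training loop above.
stream_vars = [v for v in tf.local_variables()
               if v.name.startswith('my_metrics/')]
reset_op = tf.variables_initializer(stream_vars)

with tf.Session() as sess:
  sess.run(tf.local_variables_initializer())
  sess.run(update_op)
  print(sess.run(accuracy))  # running accuracy over everything seen so far
  sess.run(reset_op)         # zero total/count, e.g. at an epoch boundary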
def begin(self): if self._summary_writer is None and self._log_dir: self._summary_writer = summary.FileWriterCache.get(self._log_dir) if self._summary_op is None: self._summary_op = summary.merge_all()
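# Like the first hook in this section, this begin() resolves its summary op lazily; it also pulls the writer from FileWriterCache so every hook logging to the same directory shares one events file. Minimal sketch (TF 1.x; the directory is an example path).
import tensorflow.compat.v1 as tf

w1 = tf.summary.FileWriterCache.get('/tmp/example_logs')
w2 = tf.summary.FileWriterCache.get('/tmp/example_logs')
assert w1 is w2  # the cache returns one shared writer per directory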