Exemplo n.º 1
0
  def testAddImageComparisonSummaries(self):
    summaries.add_image_comparison_summaries(
        get_gan_model(), display_diffs=True)

    self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
    with self.test_session(use_gpu=True):
      summary.merge_all().eval()
Exemplo n.º 2
0
  def testAddGanModelSummaries(self):
    summaries.add_gan_model_summaries(get_gan_model())

    self.assertEquals(3, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
    with self.test_session(use_gpu=True):
      variables.global_variables_initializer().run()
      summary.merge_all().eval()
Exemplo n.º 3
0
  def _test_add_regularization_loss_summaries_impl(self, get_model_fn,
                                                   expected_num_summary_ops):
    summaries.add_regularization_loss_summaries(get_model_fn())

    self.assertEquals(expected_num_summary_ops,
                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
    with self.test_session(use_gpu=True):
      summary.merge_all().eval()
Exemplo n.º 4
0
  def _test_add_image_comparison_summaries_impl(self, get_model_fn,
                                                expected_num_summary_ops):
    summaries.add_image_comparison_summaries(get_model_fn(), display_diffs=True)

    self.assertEquals(expected_num_summary_ops,
                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
    with self.test_session(use_gpu=True):
      summary.merge_all().eval()
Exemplo n.º 5
0
  def test_add_image_comparison_summaries_for_stargan(self):

    summaries.add_stargan_image_summaries(get_stargan_model())

    self.assertEquals(1, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))

    with self.test_session(use_gpu=True) as sess:
      sess.run(variables.global_variables_initializer())
      summary.merge_all().eval()
Exemplo n.º 6
0
  def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                         expected_num_summary_ops):
    summaries.add_gan_model_summaries(get_model_fn())

    self.assertEquals(expected_num_summary_ops,
                      len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
    with self.test_session(use_gpu=True):
      variables.global_variables_initializer().run()
      summary.merge_all().eval()
Exemplo n.º 7
0
  def testClassicSummaryOpsErrorOut(self):
    x = constant_op.constant(42)
    x_summary = summary.scalar('x', x)
    y = constant_op.constant([1, 3, 3, 7])
    y_summary = summary.histogram('hist', y)

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge([x_summary, y_summary])

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge_all()
Exemplo n.º 8
0
  def testTFSummaryScalar(self):
    """Verify processing of tf.summary.scalar."""
    event_sink = _EventGenerator(self, zero_out_timestamps=True)
    writer = SummaryToEventTransformer(event_sink)
    with self.test_session() as sess:
      ipt = array_ops.placeholder(dtypes.float32)
      summary_lib.scalar('scalar1', ipt)
      summary_lib.scalar('scalar2', ipt * ipt)
      merged = summary_lib.merge_all()
      writer.add_graph(sess.graph)
      for i in xrange(10):
        summ = sess.run(merged, feed_dict={ipt: i})
        writer.add_summary(summ, global_step=i)

    accumulator = ea.EventAccumulator(event_sink)
    accumulator.Reload()

    seq1 = [ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)]
    seq2 = [
        ea.ScalarEvent(
            wall_time=0, step=i, value=i * i) for i in xrange(10)
    ]

    self.assertTagsEqual(accumulator.Tags(), {
        ea.SCALARS: ['scalar1', 'scalar2'],
        ea.GRAPH: True,
        ea.META_GRAPH: False,
    })

    self.assertEqual(accumulator.Scalars('scalar1'), seq1)
    self.assertEqual(accumulator.Scalars('scalar2'), seq2)
    first_value = accumulator.Scalars('scalar1')[0].value
    self.assertTrue(isinstance(first_value, float))
Exemplo n.º 9
0
  def testTFSummaryTensor(self):
    """Verify processing of tf.summary.tensor."""
    event_sink = _EventGenerator(self, zero_out_timestamps=True)
    writer = SummaryToEventTransformer(event_sink)
    with self.test_session() as sess:
      summary_lib.tensor_summary('scalar', constant_op.constant(1.0))
      summary_lib.tensor_summary('vector', constant_op.constant(
          [1.0, 2.0, 3.0]))
      summary_lib.tensor_summary('string',
                                 constant_op.constant(six.b('foobar')))
      merged = summary_lib.merge_all()
      summ = sess.run(merged)
      writer.add_summary(summ, 0)

    accumulator = ea.EventAccumulator(event_sink)
    accumulator.Reload()

    self.assertTagsEqual(accumulator.Tags(), {
        ea.TENSORS: ['scalar', 'vector', 'string'],
    })

    scalar_proto = accumulator.Tensors('scalar')[0].tensor_proto
    scalar = tensor_util.MakeNdarray(scalar_proto)
    vector_proto = accumulator.Tensors('vector')[0].tensor_proto
    vector = tensor_util.MakeNdarray(vector_proto)
    string_proto = accumulator.Tensors('string')[0].tensor_proto
    string = tensor_util.MakeNdarray(string_proto)

    self.assertTrue(np.array_equal(scalar, 1.0))
    self.assertTrue(np.array_equal(vector, [1.0, 2.0, 3.0]))
    self.assertTrue(np.array_equal(string, six.b('foobar')))
Exemplo n.º 10
0
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:

        for weight in layer.weights:
          tf_summary.histogram(weight.name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = w_img.get_shape()
            if len(shape) > 1 and shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
            if len(shape) == 1:
              w_img = array_ops.expand_dims(w_img, 0)
            w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
            tf_summary.image(weight.name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)
 def testMergeAllSummaries(self):
   with ops.Graph().as_default():
     const = constant_op.constant(10.0)
     summ1 = summary.histogram("h", const)
     summ2 = summary.scalar("o", const, collections=["foo_key"])
     summ3 = summary.scalar("c", const)
     merge = summary.merge_all()
     self.assertEqual("MergeSummary", merge.op.type)
     self.assertEqual(2, len(merge.op.inputs))
     self.assertEqual(summ1, merge.op.inputs[0])
     self.assertEqual(summ3, merge.op.inputs[1])
     merge = summary.merge_all("foo_key")
     self.assertEqual("MergeSummary", merge.op.type)
     self.assertEqual(1, len(merge.op.inputs))
     self.assertEqual(summ2, merge.op.inputs[0])
     self.assertTrue(summary.merge_all("bar_key") is None)
Exemplo n.º 12
0
  def testTFSummaryImage(self):
    """Verify processing of tf.summary.image."""
    event_sink = _EventGenerator(self, zero_out_timestamps=True)
    writer = SummaryToEventTransformer(event_sink)
    with self.test_session() as sess:
      ipt = array_ops.ones([10, 4, 4, 3], dtypes.uint8)
      # This is an interesting example, because the old tf.image_summary op
      # would throw an error here, because it would be tag reuse.
      # Using the tf node name instead allows argument re-use to the image
      # summary.
      with ops.name_scope('1'):
        summary_lib.image('images', ipt, max_outputs=1)
      with ops.name_scope('2'):
        summary_lib.image('images', ipt, max_outputs=2)
      with ops.name_scope('3'):
        summary_lib.image('images', ipt, max_outputs=3)
      merged = summary_lib.merge_all()
      writer.add_graph(sess.graph)
      for i in xrange(10):
        summ = sess.run(merged)
        writer.add_summary(summ, global_step=i)

    accumulator = ea.EventAccumulator(event_sink)
    accumulator.Reload()

    tags = [
        u'1/images/image', u'2/images/image/0', u'2/images/image/1',
        u'3/images/image/0', u'3/images/image/1', u'3/images/image/2'
    ]

    self.assertTagsEqual(accumulator.Tags(), {
        ea.IMAGES: tags,
        ea.GRAPH: True,
        ea.META_GRAPH: False,
    })
Exemplo n.º 13
0
 def _summary_computed():
   with ops.Graph().as_default():
     sv = supervisor.Supervisor(is_chief=False)
     sess = sv.prepare_or_wait_for_session("")
     summary.scalar("c1", constant_op.constant(1))
     summary.scalar("c2", constant_op.constant(2))
     summ = summary.merge_all()
     sv.summary_computed(sess, sess.run(summ))
Exemplo n.º 14
0
  def set_model(self, model):
    """Sets Keras model and creates summary ops."""

    self.model = model
    self.sess = K.get_session()
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = self._writer_class(self.log_dir, self.sess.graph)
    else:
      self.writer = self._writer_class(self.log_dir)
Exemplo n.º 15
0
 def testNoLogdirButWantSummary(self):
   with ops.Graph().as_default():
     summary.scalar("c1", constant_op.constant(1))
     summary.scalar("c2", constant_op.constant(2))
     summary.scalar("c3", constant_op.constant(3))
     summ = summary.merge_all()
     sv = supervisor.Supervisor(logdir="", summary_op=None)
     sess = sv.prepare_or_wait_for_session("")
     with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
       sv.summary_computed(sess, sess.run(summ))
    def testChiefCanWriteEvents(self):
        logdir = _test_dir("can_write")
        with ops.Graph().as_default():
            summary.scalar("c1", constant_op.constant(1))
            summary.scalar("c2", constant_op.constant(2))
            summary.scalar("c3", constant_op.constant(3))
            summ = summary.merge_all()
            sv = supervisor.Supervisor(is_chief=True,
                                       logdir=logdir,
                                       summary_op=None)
            meta_graph_def = meta_graph.create_meta_graph_def()
            sess = sv.prepare_or_wait_for_session("")
            sv.summary_computed(sess, sess.run(summ))
            sess.close()
            # Wait to make sure everything is written to file before stopping.
            time.sleep(1)
            sv.stop()

        rr = _summary_iterator(logdir)

        # The first event should list the file_version.
        ev = next(rr)
        self.assertEquals("brain.Event:2", ev.file_version)

        # The next one has the graph.
        ev = next(rr)
        ev_graph = graph_pb2.GraphDef()
        ev_graph.ParseFromString(ev.graph_def)
        self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True),
                               ev_graph)

        # Stored MetaGraphDef
        ev = next(rr)
        ev_meta_graph = meta_graph_pb2.MetaGraphDef()
        ev_meta_graph.ParseFromString(ev.meta_graph_def)
        self.assertProtoEquals(meta_graph_def, ev_meta_graph)
        self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True),
                               ev_meta_graph.graph_def)
        # The next one should have the values from the summary.
        ev = next(rr)
        self.assertProtoEquals(
            """
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

        # The next one should be a stop message if we closed cleanly.
        ev = next(rr)
        self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

        # We should be done.
        self.assertRaises(StopIteration, lambda: next(rr))
Exemplo n.º 17
0
    def testManagedSessionKeepSummaryWriter(self):
        logdir = self._test_dir("managed_keep_summary_writer")
        with ops.Graph().as_default():
            summary.scalar("c1", constant_op.constant(1))
            summary.scalar("c2", constant_op.constant(2))
            summary.scalar("c3", constant_op.constant(3))
            summ = summary.merge_all()
            sv = supervisor.Supervisor(logdir=logdir)
            with sv.managed_session("",
                                    close_summary_writer=False,
                                    start_standard_services=False) as sess:
                sv.summary_computed(sess, sess.run(summ))
            with sv.managed_session("",
                                    close_summary_writer=False,
                                    start_standard_services=False) as sess:
                sv.summary_computed(sess, sess.run(summ))
        # Now close the summary writer to flush the events.
        sv.summary_writer.close()
        # The summary iterator should report the summary twice as we reused
        # the same summary writer across the 2 sessions.
        rr = _summary_iterator(logdir)
        # The first event should list the file_version.
        ev = next(rr)
        self.assertEquals("brain.Event:2", ev.file_version)

        # The next one has the graph.
        ev = next(rr)
        self.assertTrue(ev.graph_def)

        ev = next(rr)
        self.assertTrue(ev.meta_graph_def)

        # The next one should have the values from the summary.
        ev = next(rr)
        self.assertProtoEquals(
            """
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

        # The next one should also have the values from the summary.
        ev = next(rr)
        self.assertProtoEquals(
            """
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

        # We should be done.
        self.assertRaises(StopIteration, lambda: next(rr))
Exemplo n.º 18
0
  def testNoLogdirButExplicitSummaryWriter(self):
    logdir = self._test_dir("explicit_summary_writer")
    with ops.Graph().as_default():
      summary.scalar("c1", constant_op.constant(1))
      summary.scalar("c2", constant_op.constant(2))
      summary.scalar("c3", constant_op.constant(3))
      summ = summary.merge_all()
      sw = writer.FileWriter(logdir)
      sv = supervisor.Supervisor(logdir="", summary_op=None, summary_writer=sw)
      meta_graph_def = meta_graph.create_meta_graph_def()
      sess = sv.prepare_or_wait_for_session("")
      sv.summary_computed(sess, sess.run(summ))
      sess.close()
      # Wait to make sure everything is written to file before stopping.
      time.sleep(1)
      sv.stop()

    # Check the summary was written to 'logdir'
    rr = _summary_iterator(logdir)

    # The first event should list the file_version.
    ev = next(rr)
    self.assertEquals("brain.Event:2", ev.file_version)

    # The next one has the graph.
    ev = next(rr)
    ev_graph = graph_pb2.GraphDef()
    ev_graph.ParseFromString(ev.graph_def)
    self.assertProtoEquals(sess.graph.as_graph_def(add_shapes=True), ev_graph)

    # Stored MetaGraphDef
    ev = next(rr)
    ev_meta_graph = meta_graph_pb2.MetaGraphDef()
    ev_meta_graph.ParseFromString(ev.meta_graph_def)
    self.assertProtoEquals(meta_graph_def, ev_meta_graph)
    self.assertProtoEquals(
        sess.graph.as_graph_def(add_shapes=True), ev_meta_graph.graph_def)

    # The next one should have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # The next one should be a stop message if we closed cleanly.
    ev = next(rr)
    self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

    # We should be done.
    self.assertRaises(StopIteration, lambda: next(rr))
Exemplo n.º 19
0
  def testManagedSessionDoNotKeepSummaryWriter(self):
    logdir = self._test_dir("managed_not_keep_summary_writer")
    with ops.Graph().as_default():
      summary.scalar("c1", constant_op.constant(1))
      summary.scalar("c2", constant_op.constant(2))
      summary.scalar("c3", constant_op.constant(3))
      summ = summary.merge_all()
      sv = supervisor.Supervisor(logdir=logdir, summary_op=None)
      with sv.managed_session(
          "", close_summary_writer=True, start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
      # Sleep 1.2s to make sure that the next event file has a different name
      # than the current one.
      time.sleep(1.2)
      with sv.managed_session(
          "", close_summary_writer=True, start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
    event_paths = sorted(glob.glob(os.path.join(logdir, "event*")))
    self.assertEquals(2, len(event_paths))
    # The two event files should have the same contents.
    for path in event_paths:
      # The summary iterator should report the summary once as we closed the
      # summary writer across the 2 sessions.
      rr = summary_iterator.summary_iterator(path)
      # The first event should list the file_version.
      ev = next(rr)
      self.assertEquals("brain.Event:2", ev.file_version)

      # The next one has the graph and metagraph.
      ev = next(rr)
      self.assertTrue(ev.graph_def)

      ev = next(rr)
      self.assertTrue(ev.meta_graph_def)

      # The next one should have the values from the summary.
      # But only once.
      ev = next(rr)
      self.assertProtoEquals("""
        value { tag: 'c1' simple_value: 1.0 }
        value { tag: 'c2' simple_value: 2.0 }
        value { tag: 'c3' simple_value: 3.0 }
        """, ev.summary)

      # The next one should be a stop message if we closed cleanly.
      ev = next(rr)
      self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

      # We should be done.
      with self.assertRaises(StopIteration):
        next(rr)
Exemplo n.º 20
0
  def testManagedSessionDoNotKeepSummaryWriter(self):
    logdir = self._test_dir("managed_not_keep_summary_writer")
    with ops.Graph().as_default():
      summary.scalar("c1", constant_op.constant(1))
      summary.scalar("c2", constant_op.constant(2))
      summary.scalar("c3", constant_op.constant(3))
      summ = summary.merge_all()
      sv = supervisor.Supervisor(logdir=logdir, summary_op=None)
      with sv.managed_session(
          "", close_summary_writer=True, start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
      # Sleep 1.2s to make sure that the next event file has a different name
      # than the current one.
      time.sleep(1.2)
      with sv.managed_session(
          "", close_summary_writer=True, start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
    event_paths = sorted(glob.glob(os.path.join(logdir, "event*")))
    self.assertEquals(2, len(event_paths))
    # The two event files should have the same contents.
    for path in event_paths:
      # The summary iterator should report the summary once as we closed the
      # summary writer across the 2 sessions.
      rr = summary_iterator.summary_iterator(path)
      # The first event should list the file_version.
      ev = next(rr)
      self.assertEquals("brain.Event:2", ev.file_version)

      # The next one has the graph and metagraph.
      ev = next(rr)
      self.assertTrue(ev.graph_def)

      ev = next(rr)
      self.assertTrue(ev.meta_graph_def)

      # The next one should have the values from the summary.
      # But only once.
      ev = next(rr)
      self.assertProtoEquals("""
        value { tag: 'c1' simple_value: 1.0 }
        value { tag: 'c2' simple_value: 2.0 }
        value { tag: 'c3' simple_value: 3.0 }
        """, ev.summary)

      # The next one should be a stop message if we closed cleanly.
      ev = next(rr)
      self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status)

      # We should be done.
      with self.assertRaises(StopIteration):
        next(rr)
Exemplo n.º 21
0
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_grads:
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)
Exemplo n.º 22
0
  def _init_summary_op(self, summary_op=USE_DEFAULT):
    """Initializes summary_op.

    Args:
      summary_op: An Operation that returns a Summary for the event logs. If set
        to USE_DEFAULT, create an op that merges all the summaries.
    """
    if summary_op is Supervisor.USE_DEFAULT:
      summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP)
      if summary_op is None:
        summary_op = _summary.merge_all()
        if summary_op is not None:
          ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, summary_op)
    self._summary_op = summary_op
Exemplo n.º 23
0
  def testManagedSessionKeepSummaryWriter(self):
    logdir = self._test_dir("managed_keep_summary_writer")
    with ops.Graph().as_default():
      summary.scalar("c1", constant_op.constant(1))
      summary.scalar("c2", constant_op.constant(2))
      summary.scalar("c3", constant_op.constant(3))
      summ = summary.merge_all()
      sv = supervisor.Supervisor(logdir=logdir)
      with sv.managed_session(
          "", close_summary_writer=False,
          start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
      with sv.managed_session(
          "", close_summary_writer=False,
          start_standard_services=False) as sess:
        sv.summary_computed(sess, sess.run(summ))
    # Now close the summary writer to flush the events.
    sv.summary_writer.close()
    # The summary iterator should report the summary twice as we reused
    # the same summary writer across the 2 sessions.
    rr = _summary_iterator(logdir)
    # The first event should list the file_version.
    ev = next(rr)
    self.assertEquals("brain.Event:2", ev.file_version)

    # The next one has the graph.
    ev = next(rr)
    self.assertTrue(ev.graph_def)

    ev = next(rr)
    self.assertTrue(ev.meta_graph_def)

    # The next one should have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # The next one should also have the values from the summary.
    ev = next(rr)
    self.assertProtoEquals("""
      value { tag: 'c1' simple_value: 1.0 }
      value { tag: 'c2' simple_value: 2.0 }
      value { tag: 'c3' simple_value: 3.0 }
      """, ev.summary)

    # We should be done.
    self.assertRaises(StopIteration, lambda: next(rr))
Exemplo n.º 24
0
  def _init_summary_op(self, summary_op=USE_DEFAULT):
    """Initializes summary_op.

    Args:
      summary_op: An Operation that returns a Summary for the event logs.
        If set to USE_DEFAULT, create an op that merges all the summaries.
    """
    if summary_op is Supervisor.USE_DEFAULT:
      summary_op = self._get_first_op_from_collection(ops.GraphKeys.SUMMARY_OP)
      if summary_op is None:
        summary_op = _summary.merge_all()
        if summary_op is not None:
          ops.add_to_collection(ops.GraphKeys.SUMMARY_OP, summary_op)
    self._summary_op = summary_op
Exemplo n.º 25
0
 def testLogdirButExplicitlyNoSummaryWriter(self):
   logdir = self._test_dir("explicit_no_summary_writer")
   with ops.Graph().as_default():
     variables.VariableV1([1.0], name="foo")
     summary.scalar("c1", constant_op.constant(1))
     summary.scalar("c2", constant_op.constant(2))
     summary.scalar("c3", constant_op.constant(3))
     summ = summary.merge_all()
     sv = supervisor.Supervisor(logdir=logdir, summary_writer=None)
     sess = sv.prepare_or_wait_for_session("")
     # Check that a checkpoint is still be generated.
     self._wait_for_glob(sv.save_path, 3.0)
     # Check that we cannot write a summary
     with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
       sv.summary_computed(sess, sess.run(summ))
Exemplo n.º 26
0
 def testLogdirButExplicitlyNoSummaryWriter(self):
   logdir = self._test_dir("explicit_no_summary_writer")
   with ops.Graph().as_default():
     variables.Variable([1.0], name="foo")
     summary.scalar("c1", constant_op.constant(1))
     summary.scalar("c2", constant_op.constant(2))
     summary.scalar("c3", constant_op.constant(3))
     summ = summary.merge_all()
     sv = supervisor.Supervisor(logdir=logdir, summary_writer=None)
     sess = sv.prepare_or_wait_for_session("")
     # Check that a checkpoint is still be generated.
     self._wait_for_glob(sv.save_path, 3.0)
     # Check that we cannot write a summary
     with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
       sv.summary_computed(sess, sess.run(summ))
Exemplo n.º 27
0
    def set_model(self, model):
        """
        Overwrite set_model method
        """
        super().set_model(model)

        if self.write_input:
            input_imgs = self.model.input

            assert len(
                K.int_shape(input_imgs)
            ) == 4, 'Should be the 4-D images tensor [batch_size,height,width,channels]'
            tf.summary.image('Input_Image',
                             input_imgs,
                             max_outputs=self.max_result_display)

        self.merged = tf_summary.merge_all()
Exemplo n.º 28
0
    def verify_scalar_summary_is_written(self, print_summary):
        value = 3
        tensor = array_ops.ones([]) * value
        name = 'my_score'
        prefix = 'eval'
        summaries.add_scalar_summary(tensor, name, prefix, print_summary)

        output_dir = tempfile.mkdtemp('scalar_summary_no_print_test')
        summary_op = summary.merge_all()

        summary_writer = summary.FileWriter(output_dir)
        with self.cached_session() as sess:
            new_summary = sess.run(summary_op)
            summary_writer.add_summary(new_summary, 1)
            summary_writer.flush()

        self.assert_scalar_summary(output_dir,
                                   {'%s/%s' % (prefix, name): value})
Exemplo n.º 29
0
  def testTrainWithNoneAsLogdirWhenUsingSummariesRaisesError(self):
    with ops.Graph().as_default():
      random_seed.set_random_seed(0)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = LogisticClassifier(tf_inputs)
      loss_ops.log_loss(tf_predictions, tf_labels)
      total_loss = loss_ops.get_total_loss()
      summary.scalar('total_loss', total_loss)

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)

      train_op = learning.create_train_op(total_loss, optimizer)
      summary_op = summary.merge_all()

      with self.assertRaises(ValueError):
        learning.train(
            train_op, None, number_of_steps=300, summary_op=summary_op)
    def testTFSummaryImage(self):
        """Verify processing of tf.summary.image."""
        event_sink = _EventGenerator(zero_out_timestamps=True)
        writer = SummaryToEventTransformer(event_sink)
        with self.test_session() as sess:
            ipt = array_ops.ones([10, 4, 4, 3], dtypes.uint8)
            # This is an interesting example, because the old tf.image_summary op
            # would throw an error here, because it would be tag reuse.
            # Using the tf node name instead allows argument re-use to the image
            # summary.
            with ops.name_scope('1'):
                summary_lib.image('images', ipt, max_outputs=1)
            with ops.name_scope('2'):
                summary_lib.image('images', ipt, max_outputs=2)
            with ops.name_scope('3'):
                summary_lib.image('images', ipt, max_outputs=3)
            merged = summary_lib.merge_all()
            writer.add_graph(sess.graph)
            for i in xrange(10):
                summ = sess.run(merged)
                writer.add_summary(summ, global_step=i)

        accumulator = ea.EventAccumulator(event_sink)
        accumulator.Reload()

        tags = [
            u'1/images/image', u'2/images/image/0', u'2/images/image/1',
            u'3/images/image/0', u'3/images/image/1', u'3/images/image/2'
        ]

        self.assertTagsEqual(
            accumulator.Tags(), {
                ea.IMAGES: tags,
                ea.AUDIO: [],
                ea.SCALARS: [],
                ea.HISTOGRAMS: [],
                ea.COMPRESSED_HISTOGRAMS: [],
                ea.GRAPH: True,
                ea.META_GRAPH: False,
                ea.RUN_METADATA: []
            })
    def testTFSummaryScalar(self):
        """Verify processing of tf.summary.scalar, which uses TensorSummary op."""
        event_sink = _EventGenerator(zero_out_timestamps=True)
        writer = SummaryToEventTransformer(event_sink)
        with self.test_session() as sess:
            ipt = array_ops.placeholder(dtypes.float32)
            summary_lib.scalar('scalar1', ipt)
            summary_lib.scalar('scalar2', ipt * ipt)
            merged = summary_lib.merge_all()
            writer.add_graph(sess.graph)
            for i in xrange(10):
                summ = sess.run(merged, feed_dict={ipt: i})
                writer.add_summary(summ, global_step=i)

        accumulator = ea.EventAccumulator(event_sink)
        accumulator.Reload()

        seq1 = [
            ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)
        ]
        seq2 = [
            ea.ScalarEvent(wall_time=0, step=i, value=i * i)
            for i in xrange(10)
        ]

        self.assertTagsEqual(
            accumulator.Tags(), {
                ea.IMAGES: [],
                ea.AUDIO: [],
                ea.SCALARS: ['scalar1', 'scalar2'],
                ea.HISTOGRAMS: [],
                ea.COMPRESSED_HISTOGRAMS: [],
                ea.GRAPH: True,
                ea.META_GRAPH: False,
                ea.RUN_METADATA: []
            })

        self.assertEqual(accumulator.Scalars('scalar1'), seq1)
        self.assertEqual(accumulator.Scalars('scalar2'), seq2)
        first_value = accumulator.Scalars('scalar1')[0].value
        self.assertTrue(isinstance(first_value, float))
Exemplo n.º 32
0
  def verify_scalar_summary_is_written(self, print_summary):
    value = 3
    tensor = array_ops.ones([]) * value
    name = 'my_score'
    prefix = 'eval'
    summaries.add_scalar_summary(tensor, name, prefix, print_summary)

    output_dir = os.path.join(self.get_temp_dir(),
                              'scalar_summary_no_print_test')
    self.safe_create(output_dir)

    summary_op = summary.merge_all()

    summary_writer = summary.FileWriter(output_dir)
    with self.cached_session() as sess:
      new_summary = sess.run(summary_op)
      summary_writer.add_summary(new_summary, 1)
      summary_writer.flush()

    self.assert_scalar_summary(output_dir, {
        '%s/%s' % (prefix, name): value
    })
    def testTFSummaryScalar(self):
        """Verify processing of summary_lib.scalar."""
        event_sink = _EventGenerator(self, zero_out_timestamps=True)
        writer = FileWriter(self.get_temp_dir())
        writer.event_writer = event_sink
        with self.test_session() as sess:
            ipt = array_ops.placeholder(dtypes.float32)
            summary_lib.scalar('scalar1', ipt)
            summary_lib.scalar('scalar2', ipt * ipt)
            merged = summary_lib.merge_all()
            writer.add_graph(sess.graph)
            for i in xrange(10):
                summ = sess.run(merged, feed_dict={ipt: i})
                writer.add_summary(summ, global_step=i)

        accumulator = ea.EventAccumulator(event_sink)
        accumulator.Reload()

        seq1 = [
            ea.ScalarEvent(wall_time=0, step=i, value=i) for i in xrange(10)
        ]
        seq2 = [
            ea.ScalarEvent(wall_time=0, step=i, value=i * i)
            for i in xrange(10)
        ]

        self.assertTagsEqual(
            accumulator.Tags(), {
                ea.SCALARS: ['scalar1', 'scalar2'],
                ea.GRAPH: True,
                ea.META_GRAPH: False,
            })

        self.assertEqual(accumulator.Scalars('scalar1'), seq1)
        self.assertEqual(accumulator.Scalars('scalar2'), seq2)
        first_value = accumulator.Scalars('scalar1')[0].value
        self.assertTrue(isinstance(first_value, float))
  def generate_run(self, run_name):
    if run_name == self._RUN_WITH_HISTOGRAM:
      (use_histogram, use_scalars) = (True, False)
    elif run_name == self._RUN_WITH_SCALARS:
      (use_histogram, use_scalars) = (False, True)
    else:
      assert False, 'Invalid run name: %r' % run_name
    ops.reset_default_graph()
    sess = session.Session()
    placeholder = array_ops.placeholder(dtypes.float32, shape=[3])
    if use_histogram:
      summary.histogram(self._HISTOGRAM_TAG, placeholder)
    if use_scalars:
      summary.scalar(self._SCALAR_TAG, math_ops.reduce_mean(placeholder))
    summ = summary.merge_all()

    subdir = os.path.join(self.logdir, run_name)
    writer = summary.FileWriter(subdir)
    writer.add_graph(sess.graph)
    for step in xrange(self._STEPS):
      feed_dict = {placeholder: [1 + step, 2 + step, 3 + step]}
      s = sess.run(summ, feed_dict=feed_dict)
      writer.add_summary(s, global_step=step)
    writer.close()
Exemplo n.º 35
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          session_wrapper=None,
          trace_every_n_steps=None,
          batch_size=1,
          num_examples=None,
          config_summary_list=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must have take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop if global_step is
      greater than 'number_of_steps'. If the value is left as None, training
      proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
      them. If the argument is supplied, gradient updates will be synchronous.
      If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    session_wrapper: A function that takes a `tf.Session` object as the only
      argument and returns a wrapped session object that has the same methods
      that the original object has, or `None`. Iff not `None`, the wrapped
      object will be used for training.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.
    batch_size: batch size.
    num_examples: The number of examples in dataset for training.
    dubug_tensors: Additional tensors to run for debugging.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')
    if not isinstance(train_op, list):
        train_op = [train_op]

    # Allocate log function to each step.
    log_fn_list = [log.info, log.infov]

    def _iter_log_fn():
        for log_fn in log_fn_list:
            yield log_fn

    it = itertools.cycle(_iter_log_fn())
    current_log_fn = it.next()

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if isinstance(sync_optimizer,
                  sync_replicas_optimizer.SyncReplicasOptimizer):
        sync_optimizer = [sync_optimizer]
    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = training_util.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        if sync_optimizer is not None:
            for opt in sync_optimizer:
                if not isinstance(
                        opt, sync_replicas_optimizer.SyncReplicasOptimizer):
                    raise ValueError(
                        '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                    )

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    variables.local_variables_initializer(),
                    lookup_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(sync_optimizer, list):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = control_flow_ops.group(
                            *[opt.chief_init_op for opt in sync_optimizer])
                    else:
                        local_init_op = control_flow_ops.group(
                            *
                            [opt.local_step_init_op for opt in sync_optimizer])
                ready_for_local_init_op = control_flow_ops.group(
                    *[opt.ready_for_local_init_op for opt in sync_optimizer])
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = [
                opt.get_init_tokens_op() for opt in sync_optimizer
            ]
            chief_queue_runner = [
                opt.get_chief_queue_runner() for opt in sync_optimizer
            ]

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                if log_every_n_steps > 0:
                    train_step_kwargs['should_log'] = math_ops.equal(
                        math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    steps_in_epoch = int(num_examples / batch_size)

    total_loss = 0.0
    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                log.infov('Starting Session.')
                if session_wrapper is not None:
                    log.info('Wrapping session with wrapper function: %s',
                             session_wrapper)
                    sess = session_wrapper(sess)
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                threads = sv.start_queue_runners(sess)
                log.infov('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, chief_queue_runner)
                    sess.run(init_tokens_op)
                sess.graph.finalize()
                # try:
                if config_summary_list is not None:
                    for config_summary in config_summary_list:
                        sv.summary_writer.add_summary(
                            config_summary.eval(session=sess))

                while not sv.should_stop():
                    for _train_op in train_op:
                        total_loss, should_stop, np_global_step = train_step_fn(
                            sess, _train_op, global_step, train_step_kwargs,
                            batch_size, steps_in_epoch, current_log_fn)
                        if should_stop:
                            log.infov('Stopping Training.')
                            sv.request_stop()
                            break

                # except errors.OutOfRangeError:
                #   # OutOfRangeError is thrown when epoch limit per
                #   # tf.train.limit_epochs is reached.
                #   log.warn('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    log.warn('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)
                    sv.stop(threads, close_summary_writer=True)

                    def _last_checkpoint_path(sv_save_path,
                                              additional_dir_name='last'):
                        dir_list = sv_save_path.split('/')
                        dir_list.insert(-1, 'last')
                        last_checkpoint_dir_path = '/'.join(dir_list[:-1])
                        last_checkpoint_path = '/'.join(dir_list)
                        return last_checkpoint_dir_path, last_checkpoint_path

                    # Save the last checkpoint again to a 'last' directory for the next training with
                    # different configuration.
                    last_checkpoint_dir_path, last_checkpoint_path = _last_checkpoint_path(
                        sv.save_path, 'last')
                    if os.path.exists(last_checkpoint_dir_path):
                        shutil.rmtree(last_checkpoint_dir_path)
                    os.makedirs(last_checkpoint_dir_path)
                    sv.saver.save(sess,
                                  last_checkpoint_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            log.warn('Retrying training!')
            should_retry = True

    return total_loss
Exemplo n.º 36
0
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_grads:
            grads = model.optimizer.get_gradients(model.total_loss, weight)
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)

    if self.embeddings_freq:
      embeddings_layer_names = self.embeddings_layer_names

      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      embeddings = {
          layer.name: layer.weights[0]
          for layer in self.model.layers if layer.name in embeddings_layer_names
      }

      self.saver = saver_lib.Saver(list(embeddings.values()))

      embeddings_metadata = {}

      if not isinstance(self.embeddings_metadata, str):
        embeddings_metadata = self.embeddings_metadata
      else:
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings.keys()
        }

      config = projector.ProjectorConfig()
      self.embeddings_ckpt_path = os.path.join(self.log_dir,
                                               'keras_embedding.ckpt')

      for layer_name, tensor in embeddings.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if layer_name in embeddings_metadata:
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
Exemplo n.º 37
0
 def begin(self):
     self._summary_op = summary_lib.merge_all()
Exemplo n.º 38
0
def evaluate_once(master,
                  checkpoint_path,
                  logdir,
                  num_evals=1,
                  initial_op=None,
                  initial_op_feed_dict=None,
                  eval_op=None,
                  eval_op_feed_dict=None,
                  final_op=None,
                  final_op_feed_dict=None,
                  summary_op=_USE_DEFAULT,
                  summary_op_feed_dict=None,
                  variables_to_restore=None,
                  session_config=None,
                  hooks=None):
    """Evaluates the model at the given checkpoint path.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_path: The path to a checkpoint to use for evaluation.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.compat.v1.summary.merge_all().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    session_config: An instance of `tf.compat.v1.ConfigProto` that will be used
      to configure the `Session`. If left as `None`, the default will be used.
    hooks: A list of additional `SessionRunHook` objects to pass during the
      evaluation.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    all_hooks = [
        evaluation.StopAfterNEvalsHook(num_evals),
    ]

    if summary_op is not None:
        all_hooks.append(
            evaluation.SummaryAtEndHook(log_dir=logdir,
                                        summary_op=summary_op,
                                        feed_dict=summary_op_feed_dict))
    if hooks is not None:
        all_hooks.extend(hooks)

    saver = None
    if variables_to_restore is not None:
        saver = tf_saver.Saver(variables_to_restore)

    return evaluation.evaluate_once(checkpoint_path,
                                    master=master,
                                    scaffold=monitored_session.Scaffold(
                                        init_op=initial_op,
                                        init_feed_dict=initial_op_feed_dict,
                                        saver=saver),
                                    eval_ops=eval_op,
                                    feed_dict=eval_op_feed_dict,
                                    final_ops=final_op,
                                    final_ops_feed_dict=final_op_feed_dict,
                                    hooks=all_hooks,
                                    config=session_config)
Exemplo n.º 39
0
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    init_fn=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None,
                    timeout_fn=None,
                    hooks=None):
    """Runs TF-Slim's Evaluation Loop.

  Args:
    master: The BNS address of the TensorFlow master.
    checkpoint_dir: The directory where checkpoints are stored.
    logdir: The directory where the TensorFlow summaries are written to.
    num_evals: The number of times to run `eval_op`.
    initial_op: An operation run at the beginning of evaluation.
    initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    eval_op: A operation run `num_evals` times.
    eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
    final_op: An operation to execute after all of the `eval_op` executions. The
      value of `final_op` is returned.
    final_op_feed_dict: A feed dictionary to use when executing `final_op`.
    summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
      default the summary_op is set to tf.compat.v1.summary.merge_all().
    summary_op_feed_dict: An optional feed dictionary to use when running the
      `summary_op`.
    variables_to_restore: A list of TensorFlow variables to restore during
      evaluation. If the argument is left as `None` then
      slim.variables.GetVariablesToRestore() is used.
    eval_interval_secs: The minimum number of seconds between evaluations.
    max_number_of_evaluations: the max number of iterations of the evaluation.
      If the value is left as 'None', the evaluation continues indefinitely.
    session_config: An instance of `tf.compat.v1.ConfigProto` that will be used
      to configure the `Session`. If left as `None`, the default will be used.
    timeout: The maximum amount of time to wait between checkpoints. If left as
      `None`, then the process will wait indefinitely.
    timeout_fn: Optional function to call after a timeout.  If the function
      returns True, then it means that no new checkpoints will be generated and
      the iterator will exit.  The function is called with no arguments.
    hooks: A list of additional `SessionRunHook` objects to pass during repeated
      evaluations.

  Returns:
    The value of `final_op` or `None` if `final_op` is `None`.
  """
    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    all_hooks = [
        evaluation.StopAfterNEvalsHook(num_evals),
    ]

    if summary_op is not None:
        all_hooks.append(
            evaluation.SummaryAtEndHook(log_dir=logdir,
                                        summary_op=summary_op,
                                        feed_dict=summary_op_feed_dict))

    if hooks is not None:
        # Add custom hooks if provided.
        all_hooks.extend(hooks)

    saver = None
    if variables_to_restore is not None:
        saver = tf_saver.Saver(variables_to_restore)

    return evaluation.evaluate_repeatedly(
        checkpoint_dir,
        master=master,
        scaffold=monitored_session.Scaffold(
            init_op=initial_op,
            init_feed_dict=initial_op_feed_dict,
            init_fn=init_fn,
            saver=saver),
        eval_ops=eval_op,
        feed_dict=eval_op_feed_dict,
        final_ops=final_op,
        final_ops_feed_dict=final_op_feed_dict,
        eval_interval_secs=eval_interval_secs,
        hooks=all_hooks,
        config=session_config,
        max_number_of_evaluations=max_number_of_evaluations,
        timeout=timeout,
        timeout_fn=timeout_fn)
Exemplo n.º 40
0
 def begin(self):
     if self._replace_summary_op:
         self._summary_op = summary.merge_all()
     self._global_step = variables.get_or_create_global_step()
Exemplo n.º 41
0
    def test_add_image_comparison_summaries_for_cyclegan(self):
        summaries.add_cyclegan_image_summaries(get_cyclegan_model())

        self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
        with self.test_session(use_gpu=True):
            summary.merge_all().eval()
Exemplo n.º 42
0
    def set_model(self, model):
        """Sets Keras model and creates summary ops."""

        self.model = model
        self.sess = K.get_session()
        # only make histogram summary op if it hasn't already been made
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf_summary.histogram(mapped_weight_name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = array_ops.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = array_ops.transpose(w_img,
                                                            perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = array_ops.reshape(w_img,
                                                      [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf_summary.image(mapped_weight_name, w_img)

                if self.write_grads:
                    for weight in layer.trainable_weights:
                        mapped_weight_name = weight.name.replace(':', '_')
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
                        tf_summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)

                if hasattr(layer, 'output'):
                    if isinstance(layer.output, list):
                        for i, output in enumerate(layer.output):
                            tf_summary.histogram(
                                '{}_out_{}'.format(layer.name, i), output)
                    else:
                        tf_summary.histogram('{}_out'.format(layer.name),
                                             layer.output)
        self.merged = tf_summary.merge_all()

        if self.write_graph:
            self.writer = self._writer_class(self.log_dir, self.sess.graph)
        else:
            self.writer = self._writer_class(self.log_dir)

        # If both embedding_freq and embeddings_data are available, we will
        # visualize embeddings.
        if self.embeddings_freq and self.embeddings_data is not None:
            self.embeddings_data = standardize_input_data(
                self.embeddings_data, model.input_names)

            # If embedding_layer_names are not provided, get all of the embedding
            # layers from the model.
            embeddings_layer_names = self.embeddings_layer_names
            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            self.assign_embeddings = []
            embeddings_vars = {}

            self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
            self.step = step = array_ops.placeholder(dtypes.int32)

            for layer in self.model.layers:
                if layer.name in embeddings_layer_names:
                    embedding_input = self.model.get_layer(layer.name).output
                    embedding_size = np.prod(embedding_input.shape[1:])
                    embedding_input = array_ops.reshape(
                        embedding_input, (step, int(embedding_size)))
                    shape = (self.embeddings_data[0].shape[0],
                             int(embedding_size))
                    embedding = variables.Variable(array_ops.zeros(shape),
                                                   name=layer.name +
                                                   '_embedding')
                    embeddings_vars[layer.name] = embedding
                    batch = state_ops.assign(
                        embedding[batch_id:batch_id + step], embedding_input)
                    self.assign_embeddings.append(batch)

            self.saver = saver.Saver(list(embeddings_vars.values()))

            # Create embeddings_metadata dictionary
            if isinstance(self.embeddings_metadata, str):
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings_vars.keys()
                }
            else:
                # If embedding_metadata is already a dictionary
                embeddings_metadata = self.embeddings_metadata

            try:
                from tensorboard.plugins import projector
            except ImportError:
                raise ImportError(
                    'Failed to import TensorBoard. Please make sure that '
                    'TensorBoard integration is complete."')

            # TODO(psv): Add integration tests to test embedding visualization
            # with TensorBoard callback. We are unable to write a unit test for this
            # because TensorBoard dependency assumes TensorFlow package is installed.
            config = projector.ProjectorConfig()
            for layer_name, tensor in embeddings_vars.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if (embeddings_metadata is not None
                        and layer_name in embeddings_metadata):
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Exemplo n.º 43
0
 def begin(self):
     if self._summary_op is None:
         self._summary_op = summary.merge_all()
Exemplo n.º 44
0
 def begin(self):
   if self._replace_summary_op:
     self._summary_op = summary.merge_all()
   self._global_step = variables.get_or_create_global_step()
Exemplo n.º 45
0
 def begin(self):
   self._summary_op = summary_lib.merge_all()
Exemplo n.º 46
0
    def set_model(self, model):
        self.model = model
        self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:

                for weight in layer.weights:
                    tf_summary.histogram(weight.name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = w_img.get_shape()
                        if len(shape) > 1 and shape[0] > shape[1]:
                            w_img = array_ops.transpose(w_img)
                        if len(shape) == 1:
                            w_img = array_ops.expand_dims(w_img, 0)
                        w_img = array_ops.expand_dims(
                            array_ops.expand_dims(w_img, 0), -1)
                        tf_summary.image(weight.name, w_img)

                if hasattr(layer, 'output'):
                    tf_summary.histogram('{}_out'.format(layer.name),
                                         layer.output)
        self.merged = tf_summary.merge_all()

        if self.write_graph:
            self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
        else:
            self.writer = tf_summary.FileWriter(self.log_dir)

        if self.embeddings_freq:
            self.saver = saver_lib.Saver()

            embeddings_layer_names = self.embeddings_layer_names

            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            embeddings = {
                layer.name: layer.weights[0]
                for layer in self.model.layers
                if layer.name in embeddings_layer_names
            }

            embeddings_metadata = {}

            if not isinstance(self.embeddings_metadata, str):
                embeddings_metadata = self.embeddings_metadata
            else:
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings.keys()
                }

            config = projector.ProjectorConfig()
            self.embeddings_logs = []

            for layer_name, tensor in embeddings.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                self.embeddings_logs.append(
                    os.path.join(self.log_dir, layer_name + '.ckpt'))

                if layer_name in embeddings_metadata:
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Exemplo n.º 47
0
 def begin(self):
     if self._replace_summary_op:
         # This can still remain None if there are no summaries.
         self._summary_op = summary.merge_all()
     self._global_step = training_util.get_or_create_global_step()
  def build_controller(self):
    """RL optimization interface.

    Returns:
      ops: A dictionary holding handles of the model used for training.
    """

    self._global_step = training_util.get_or_create_global_step()
    ops = {}
    ops["loss"] = 0

    failing_signal = self.compute_reward(self.hparams.failing_signal)

    ctr = {}

    with tf_ops.name_scope("controller_{}".format(self.ctrl_id)):
      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["reward"] = {"value": [], "ph": [], "update": []}
        ctr["ready"] = {"value": [], "ph": [], "update": []}
        ctr["best_reward"] = {"value": [], "update": []}
        for i in range(self.hparams.num_children):
          reward_value = variable_scope.get_local_variable(
              "reward_{}".format(i),
              initializer=0.0,
              dtype=dtypes.float32,
              trainable=False)
          reward_ph = array_ops.placeholder(
              dtypes.float32, shape=(), name="reward_ph_{}".format(i))
          reward_update = state_ops.assign(
              reward_value, reward_ph, use_locking=True)
          ctr["reward"]["value"].append(reward_value)
          ctr["reward"]["ph"].append(reward_ph)
          ctr["reward"]["update"].append(reward_update)
          best_reward = variable_scope.get_local_variable(
              "best_reward_{}".format(i),
              initializer=failing_signal,
              dtype=dtypes.float32,
              trainable=False)
          ctr["best_reward"]["value"].append(best_reward)
          ctr["best_reward"]["update"].append(
              state_ops.assign(best_reward,
                               math_ops.minimum(best_reward, reward_update)))

          ready_value = variable_scope.get_local_variable(
              "ready_{}".format(i),
              initializer=True,
              dtype=dtypes.bool,
              trainable=False)
          ready_ph = array_ops.placeholder(
              dtypes.bool, shape=(), name="ready_ph_{}".format(i))
          ready_update = state_ops.assign(
              ready_value, ready_ph, use_locking=True)
          ctr["ready"]["value"].append(ready_value)
          ctr["ready"]["ph"].append(ready_ph)
          ctr["ready"]["update"].append(ready_update)

      ctr["grouping_y_preds"], ctr["grouping_log_probs"] = self.get_groupings()
      summary.histogram(
          "grouping_actions",
          array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0],
                          [1, array_ops.shape(self.op_embeddings)[0]]))

      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["baseline"] = variable_scope.get_local_variable(
            "baseline",
            initializer=failing_signal
            if self.hparams.start_with_failing_signal else 0.0,
            dtype=dtypes.float32,
            trainable=False)

      new_baseline = self.hparams.bl_dec * ctr["baseline"] + (
          1 - self.hparams.bl_dec) * math_ops.reduce_mean(
              ctr["reward"]["value"])
      if not self.hparams.always_update_baseline:
        baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal)
        selected_reward = array_ops.boolean_mask(ctr["reward"]["value"],
                                                 baseline_mask)
        selected_baseline = control_flow_ops.cond(
            math_ops.reduce_any(baseline_mask),
            lambda: math_ops.reduce_mean(selected_reward),
            lambda: constant_op.constant(0, dtype=dtypes.float32))
        ctr["pos_reward"] = selected_baseline
        pos_ = math_ops.less(
            constant_op.constant(0, dtype=dtypes.float32), selected_baseline)
        selected_baseline = self.hparams.bl_dec * ctr["baseline"] + (
            1 - self.hparams.bl_dec) * selected_baseline
        selected_baseline = control_flow_ops.cond(
            pos_, lambda: selected_baseline, lambda: ctr["baseline"])
        new_baseline = control_flow_ops.cond(
            math_ops.less(self.global_step,
                          self.hparams.stop_updating_after_steps),
            lambda: new_baseline, lambda: selected_baseline)
      ctr["baseline_update"] = state_ops.assign(
          ctr["baseline"], new_baseline, use_locking=True)

      ctr["y_preds"], ctr["log_probs"] = self.get_placements()
      summary.histogram("actions", ctr["y_preds"]["sample"])
      mask = math_ops.less(ctr["reward"]["value"], failing_signal)
      ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"]
      ctr["loss"] *= (
          ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"])

      selected_loss = array_ops.boolean_mask(ctr["loss"], mask)
      selected_loss = control_flow_ops.cond(
          math_ops.reduce_any(mask),
          lambda: math_ops.reduce_mean(-selected_loss),
          lambda: constant_op.constant(0, dtype=dtypes.float32))

      ctr["loss"] = control_flow_ops.cond(
          math_ops.less(self.global_step,
                        self.hparams.stop_updating_after_steps),
          lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss)

      ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"])
      summary.scalar("loss", ctr["loss"])
      summary.scalar("avg_reward", ctr["reward_s"])
      summary.scalar("best_reward_so_far", best_reward)
      summary.scalar(
          "advantage",
          math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"]))

    with variable_scope.variable_scope(
        "optimizer", reuse=variable_scope.AUTO_REUSE):
      (ctr["train_op"], ctr["lr"], ctr["grad_norm"],
       ctr["grad_norms"]) = self._get_train_ops(
           ctr["loss"],
           tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES),
           self.global_step,
           grad_bound=self.hparams.grad_bound,
           lr_init=self.hparams.lr,
           lr_dec=self.hparams.lr_dec,
           start_decay_step=self.hparams.start_decay_step,
           decay_steps=self.hparams.decay_steps,
           optimizer_type=self.hparams.optimizer_type)

    summary.scalar("gradnorm", ctr["grad_norm"])
    summary.scalar("lr", ctr["lr"])
    ctr["summary"] = summary.merge_all()
    ops["controller"] = ctr

    self.ops = ops
    return ops
Exemplo n.º 49
0
    def testAddRegularizationLossSummaries(self):
        summaries.add_regularization_loss_summaries(get_gan_model())

        self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
        with self.test_session(use_gpu=True):
            summary.merge_all().eval()
Exemplo n.º 50
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          session_wrapper=None,
          trace_every_n_steps=None,
          ignore_live_threads=False):
  """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must have take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step are logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then training_util.get_or_create_global_step(), that is,
      tf.contrib.framework.global_step() is used.
    number_of_steps: The max number of gradient steps to take during training,
      as measured by 'global_step': training will stop if global_step is
      greater than 'number_of_steps'. If the value is left as None, training
      proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
      them. If the argument is supplied, gradient updates will be synchronous.
      If left as `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    session_wrapper: A function that takes a `tf.Session` object as the only
      argument and returns a wrapped session object that has the same methods
      that the original object has, or `None`. Iff not `None`, the wrapped
      object will be used for training.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.
    ignore_live_threads: If `True` ignores threads that remain running after
      a grace period when stopping the supervisor, instead of raising a
      RuntimeError.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
  if train_op is None:
    raise ValueError('train_op cannot be None.')

  if logdir is None:
    if summary_op != _USE_DEFAULT:
      raise ValueError('Cannot provide summary_op because logdir=None')
    if saver is not None:
      raise ValueError('Cannot provide saver because logdir=None')
    if trace_every_n_steps is not None:
      raise ValueError('Cannot provide trace_every_n_steps because '
                       'logdir=None')

  if isinstance(sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
    sync_optimizer = [sync_optimizer]
  if sync_optimizer is not None and startup_delay_steps > 0:
    raise ValueError(
        'startup_delay_steps must be zero when sync_optimizer is supplied.')

  if number_of_steps is not None and number_of_steps <= 0:
    raise ValueError(
        '`number_of_steps` must be either None or a positive number.')

  graph = graph or ops.get_default_graph()
  with graph.as_default():
    if global_step is None:
      global_step = training_util.get_or_create_global_step()
    saver = saver or tf_saver.Saver()

    if sync_optimizer is not None:
      for opt in sync_optimizer:
        if not isinstance(opt, sync_replicas_optimizer.SyncReplicasOptimizer):
          raise ValueError(
              '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')

    with ops.name_scope('init_ops'):
      if init_op == _USE_DEFAULT:
        init_op = variables.global_variables_initializer()

      if ready_op == _USE_DEFAULT:
        ready_op = variables.report_uninitialized_variables()

      if local_init_op == _USE_DEFAULT:
        local_init_op = control_flow_ops.group(
            variables.local_variables_initializer(),
            lookup_ops.tables_initializer())

      if sync_optimizer is not None and isinstance(sync_optimizer, list):
        with ops.control_dependencies([local_init_op] if local_init_op is
                                      not None else []):
          if is_chief:
            local_init_op = control_flow_ops.group(
                *[opt.chief_init_op for opt in sync_optimizer])
          else:
            local_init_op = control_flow_ops.group(
                *[opt.local_step_init_op for opt in sync_optimizer])
        ready_for_local_init_op = control_flow_ops.group(
            *[opt.ready_for_local_init_op for opt in sync_optimizer])
      else:
        ready_for_local_init_op = None

    if summary_op == _USE_DEFAULT:
      summary_op = summary.merge_all()

    if summary_writer == _USE_DEFAULT:
      summary_writer = supervisor.Supervisor.USE_DEFAULT

    if is_chief and sync_optimizer is not None:
      # Need to create these BEFORE the supervisor finalizes the graph:
      init_tokens_op = [opt.get_init_tokens_op() for opt in sync_optimizer]
      chief_queue_runner = [
          opt.get_chief_queue_runner() for opt in sync_optimizer]

    if train_step_kwargs == _USE_DEFAULT:
      with ops.name_scope('train_step'):
        train_step_kwargs = {}

        if number_of_steps:
          should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
        else:
          should_stop_op = constant_op.constant(False)
        train_step_kwargs['should_stop'] = should_stop_op
        if log_every_n_steps > 0:
          train_step_kwargs['should_log'] = math_ops.equal(
              math_ops.mod(global_step, log_every_n_steps), 0)
        if is_chief and trace_every_n_steps is not None:
          train_step_kwargs['should_trace'] = math_ops.equal(
              math_ops.mod(global_step, trace_every_n_steps), 0)
          train_step_kwargs['logdir'] = logdir

  sv = supervisor.Supervisor(
      graph=graph,
      is_chief=is_chief,
      logdir=logdir,
      init_op=init_op,
      init_feed_dict=init_feed_dict,
      local_init_op=local_init_op,
      ready_for_local_init_op=ready_for_local_init_op,
      ready_op=ready_op,
      summary_op=summary_op,
      summary_writer=summary_writer,
      global_step=global_step,
      saver=saver,
      save_summaries_secs=save_summaries_secs,
      save_model_secs=save_interval_secs,
      init_fn=init_fn)

  if summary_writer is not None:
    train_step_kwargs['summary_writer'] = sv.summary_writer

  total_loss = None
  should_retry = True
  while should_retry:
    try:
      should_retry = False
      with sv.managed_session(
          master, start_standard_services=False, config=session_config) as sess:
        logging.info('Starting Session.')
        if session_wrapper is not None:
          logging.info(
              'Wrapping session with wrapper function: %s', session_wrapper)
          sess = session_wrapper(sess)
        if is_chief:
          if logdir:
            sv.start_standard_services(sess)
        elif startup_delay_steps > 0:
           # (use sys.maxsize because sys.maxint doesn't exist in Python 3)
          _wait_for_step(sess, global_step,
                         min(startup_delay_steps, number_of_steps or
                             sys.maxsize))
        threads = sv.start_queue_runners(sess)
        logging.info('Starting Queues.')
        if is_chief and sync_optimizer is not None:
          sv.start_queue_runners(sess, chief_queue_runner)
          sess.run(init_tokens_op)
        try:
          while not sv.should_stop():
            total_loss, should_stop = train_step_fn(
                sess, train_op, global_step, train_step_kwargs)
            if should_stop:
              logging.info('Stopping Training.')
              sv.request_stop()
              break
        except errors.OutOfRangeError as e:
          # OutOfRangeError is thrown when epoch limit per
          # tf.train.limit_epochs is reached.
          logging.info('Caught OutOfRangeError. Stopping Training. %s', e)
        if logdir and sv.is_chief:
          logging.info('Finished training! Saving model to disk.')
          sv.saver.save(sess, sv.save_path, global_step=sv.global_step)
          sv.stop(
              threads,
              close_summary_writer=True,
              ignore_live_threads=ignore_live_threads)

    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      logging.info('Retrying training!')
      should_retry = True

  return total_loss
Exemplo n.º 51
0
def evaluation_loop(master,
                    checkpoint_dir,
                    logdir,
                    num_evals=1,
                    initial_op=None,
                    initial_op_feed_dict=None,
                    eval_op=None,
                    eval_op_feed_dict=None,
                    final_op=None,
                    final_op_feed_dict=None,
                    summary_op=_USE_DEFAULT,
                    summary_op_feed_dict=None,
                    variables_to_restore=None,
                    eval_interval_secs=60,
                    max_number_of_evaluations=None,
                    session_config=None,
                    timeout=None,
                    hooks=None):
    """Runs TF-Slim's Evaluation Loop.
    Args:
      master: The BNS address of the TensorFlow master.
      checkpoint_dir: The directory where checkpoints are stored.
      logdir: The directory where the TensorFlow summaries are written to.
      num_evals: The number of times to run `eval_op`.
      initial_op: An operation run at the beginning of evaluation.
      initial_op_feed_dict: A feed dictionary to use when executing `initial_op`.
      eval_op: A operation run `num_evals` times.
      eval_op_feed_dict: The feed dictionary to use when executing the `eval_op`.
      final_op: An operation to execute after all of the `eval_op` executions. The
        value of `final_op` is returned.
      final_op_feed_dict: A feed dictionary to use when executing `final_op`.
      summary_op: The summary_op to evaluate after running TF-Slims metric ops. By
        default the summary_op is set to tf.summary.merge_all().
      summary_op_feed_dict: An optional feed dictionary to use when running the
        `summary_op`.
      variables_to_restore: A list of TensorFlow variables to restore during
        evaluation. If the argument is left as `None` then
        slim.variables.GetVariablesToRestore() is used.
      eval_interval_secs: The minimum number of seconds between evaluations.
      max_number_of_evaluations: the max number of iterations of the evaluation.
        If the value is left as 'None', the evaluation continues indefinitely.
      session_config: An instance of `tf.ConfigProto` that will be used to
        configure the `Session`. If left as `None`, the default will be used.
      timeout: The maximum amount of time to wait between checkpoints. If left as
        `None`, then the process will wait indefinitely.
      hooks: Hooks to use while evaluating
    Returns:
      The value of `final_op` or `None` if `final_op` is `None`.
    """

    if summary_op == _USE_DEFAULT:
        summary_op = summary.merge_all()

    if hooks is None:
        hooks = []
    hooks.append(evaluation.StopAfterNEvalsHook(num_evals))

    if summary_op is not None:
        hooks.append(evaluation.SummaryAtEndHook(
            log_dir=logdir, summary_op=summary_op, feed_dict=summary_op_feed_dict))

    saver = None
    if variables_to_restore is not None:
        saver = tf_saver.Saver(variables_to_restore)

    return evaluation.evaluate_repeatedly(
        checkpoint_dir,
        master=master,
        scaffold=monitored_session.Scaffold(
            init_op=initial_op, init_feed_dict=initial_op_feed_dict, saver=saver),
        eval_ops=eval_op,
        feed_dict=eval_op_feed_dict,
        final_ops=final_op,
        final_ops_feed_dict=final_op_feed_dict,
        eval_interval_secs=eval_interval_secs,
        hooks=hooks,
        config=session_config,
        max_number_of_evaluations=max_number_of_evaluations,
        timeout=timeout)
Exemplo n.º 52
0
 def begin(self):
   if self._summary_op is None:
     self._summary_op = summary.merge_all()
Exemplo n.º 53
0
    def set_model(self, model):
        """Sets Keras model and creates summary ops."""

        self.model = model
        self._init_writer(model)
        # histogram summaries only enabled in graph mode
        if not context.executing_eagerly():
            self._make_histogram_ops(model)
            self.merged = tf_summary.merge_all()

        # If both embedding_freq and embeddings_data are available, we will
        # visualize embeddings.
        if self.embeddings_freq and self.embeddings_data is not None:
            # Avoid circular dependency.
            from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
            self.embeddings_data = training_utils.standardize_input_data(
                self.embeddings_data, model.input_names)

            # If embedding_layer_names are not provided, get all of the embedding
            # layers from the model.
            embeddings_layer_names = self.embeddings_layer_names
            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            self.assign_embeddings = []
            embeddings_vars = {}

            self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
            self.step = step = array_ops.placeholder(dtypes.int32)

            for layer in self.model.layers:
                if layer.name in embeddings_layer_names:
                    embedding_input = self.model.get_layer(layer.name).output
                    embedding_size = np.prod(embedding_input.shape[1:])
                    embedding_input = array_ops.reshape(
                        embedding_input, (step, int(embedding_size)))
                    shape = (self.embeddings_data[0].shape[0],
                             int(embedding_size))
                    embedding = variables.Variable(array_ops.zeros(shape),
                                                   name=layer.name +
                                                   '_embedding')
                    embeddings_vars[layer.name] = embedding
                    batch = state_ops.assign(
                        embedding[batch_id:batch_id + step], embedding_input)
                    self.assign_embeddings.append(batch)

            self.saver = saver.Saver(list(embeddings_vars.values()))

            # Create embeddings_metadata dictionary
            if isinstance(self.embeddings_metadata, str):
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings_vars.keys()
                }
            else:
                # If embedding_metadata is already a dictionary
                embeddings_metadata = self.embeddings_metadata

            try:
                from tensorboard.plugins import projector
            except ImportError:
                raise ImportError(
                    'Failed to import TensorBoard. Please make sure that '
                    'TensorBoard integration is complete."')

            # TODO(psv): Add integration tests to test embedding visualization
            # with TensorBoard callback. We are unable to write a unit test for this
            # because TensorBoard dependency assumes TensorFlow package is installed.
            config = projector.ProjectorConfig()
            for layer_name, tensor in embeddings_vars.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if (embeddings_metadata is not None
                        and layer_name in embeddings_metadata):
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Exemplo n.º 54
0
    def build_controller(self):
        """RL optimization interface.

    Returns:
      ops: A dictionary holding handles of the model used for training.
    """

        self._global_step = training_util.get_or_create_global_step()
        ops = {}
        ops["loss"] = 0

        failing_signal = self.compute_reward(self.hparams.failing_signal)

        ctr = {}

        with tf_ops.name_scope("controller_{}".format(self.ctrl_id)):
            with variable_scope.variable_scope("controller_{}".format(
                    self.ctrl_id)):
                ctr["reward"] = {"value": [], "ph": [], "update": []}
                ctr["ready"] = {"value": [], "ph": [], "update": []}
                ctr["best_reward"] = {"value": [], "update": []}
                for i in range(self.hparams.num_children):
                    reward_value = variable_scope.get_local_variable(
                        "reward_{}".format(i),
                        initializer=0.0,
                        dtype=dtypes.float32,
                        trainable=False)
                    reward_ph = array_ops.placeholder(
                        dtypes.float32,
                        shape=(),
                        name="reward_ph_{}".format(i))
                    reward_update = state_ops.assign(reward_value,
                                                     reward_ph,
                                                     use_locking=True)
                    ctr["reward"]["value"].append(reward_value)
                    ctr["reward"]["ph"].append(reward_ph)
                    ctr["reward"]["update"].append(reward_update)
                    best_reward = variable_scope.get_local_variable(
                        "best_reward_{}".format(i),
                        initializer=failing_signal,
                        dtype=dtypes.float32,
                        trainable=False)
                    ctr["best_reward"]["value"].append(best_reward)
                    ctr["best_reward"]["update"].append(
                        state_ops.assign(
                            best_reward,
                            math_ops.minimum(best_reward, reward_update)))

                    ready_value = variable_scope.get_local_variable(
                        "ready_{}".format(i),
                        initializer=True,
                        dtype=dtypes.bool,
                        trainable=False)
                    ready_ph = array_ops.placeholder(
                        dtypes.bool, shape=(), name="ready_ph_{}".format(i))
                    ready_update = state_ops.assign(ready_value,
                                                    ready_ph,
                                                    use_locking=True)
                    ctr["ready"]["value"].append(ready_value)
                    ctr["ready"]["ph"].append(ready_ph)
                    ctr["ready"]["update"].append(ready_update)

            ctr["grouping_y_preds"], ctr[
                "grouping_log_probs"] = self.get_groupings()
            summary.histogram(
                "grouping_actions",
                array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0],
                                [1, array_ops.shape(self.op_embeddings)[0]]))

            with variable_scope.variable_scope("controller_{}".format(
                    self.ctrl_id)):
                ctr["baseline"] = variable_scope.get_local_variable(
                    "baseline",
                    initializer=failing_signal
                    if self.hparams.start_with_failing_signal else 0.0,
                    dtype=dtypes.float32,
                    trainable=False)

            new_baseline = self.hparams.bl_dec * ctr["baseline"] + (
                1 - self.hparams.bl_dec) * math_ops.reduce_mean(
                    ctr["reward"]["value"])
            if not self.hparams.always_update_baseline:
                baseline_mask = math_ops.less(ctr["reward"]["value"],
                                              failing_signal)
                selected_reward = array_ops.boolean_mask(
                    ctr["reward"]["value"], baseline_mask)
                selected_baseline = control_flow_ops.cond(
                    math_ops.reduce_any(baseline_mask),
                    lambda: math_ops.reduce_mean(selected_reward),
                    lambda: constant_op.constant(0, dtype=dtypes.float32))
                ctr["pos_reward"] = selected_baseline
                pos_ = math_ops.less(
                    constant_op.constant(0, dtype=dtypes.float32),
                    selected_baseline)
                selected_baseline = self.hparams.bl_dec * ctr["baseline"] + (
                    1 - self.hparams.bl_dec) * selected_baseline
                selected_baseline = control_flow_ops.cond(
                    pos_, lambda: selected_baseline, lambda: ctr["baseline"])
                new_baseline = control_flow_ops.cond(
                    math_ops.less(self.global_step,
                                  self.hparams.stop_updating_after_steps),
                    lambda: new_baseline, lambda: selected_baseline)
            ctr["baseline_update"] = state_ops.assign(ctr["baseline"],
                                                      new_baseline,
                                                      use_locking=True)

            ctr["y_preds"], ctr["log_probs"] = self.get_placements()
            summary.histogram("actions", ctr["y_preds"]["sample"])
            mask = math_ops.less(ctr["reward"]["value"], failing_signal)
            ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"]
            ctr["loss"] *= (ctr["log_probs"]["sample"] +
                            ctr["grouping_log_probs"]["sample"])

            selected_loss = array_ops.boolean_mask(ctr["loss"], mask)
            selected_loss = control_flow_ops.cond(
                math_ops.reduce_any(mask),
                lambda: math_ops.reduce_mean(-selected_loss),
                lambda: constant_op.constant(0, dtype=dtypes.float32))

            ctr["loss"] = control_flow_ops.cond(
                math_ops.less(self.global_step,
                              self.hparams.stop_updating_after_steps),
                lambda: math_ops.reduce_mean(-ctr["loss"]),
                lambda: selected_loss)

            ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"])
            summary.scalar("loss", ctr["loss"])
            summary.scalar("avg_reward", ctr["reward_s"])
            summary.scalar("best_reward_so_far", best_reward)
            summary.scalar(
                "advantage",
                math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"]))

        with variable_scope.variable_scope("optimizer",
                                           reuse=variable_scope.AUTO_REUSE):
            (ctr["train_op"], ctr["lr"], ctr["grad_norm"],
             ctr["grad_norms"]) = self._get_train_ops(
                 ctr["loss"],
                 tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES),
                 self.global_step,
                 grad_bound=self.hparams.grad_bound,
                 lr_init=self.hparams.lr,
                 lr_dec=self.hparams.lr_dec,
                 start_decay_step=self.hparams.start_decay_step,
                 decay_steps=self.hparams.decay_steps,
                 optimizer_type=self.hparams.optimizer_type)

        summary.scalar("gradnorm", ctr["grad_norm"])
        summary.scalar("lr", ctr["lr"])
        ctr["summary"] = summary.merge_all()
        ops["controller"] = ctr

        self.ops = ops
        return ops
Exemplo n.º 55
0
def train(train_op,
          logdir,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_steps=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

  When the sync_optimizer is supplied, gradient updates are applied
  synchronously. Otherwise, gradient updates are applied asynchronous.

  Args:
    train_op: A `Tensor` that, when executed, will apply the gradients and
      return the loss value.
    logdir: The directory where training logs are written to. If None, model
      checkpoints and summaries will not be written.
    train_step_fn: The function to call in order to execute a single gradient
      step. The function must have take exactly four arguments: the current
      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
      are provided.
    log_every_n_steps: The frequency, in terms of global steps, that the loss
      and global step and logged.
    graph: The graph to pass to the supervisor. If no graph is supplied the
      default graph is used.
    master: The address of the tensorflow master.
    is_chief: Specifies whether or not the training is being run by the primary
      replica during replica training.
    global_step: The `Tensor` representing the global step. If left as `None`,
      then slim.variables.get_or_create_global_step() is used.
    number_of_steps: The max number of gradient steps to take during training.
      If the value is left as None, training proceeds indefinitely.
    init_op: The initialization operation. If left to its default value, then
      the session is initialized by calling `tf.global_variables_initializer()`.
    init_feed_dict: A feed dictionary to use when executing the `init_op`.
    local_init_op: The local initialization operation. If left to its default
      value, then the session is initialized by calling
      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
    init_fn: An optional callable to be executed after `init_op` is called. The
      callable must accept one argument, the session being initialized.
    ready_op: Operation to check if the model is ready to use. If left to its
      default value, then the session checks for readiness by calling
      `tf.report_uninitialized_variables()`.
    summary_op: The summary operation.
    save_summaries_secs: How often, in seconds, to save summaries.
    summary_writer: `SummaryWriter` to use.  Can be `None`
      to indicate that no summaries should be written. If unset, we
      create a SummaryWriter.
    startup_delay_steps: The number of steps to wait for before beginning. Note
      that this must be 0 if a sync_optimizer is supplied.
    saver: Saver to save checkpoints. If None, a default one will be created
      and used.
    save_interval_secs: How often, in seconds, to save the model to `logdir`.
    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
      argument is supplied, gradient updates will be synchronous. If left as
      `None`, gradient updates will be asynchronous.
    session_config: An instance of `tf.ConfigProto` that will be used to
      configure the `Session`. If left as `None`, the default will be used.
    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
      and add it to the summaries every `trace_every_n_steps`. If None, no trace
      information will be produced or saved.

  Returns:
    the value of the loss function after training.

  Raises:
    ValueError: if `train_op` is empty or if `startup_delay_steps` is
      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
      provided.
  """
    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.'
        )

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer,
                    sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies(
                    [local_init_op] if local_init_op is not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.'
                )

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(
                        global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir

    sv = supervisor.Supervisor(graph=graph,
                               is_chief=is_chief,
                               logdir=logdir,
                               init_op=init_op,
                               init_feed_dict=init_feed_dict,
                               local_init_op=local_init_op,
                               ready_for_local_init_op=ready_for_local_init_op,
                               ready_op=ready_op,
                               summary_op=summary_op,
                               summary_writer=summary_writer,
                               global_step=global_step,
                               saver=saver,
                               save_summaries_secs=save_summaries_secs,
                               save_model_secs=save_interval_secs,
                               init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False
            with sv.managed_session(master,
                                    start_standard_services=False,
                                    config=session_config) as sess:
                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(
                        sess, global_step,
                        min(startup_delay_steps, number_of_steps
                            or sys.maxint))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        total_loss, should_stop = train_step_fn(
                            sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess,
                                  sv.save_path,
                                  global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
Exemplo n.º 56
0
  def set_model(self, model):
    """Sets Keras model and creates summary ops."""

    self.model = model
    self._init_writer(model)
    # histogram summaries only enabled in graph mode
    if not context.executing_eagerly():
      self._make_histogram_ops(model)
      self.merged = tf_summary.merge_all()

    # If both embedding_freq and embeddings_data are available, we will
    # visualize embeddings.
    if self.embeddings_freq and self.embeddings_data is not None:
      # Avoid circular dependency.
      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
      self.embeddings_data = training_utils.standardize_input_data(
          self.embeddings_data, model.input_names)

      # If embedding_layer_names are not provided, get all of the embedding
      # layers from the model.
      embeddings_layer_names = self.embeddings_layer_names
      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name
            for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      self.assign_embeddings = []
      embeddings_vars = {}

      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
      self.step = step = array_ops.placeholder(dtypes.int32)

      for layer in self.model.layers:
        if layer.name in embeddings_layer_names:
          embedding_input = self.model.get_layer(layer.name).output
          embedding_size = np.prod(embedding_input.shape[1:])
          embedding_input = array_ops.reshape(embedding_input,
                                              (step, int(embedding_size)))
          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
          embedding = variables.Variable(
              array_ops.zeros(shape), name=layer.name + '_embedding')
          embeddings_vars[layer.name] = embedding
          batch = state_ops.assign(embedding[batch_id:batch_id + step],
                                   embedding_input)
          self.assign_embeddings.append(batch)

      self.saver = saver.Saver(list(embeddings_vars.values()))

      # Create embeddings_metadata dictionary
      if isinstance(self.embeddings_metadata, str):
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings_vars.keys()
        }
      else:
        # If embedding_metadata is already a dictionary
        embeddings_metadata = self.embeddings_metadata

      try:
        from tensorboard.plugins import projector
      except ImportError:
        raise ImportError('Failed to import TensorBoard. Please make sure that '
                          'TensorBoard integration is complete."')

      # TODO(psv): Add integration tests to test embedding visualization
      # with TensorBoard callback. We are unable to write a unit test for this
      # because TensorBoard dependency assumes TensorFlow package is installed.
      config = projector.ProjectorConfig()
      for layer_name, tensor in embeddings_vars.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if (embeddings_metadata is not None and
            layer_name in embeddings_metadata):
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
Exemplo n.º 57
0
def train(train_op,
          logdir,
          metric_op=None,
          metric_collection_name=None,
          train_step_fn=train_step,
          train_step_kwargs=_USE_DEFAULT,
          log_every_n_steps=1,
          graph=None,
          master='',
          is_chief=True,
          global_step=None,
          number_of_samples=None,
          number_of_steps=None,
          number_of_epochs=None,
          batch_size=None,
          init_op=_USE_DEFAULT,
          init_feed_dict=None,
          local_init_op=_USE_DEFAULT,
          init_fn=None,
          ready_op=_USE_DEFAULT,
          summary_op=_USE_DEFAULT,
          save_summaries_secs=600,
          summary_writer=_USE_DEFAULT,
          startup_delay_steps=0,
          saver=None,
          save_interval_secs=600,
          sync_optimizer=None,
          session_config=None,
          trace_every_n_steps=None):
    """Runs a training loop using a TensorFlow supervisor.

    When the sync_optimizer is supplied, gradient updates are applied
    synchronously. Otherwise, gradient updates are applied asynchronous.

    Args:
      train_op: A `Tensor` that, when executed, will apply the gradients and
        return the loss value.

      metric_op: A `Tensor` that, when executed, will update the streaming_metrics ops.
      metric_collection_name: The name associated with the metric_op.
      logdir: The directory where training logs are written to. If None, model
        checkpoints and summaries will not be written.
      train_step_fn: The function to call in order to execute a single gradient
        step. The function must have take exactly four arguments: the current
        session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
      train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
        default, two `Boolean`, scalar ops called "should_stop" and "should_log"
        are provided.
      log_every_n_steps: The frequency, in terms of global steps, that the loss
        and global step and logged.
      graph: The graph to pass to the supervisor. If no graph is supplied the
        default graph is used.
      master: The address of the tensorflow master.
      is_chief: Specifies whether or not the training is being run by the primary
        replica during replica training.
      global_step: The `Tensor` representing the global step. If left as `None`,
        then slim.variables.get_or_create_global_step() is used.
      number_of_steps: The max number of gradient steps to take during training,
        as measured by 'global_step': training will stop if global_step is
        greater than 'number_of_steps'. If the value is left as None, training
        proceeds indefinitely.
      number_of_epochs: The total number of epochs per training.
      batch_size: The number of samples in each batch.
      init_op: The initialization operation. If left to its default value, then
        the session is initialized by calling `tf.global_variables_initializer()`.
      init_feed_dict: A feed dictionary to use when executing the `init_op`.
      local_init_op: The local initialization operation. If left to its default
        value, then the session is initialized by calling
        `tf.local_variables_initializer()` and `tf.tables_initializer()`.
      init_fn: An optional callable to be executed after `init_op` is called. The
        callable must accept one argument, the session being initialized.
      ready_op: Operation to check if the model is ready to use. If left to its
        default value, then the session checks for readiness by calling
        `tf.report_uninitialized_variables()`.
      summary_op: The summary operation.
      save_summaries_secs: How often, in seconds, to save summaries.
      summary_writer: `SummaryWriter` to use.  Can be `None`
        to indicate that no summaries should be written. If unset, we
        create a SummaryWriter.
      startup_delay_steps: The number of steps to wait for before beginning. Note
        that this must be 0 if a sync_optimizer is supplied.
      saver: Saver to save checkpoints. If None, a default one will be created
        and used.
      save_interval_secs: How often, in seconds, to save the model to `logdir`.
      sync_optimizer: an instance of tf.train.SyncReplicasOptimizer. If the
        argument is supplied, gradient updates will be synchronous. If left as
        `None`, gradient updates will be asynchronous.
      session_config: An instance of `tf.ConfigProto` that will be used to
        configure the `Session`. If left as `None`, the default will be used.
      trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
        and add it to the summaries every `trace_every_n_steps`. If None, no trace
        information will be produced or saved.

    Returns:
      the value of the loss function after training.

    Raises:
      ValueError: if `train_op` is empty or if `startup_delay_steps` is
        non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
        negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
        provided.
    """

    # Check if the calculation of some metrics is desired.
    if metric_op is not None:

        # Check the necessary requirements
        if metric_collection_name is None:
            raise ValueError('metric_collection_name must be fed and cannot be No')
        if number_of_samples is None:
            raise ValueError('number_of_samples must be fed and cannot be No')
        if number_of_steps is None:
            raise ValueError('number_of_steps must be fed and cannot be No')
        if number_of_epochs is None:
            raise ValueError('number_of_epochs must be fed and cannot be No')
        if batch_size is None:
            raise ValueError('batch_size must be fed and cannot be None')

    if train_op is None:
        raise ValueError('train_op cannot be None.')

    if logdir is None:
        if summary_op != _USE_DEFAULT:
            raise ValueError('Cannot provide summary_op because logdir=None')
        if saver is not None:
            raise ValueError('Cannot provide saver because logdir=None')
        if trace_every_n_steps is not None:
            raise ValueError('Cannot provide trace_every_n_steps because '
                             'logdir=None')

    if sync_optimizer is not None and startup_delay_steps > 0:
        raise ValueError(
            'startup_delay_steps must be zero when sync_optimizer is supplied.')

    if number_of_steps is not None and number_of_steps <= 0:
        raise ValueError(
            '`number_of_steps` must be either None or a positive number.')

    graph = graph or ops.get_default_graph()
    with graph.as_default():
        if global_step is None:
            global_step = variables.get_or_create_global_step()
        saver = saver or tf_saver.Saver()

        with ops.name_scope('init_ops'):
            if init_op == _USE_DEFAULT:
                init_op = tf_variables.global_variables_initializer()

            if ready_op == _USE_DEFAULT:
                ready_op = tf_variables.report_uninitialized_variables()

            if local_init_op == _USE_DEFAULT:
                local_init_op = control_flow_ops.group(
                    tf_variables.local_variables_initializer(),
                    lookup_ops.tables_initializer())

            if sync_optimizer is not None and isinstance(
                    sync_optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
                with ops.control_dependencies([local_init_op] if local_init_op is
                not None else []):
                    if is_chief:
                        local_init_op = sync_optimizer.chief_init_op
                    else:
                        local_init_op = sync_optimizer.local_step_init_op
                ready_for_local_init_op = sync_optimizer.ready_for_local_init_op
            else:
                ready_for_local_init_op = None

        if summary_op == _USE_DEFAULT:
            summary_op = summary.merge_all()

        if summary_writer == _USE_DEFAULT:
            summary_writer = supervisor.Supervisor.USE_DEFAULT

        if is_chief and sync_optimizer is not None:
            if not isinstance(sync_optimizer,
                              (sync_replicas_optimizer.SyncReplicasOptimizer)):
                raise ValueError(
                    '`sync_optimizer` must be a tf.train.SyncReplicasOptimizer.')

            # Need to create these BEFORE the supervisor finalizes the graph:
            init_tokens_op = sync_optimizer.get_init_tokens_op()
            chief_queue_runner = sync_optimizer.get_chief_queue_runner()

        if train_step_kwargs == _USE_DEFAULT:
            with ops.name_scope('train_step'):
                train_step_kwargs = {}

                if number_of_steps:
                    should_stop_op = math_ops.greater_equal(global_step, number_of_steps)
                else:
                    should_stop_op = constant_op.constant(False)
                train_step_kwargs['should_stop'] = should_stop_op
                train_step_kwargs['should_log'] = math_ops.equal(
                    math_ops.mod(global_step, log_every_n_steps), 0)
                if is_chief and trace_every_n_steps is not None:
                    train_step_kwargs['should_trace'] = math_ops.equal(
                        math_ops.mod(global_step, trace_every_n_steps), 0)
                    train_step_kwargs['logdir'] = logdir
                if number_of_samples is not None and batch_size is not None:
                    train_step_kwargs['num_batches_per_epoch'] = int(number_of_samples / float(batch_size))
                if number_of_samples is not None and batch_size is not None:
                    train_step_kwargs['num_steps_per_epoch'] = int(number_of_steps / float(number_of_epochs))

        # If metric calculation is desired.
        if metric_op is not None:
            # The reset_op is defined for resetting the streaming_variables(streaming_acurracy,...)
            # The reason for defining it here is that the supervisor finalized the graph and the graph will be fixed after it.
            # By calling the reset_op in the train_step function, after each epoch the total & count variables will reset to zero.
            # This help to have the averaged accuracy per epoch which is useful to realize if we are getting to the highest accuracy in the training.
            stream_vars = [i for i in tf.local_variables() if i.name.split('/')[1] == metric_collection_name]
            reset_op = tf.variables_initializer(stream_vars)

    sv = supervisor.Supervisor(
        graph=graph,
        is_chief=is_chief,
        logdir=logdir,
        init_op=init_op,
        init_feed_dict=init_feed_dict,
        local_init_op=local_init_op,
        ready_for_local_init_op=ready_for_local_init_op,
        ready_op=ready_op,
        summary_op=summary_op,
        summary_writer=summary_writer,
        global_step=global_step,
        saver=saver,
        save_summaries_secs=save_summaries_secs,
        save_model_secs=save_interval_secs,
        init_fn=init_fn)

    if summary_writer is not None:
        train_step_kwargs['summary_writer'] = sv.summary_writer

    should_retry = True
    while should_retry:
        try:
            should_retry = False

            with sv.managed_session(
                    master, start_standard_services=False, config=session_config) as sess:

                logging.info('Starting Session.')
                if is_chief:
                    if logdir:
                        sv.start_standard_services(sess)
                elif startup_delay_steps > 0:
                    _wait_for_step(sess, global_step,
                                   min(startup_delay_steps, number_of_steps or
                                       sys.maxint))
                sv.start_queue_runners(sess)
                logging.info('Starting Queues.')
                if is_chief and sync_optimizer is not None:
                    sv.start_queue_runners(sess, [chief_queue_runner])
                    sess.run(init_tokens_op)
                try:
                    while not sv.should_stop():
                        if metric_op is not None:
                            total_loss, should_stop = train_step_fn(
                                sess, train_op, global_step, train_step_kwargs, metric_op, reset_op)
                        else:
                            total_loss, should_stop = train_step_fn(
                                sess, train_op, global_step, train_step_kwargs)
                        if should_stop:
                            logging.info('Stopping Training.')
                            break
                except errors.OutOfRangeError:
                    # OutOfRangeError is thrown when epoch limit per
                    # tf.train.limit_epochs is reached.
                    logging.info('Caught OutOfRangeError. Stopping Training.')
                if logdir and sv.is_chief:
                    logging.info('Finished training! Saving model to disk.')
                    sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one of the
            # distributed tensorflow servers.
            logging.info('Retrying training!')
            should_retry = True

    return total_loss
Exemplo n.º 58
0
 def begin(self):
     if self._summary_writer is None and self._log_dir:
         self._summary_writer = summary.FileWriterCache.get(self._log_dir)
     if self._summary_op is None:
         self._summary_op = summary.merge_all()