def testConcurrentExecutionUpdateAndRandomRead(self):
    circular_buffer_size = -1
    writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                   self.tfdbg_run_id,
                                                   circular_buffer_size)

    writer_state = {"counter": 0, "done": False}

    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
      def write_and_update_job():
        while True:
          if writer_state["done"]:
            break
          execution = debug_event_pb2.Execution()
          execution.op_type = "OpType%d" % writer_state["counter"]
          writer_state["counter"] += 1
          writer.WriteExecution(execution)
          writer.FlushExecutionFiles()
          reader.update()
      # On the sub-thread, keep writing and reading new Execution protos.
      write_and_update_thread = threading.Thread(target=write_and_update_job)
      write_and_update_thread.start()
      # On the main thread, do concurrent random read.
      while True:
        exec_digests = reader.executions(digest=True)
        if exec_digests:
          exec_0 = reader.read_execution(exec_digests[0])
          self.assertEqual(exec_0.op_type, "OpType0")
          writer_state["done"] = True
          break
        else:
          time.sleep(0.1)
          continue
      write_and_update_thread.join()

  def testOnExecutionIsCalled(self, tensor_debug_mode):
    writer = dumping_callback.enable_dump_debug_info(
        self.dump_root, tensor_debug_mode=tensor_debug_mode)
    x = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float32)
    y = constant_op.constant([[-1], [1]], dtype=dtypes.float32)
    math_ops.matmul(x, y)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
      test_monitor = TestMonitor(reader)
      reader.update()
      self.assertLen(test_monitor.executions, 1)
      self.assertEmpty(test_monitor.graph_execution_traces)
      execution = test_monitor.executions[0]
      self.assertTrue(execution.wall_time)
      self.assertEqual(execution.op_type, "MatMul")
      self.assertLen(execution.output_tensor_device_ids, 1)
      self.assertLen(execution.input_tensor_ids, 2)
      self.assertLen(execution.output_tensor_ids, 1)
      self.assertEqual(execution.num_outputs, 1)
      self.assertEqual(execution.graph_id, "")
      if tensor_debug_mode == "NO_TENSOR":
        self.assertIsNone(execution.debug_tensor_values)
      elif tensor_debug_mode == "CONCISE_HEALTH":
        self.assertLen(execution.debug_tensor_values, 1)
        # [tensor_id, element_count, neg_inf_count, pos_inf_count, nan_count].
        self.assertLen(execution.debug_tensor_values[0], 5)
      elif tensor_debug_mode == "FULL_TENSOR":
        # Full tensor values are not stored in the debug_tensor_values field.
        self.assertIsNone(execution.debug_tensor_values)
        self.assertAllClose(
            reader.execution_to_tensor_values(execution), [[[1.], [1.]]])
  def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin,
                                           expected_end):
    writer = debug_events_writer.DebugEventsWriter(
        self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
    debugged_graph = debug_event_pb2.DebuggedGraph(
        graph_id="graph1", graph_name="graph1")
    writer.WriteDebuggedGraph(debugged_graph)
    for i in range(5):
      op_name = "Op_%d" % i
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_name=op_name, graph_id="graph1")
      writer.WriteGraphOpCreation(graph_op_creation)
      trace = debug_event_pb2.GraphExecutionTrace(
          op_name=op_name, tfdbg_context_id="graph1")
      writer.WriteGraphExecutionTrace(trace)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()
    writer.Close()

    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
      reader.update()
      traces = reader.graph_execution_traces(begin=begin, end=end)
    self.assertLen(traces, expected_end - expected_begin)
    self.assertEqual(traces[0].op_name, "Op_%d" % expected_begin)
    self.assertEqual(traces[-1].op_name, "Op_%d" % (expected_end - 1))
Example #4
    def Runs(self):
        """Return all the run names in the `EventMultiplexer`.

        The `Runs()` method of this class is specialized for tfdbg2-format
        DebugEvent files: it only returns runs that contain tfdbg2 data.

        Returns:
          If tfdbg2-format data exists in the `logdir` of this object:
            ```
            {runName: {"debugger-v2": [tag1, tag2, tag3]}}
            ```
            where `runName` is the hard-coded string
            `DEFAULT_DEBUGGER_RUN_NAME`. This reflects the fact that tfdbg2
            currently supports at most one DebugEvent file set per directory.
          If no tfdbg2-format data exists in the `logdir`: an empty `dict`.
        """
        from tensorflow.python.debug.lib import debug_events_reader

        try:
            reader = debug_events_reader.DebugDataReader(self._logdir)
            # NOTE(cais): Currently each logdir is enforced to have only one
            # DebugEvent file set, so we use a hard-coded default run name.
        except ValueError:
            # When no DebugEvent file set is found in the logdir, a
            # `ValueError` is thrown.
            return {}
        with reader:
            return {
                DEFAULT_DEBUGGER_RUN_NAME: {
                    # TODO(cais): Add the semantically meaningful tag names such as
                    # 'execution_digests_book', 'alerts_book'
                    "debugger-v2": []
                }
            }
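
A minimal usage sketch of the `Runs()` method above. The name `plugin` is a hypothetical instance of the enclosing class, and the shape of the result simply follows the docstring:

runs = plugin.Runs()
if runs:
    # With tfdbg2 data in the logdir there is exactly one run, keyed by
    # DEFAULT_DEBUGGER_RUN_NAME, mapping to {"debugger-v2": [...]}.
    (run_name,) = runs.keys()
    tags = runs[run_name]["debugger-v2"]
else:
    # No tfdbg2-format data was found: Runs() returned {}.
    tags = []
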
  def testConcurrentSourceFileRandomReads(self):
    writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                   self.tfdbg_run_id)

    for i in range(100):
      source_file = debug_event_pb2.SourceFile(
          host_name="localhost", file_path="/tmp/file_%d.py" % i)
      source_file.lines.append("# File %d" % i)
      writer.WriteSourceFile(source_file)
    writer.FlushNonExecutionFiles()

    reader = debug_events_reader.DebugDataReader(self.dump_root)
    reader.update()
    lines = [None] * 100
    def read_job_1():
      # Read in the reverse order to enhance randomness of the read access.
      for i in range(49, -1, -1):
        lines[i] = reader.source_lines("localhost", "/tmp/file_%d.py" % i)
    def read_job_2():
      for i in range(99, 49, -1):
        lines[i] = reader.source_lines("localhost", "/tmp/file_%d.py" % i)
    thread_1 = threading.Thread(target=read_job_1)
    thread_2 = threading.Thread(target=read_job_2)
    thread_1.start()
    thread_2.start()
    thread_1.join()
    thread_2.join()
    for i in range(100):
      self.assertEqual(lines[i], ["# File %d" % i])

  def testReadingTwoFileSetsWithTheDifferentRootsLeadsToError(self):
    # To simulate a multi-host data dump, we first generate file sets in two
    # different directories, with different tfdbg_run_ids, and then combine
    # them.
    for i in range(2):
      writer = debug_events_writer.DebugEventsWriter(
          os.path.join(self.dump_root, str(i)),
          "run_id_%d" % i,
          circular_buffer_size=-1)
      writer.FlushNonExecutionFiles()
      writer.FlushExecutionFiles()

    # Move all files from the subdirectory /1 to subdirectory /0.
    dump_root_0 = os.path.join(self.dump_root, "0")
    src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
    for src_path in src_paths:
      dst_path = os.path.join(
          dump_root_0,
          # Rename the file set to avoid file name collision.
          re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path)))
      os.rename(src_path, dst_path)

    with self.assertRaisesRegex(ValueError,
                                r"Found multiple \(2\) tfdbg2 runs"):
      debug_events_reader.DebugDataReader(dump_root_0)
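
To make the rename in the test above concrete, here is a minimal sketch of what the `re.sub` call does to a single basename; the timestamp-like component in the example name is made up:

import re

src = "tfdbg_events.1600000000.execution"  # illustrative basename only
dst = re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", src)
# dst == "tfdbg_events.16000000001.execution": a "1" is appended to the
# numeric component, so the moved files cannot collide with the file set
# already present under dump_root_0.
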
  def testConcurrentExecutionRandomReads(self):
    circular_buffer_size = -1
    writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                   self.tfdbg_run_id,
                                                   circular_buffer_size)

    for i in range(100):
      execution = debug_event_pb2.Execution()
      execution.op_type = "OpType%d" % i
      writer.WriteExecution(execution)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

    reader = debug_events_reader.DebugDataReader(self.dump_root)
    reader.update()
    executions = [None] * 100
    def read_job_1():
      execution_digests = reader.executions(digest=True)
      # Read in the reverse order to enhance randomness of the read access.
      for i in range(49, -1, -1):
        execution = reader.read_execution(execution_digests[i])
        executions[i] = execution
    def read_job_2():
      execution_digests = reader.executions(digest=True)
      for i in range(99, 49, -1):
        execution = reader.read_execution(execution_digests[i])
        executions[i] = execution
    thread_1 = threading.Thread(target=read_job_1)
    thread_2 = threading.Thread(target=read_job_2)
    thread_1.start()
    thread_2.start()
    thread_1.join()
    thread_2.join()
    for i in range(100):
      self.assertEqual(executions[i].op_type, "OpType%d" % i)
Example #8
    def testConcurrentWritesToExecutionFiles(self):
        circular_buffer_size = 5
        writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                       self.tfdbg_run_id,
                                                       circular_buffer_size)
        debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                       graph_name="graph1")
        writer.WriteDebuggedGraph(debugged_graph)

        execution_state = {"counter": 0, "lock": threading.Lock()}

        def write_execution():
            execution = debug_event_pb2.Execution()
            with execution_state["lock"]:
                execution.op_type = "OpType%d" % execution_state["counter"]
                execution_state["counter"] += 1
            writer.WriteExecution(execution)

        graph_execution_trace_state = {"counter": 0, "lock": threading.Lock()}

        def write_graph_execution_trace():
            with graph_execution_trace_state["lock"]:
                op_name = "Op%d" % graph_execution_trace_state["counter"]
                graph_op_creation = debug_event_pb2.GraphOpCreation(
                    op_type="FooOp", op_name=op_name, graph_id="graph1")
                trace = debug_event_pb2.GraphExecutionTrace(
                    op_name=op_name, tfdbg_context_id="graph1")
                graph_execution_trace_state["counter"] += 1
            writer.WriteGraphOpCreation(graph_op_creation)
            writer.WriteGraphExecutionTrace(trace)

        threads = []
        for i in range(circular_buffer_size * 4):
            if i % 2 == 0:
                target = write_execution
            else:
                target = write_graph_execution_trace
            thread = threading.Thread(target=target)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        writer.FlushNonExecutionFiles()
        writer.FlushExecutionFiles()

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            reader.update()
            # Verify the content of the .execution file.
            executions = reader.executions()
            executed_op_types = [execution.op_type for execution in executions]
            self.assertLen(executed_op_types, circular_buffer_size)
            self.assertLen(executed_op_types, len(set(executed_op_types)))

            # Verify the content of the .graph_execution_traces file.
            op_names = [
                trace.op_name for trace in reader.graph_execution_traces()
            ]
            self.assertLen(op_names, circular_buffer_size)
            self.assertLen(op_names, len(set(op_names)))

    def testWriteAndReadMetadata(self):
        t0 = time.time()
        writer = debug_events_writer.DebugEventsWriter(self.dump_root)
        writer.Close()
        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            self.assertIsInstance(reader.starting_wall_time(), float)
            self.assertGreaterEqual(reader.starting_wall_time(), t0)
            self.assertEqual(reader.tensorflow_version(), versions.__version__)
Example #10
    def Runs(self):
        """Return all the run names in the `EventMultiplexer`.

        The `Runs()` method of this class is specialized for tfdbg2-format
        DebugEvent files: it only returns runs that contain tfdbg2 data.

        Returns:
          If tfdbg2-format data exists in the `logdir` of this object:
            ```
            {runName: {"debugger-v2": [tag1, tag2, tag3]}}
            ```
            where `runName` is the hard-coded string
            `DEFAULT_DEBUGGER_RUN_NAME`. This reflects the fact that tfdbg2
            currently supports at most one DebugEvent file set per directory.
          If no tfdbg2-format data exists in the `logdir`: an empty `dict`.
        """
        if self._reader is None:
            try:
                from tensorflow.python.debug.lib import debug_events_reader
                from tensorflow.python.debug.lib import debug_events_monitors

                self._reader = debug_events_reader.DebugDataReader(
                    self._logdir)
                self._monitors = [
                    debug_events_monitors.InfNanMonitor(
                        self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT)
                ]
                # NOTE(cais): Currently each logdir is enforced to have only one
                # DebugEvent file set, so we use a hard-coded default run name.
                run_in_background(self._reader.update)
                # TODO(cais): Start off a reading thread here, instead of being
                # called only once here.
            except ImportError:
                # This ensures graceful behavior when tensorflow install is
                # unavailable.
                return {}
            except AttributeError:
                # Gracefully fail for users without the required API changes to
                # debug_events_reader.DebugDataReader introduced in
                # TF 2.1.0.dev20200103. This should be safe to remove when
                # TF 2.2 is released.
                return {}
            except ValueError:
                # When no DebugEvent file set is found in the logdir, a
                # `ValueError` is thrown.
                return {}

        return {
            DEFAULT_DEBUGGER_RUN_NAME: {
                # TODO(cais): Add the semantically meaningful tag names such as
                # 'execution_digests_book', 'alerts_book'
                "debugger-v2": []
            }
        }
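
`run_in_background` is used above but defined elsewhere in the plugin. A minimal sketch of such a helper, assuming it only needs to invoke the target once on a daemon thread (the real TensorBoard helper may differ):

import threading

def run_in_background(target):
    # Run `target` once on a daemon thread so that the initial read of the
    # DebugEvent files does not block the caller.
    thread = threading.Thread(target=target, daemon=True)
    thread.start()
    return thread
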
Example #11
    def testReadingTwoFileSetsWithTheSameDumpRootSucceeds(self):
        # To simulate a multi-host data dump, we first generate file sets in two
        # different directories, with the same tfdbg_run_id, and then combine them.
        tfdbg_run_id = "foo"
        for i in range(2):
            writer = debug_events_writer.DebugEventsWriter(
                os.path.join(self.dump_root, str(i)),
                tfdbg_run_id,
                circular_buffer_size=-1)
            if i == 0:
                debugged_graph = debug_event_pb2.DebuggedGraph(
                    graph_id="graph1", graph_name="graph1")
                writer.WriteDebuggedGraph(debugged_graph)
                op_name = "Op_0"
                graph_op_creation = debug_event_pb2.GraphOpCreation(
                    op_type="FooOp", op_name=op_name, graph_id="graph1")
                writer.WriteGraphOpCreation(graph_op_creation)
                op_name = "Op_1"
                graph_op_creation = debug_event_pb2.GraphOpCreation(
                    op_type="FooOp", op_name=op_name, graph_id="graph1")
                writer.WriteGraphOpCreation(graph_op_creation)
            for _ in range(10):
                trace = debug_event_pb2.GraphExecutionTrace(
                    op_name="Op_%d" % i, tfdbg_context_id="graph1")
                writer.WriteGraphExecutionTrace(trace)
            writer.FlushNonExecutionFiles()
            writer.FlushExecutionFiles()

        # Move all files from the subdirectory /1 to subdirectory /0.
        dump_root_0 = os.path.join(self.dump_root, "0")
        src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
        for src_path in src_paths:
            dst_path = os.path.join(
                dump_root_0,
                # Rename the file set to avoid file name collision.
                re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1",
                       os.path.basename(src_path)))
            os.rename(src_path, dst_path)

        with debug_events_reader.DebugDataReader(dump_root_0) as reader:
            reader.update()
            # Verify the content of the .graph_execution_traces file.
            trace_digests = reader.graph_execution_traces(digest=True)
            self.assertLen(trace_digests, 20)
            for i in range(10):
                trace = reader.read_graph_execution_trace(trace_digests[i])
                self.assertEqual(trace.op_name, "Op_0")
            for i in range(10):
                trace = reader.read_graph_execution_trace(trace_digests[i + 10])
                self.assertEqual(trace.op_name, "Op_1")

    def _tryCreateReader(self):
        """Try creating reader for tfdbg2 data in the logdir.

        If the reader has already been created, a new one will not be created and
        this function is a no-op.

        If a reader has not been created, create it and start periodic calls to
        `update()` on a separate thread.
        """
        if self._reader:
            return
        with self._reader_lock:
            if not self._reader:
                try:
                    # TODO(cais): Avoid conditional imports and instead use
                    # plugin loader to gate the loading of this entire plugin.
                    from tensorflow.python.debug.lib import debug_events_reader
                    from tensorflow.python.debug.lib import (
                        debug_events_monitors,
                    )
                except ImportError:
                    # This ensures graceful behavior when tensorflow install is
                    # unavailable or when the installed tensorflow version does not
                    # contain the required modules.
                    return

                try:
                    self._reader = debug_events_reader.DebugDataReader(
                        self._logdir
                    )
                except AttributeError:
                    # Gracefully fail for users without the required API changes to
                    # debug_events_reader.DebugDataReader introduced in
                    # TF 2.1.0.dev20200103. This should be safe to remove when
                    # TF 2.2 is released.
                    return
                except ValueError:
                    # When no DebugEvent file set is found in the logdir, a
                    # `ValueError` is thrown.
                    return

                self._monitors = [
                    debug_events_monitors.InfNanMonitor(
                        self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT
                    )
                ]
                self._reload_needed_event = run_repeatedly_in_background(
                    self._reader.update, DEFAULT_RELOAD_INTERVAL_SEC
                )
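
`run_repeatedly_in_background` is likewise external to this snippet. A minimal sketch, assuming it calls the target every `interval_sec` seconds on a daemon thread and returns a `threading.Event` that can be set to request an immediate extra run (which would match the `_reload_needed_event` name above):

import threading

def run_repeatedly_in_background(target, interval_sec):
    reload_needed = threading.Event()

    def _loop():
        while True:
            target()
            # Wake up early if the event is set; otherwise wait out the
            # polling interval, then call `target` again.
            reload_needed.wait(interval_sec)
            reload_needed.clear()

    threading.Thread(target=_loop, daemon=True).start()
    return reload_needed
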
  def testRangeReadingExecutions(self, begin, end, expected_begin,
                                 expected_end):
    writer = debug_events_writer.DebugEventsWriter(
        self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
    for i in range(5):
      execution = debug_event_pb2.Execution(op_type="OpType%d" % i)
      writer.WriteExecution(execution)
    writer.FlushExecutionFiles()
    writer.Close()

    with debug_events_reader.DebugDataReader(self.dump_root) as reader:
      reader.update()
      executions = reader.executions(begin=begin, end=end)
    self.assertLen(executions, expected_end - expected_begin)
    self.assertEqual(executions[0].op_type, "OpType%d" % expected_begin)
    self.assertEqual(executions[-1].op_type, "OpType%d" % (expected_end - 1))
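
For concreteness: with the five executions written above, `begin` and `end` behave as a half-open range. For example, inside the `with` block, `begin=1, end=3` would give (the parameter values here are illustrative; the semantics mirror the assertions above):

executions = reader.executions(begin=1, end=3)
# len(executions) == 2, with op types "OpType1" and "OpType2";
# `end` is exclusive.
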
Example #14
    def testWriteExecutionEventsWithoutCircularBufferBehavior(self):
        # A circular buffer size of 0 abolishes the circular buffer behavior.
        writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0)
        num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
        for i in range(num_execution_events):
            execution = debug_event_pb2.Execution()
            execution.op_type = "OpType%d" % i
            writer.WriteExecution(execution)
        writer.FlushExecutionFiles()

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            reader.update()
            executions = reader.executions()
            self.assertLen(executions, num_execution_events)
            for i, execution in enumerate(executions):
                self.assertEqual(execution.op_type, "OpType%d" % i)
Example #15
    def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self):
        circular_buffer_size = -1
        writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                       self.tfdbg_run_id,
                                                       circular_buffer_size)
        debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                       graph_name="graph1")
        writer.WriteDebuggedGraph(debugged_graph)

        writer_state = {"counter": 0, "done": False}

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:

            def write_and_update_job():
                while True:
                    if writer_state["done"]:
                        break
                    op_name = "Op%d" % writer_state["counter"]
                    graph_op_creation = debug_event_pb2.GraphOpCreation(
                        op_type="FooOp", op_name=op_name, graph_id="graph1")
                    writer.WriteGraphOpCreation(graph_op_creation)
                    trace = debug_event_pb2.GraphExecutionTrace(
                        op_name=op_name, tfdbg_context_id="graph1")
                    writer.WriteGraphExecutionTrace(trace)
                    writer_state["counter"] += 1
                    writer.FlushNonExecutionFiles()
                    writer.FlushExecutionFiles()
                    reader.update()

            # On the sub-thread, keep writing and reading new GraphExecutionTraces.
            write_and_update_thread = threading.Thread(
                target=write_and_update_job)
            write_and_update_thread.start()
            # On the main thread, do concurrent random read.
            while True:
                digests = reader.graph_execution_traces(digest=True)
                if digests:
                    trace_0 = reader.read_graph_execution_trace(digests[0])
                    self.assertEqual(trace_0.op_name, "Op0")
                    writer_state["done"] = True
                    break
                else:
                    time.sleep(0.1)
                    continue
            write_and_update_thread.join()
Example #16
    def testConcurrentGraphExecutionTraceRandomReads(self):
        circular_buffer_size = -1
        writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                       self.tfdbg_run_id,
                                                       circular_buffer_size)
        debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                       graph_name="graph1")
        writer.WriteDebuggedGraph(debugged_graph)

        for i in range(100):
            op_name = "Op%d" % i
            graph_op_creation = debug_event_pb2.GraphOpCreation(
                op_type="FooOp", op_name=op_name, graph_id="graph1")
            writer.WriteGraphOpCreation(graph_op_creation)
            trace = debug_event_pb2.GraphExecutionTrace(
                op_name=op_name, tfdbg_context_id="graph1")
            writer.WriteGraphExecutionTrace(trace)
        writer.FlushNonExecutionFiles()
        writer.FlushExecutionFiles()

        reader = debug_events_reader.DebugDataReader(self.dump_root)
        reader.update()
        traces = [None] * 100

        def read_job_1():
            digests = reader.graph_execution_traces(digest=True)
            for i in range(49, -1, -1):
                traces[i] = reader.read_graph_execution_trace(digests[i])

        def read_job_2():
            digests = reader.graph_execution_traces(digest=True)
            for i in range(99, 49, -1):
                traces[i] = reader.read_graph_execution_trace(digests[i])

        thread_1 = threading.Thread(target=read_job_1)
        thread_2 = threading.Thread(target=read_job_2)
        thread_1.start()
        thread_2.start()
        thread_1.join()
        thread_2.join()
        for i in range(100):
            self.assertEqual(traces[i].op_name, "Op%d" % i)
Example #17
    def testWriteExecutionEventsWithCircularBuffer(self):
        writer = debug_events_writer.DebugEventsWriter(self.dump_root)
        num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
        for i in range(num_execution_events):
            execution = debug_event_pb2.Execution()
            execution.op_type = "OpType%d" % i
            writer.WriteExecution(execution)

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            # Before FlushExecutionFiles() is called, no data should have been
            # written to the file.
            reader.update()
            self.assertFalse(reader.executions())

            writer.FlushExecutionFiles()
            reader.update()
            executions = reader.executions()
            for i, execution in enumerate(executions):
                self.assertEqual(
                    execution.op_type, "OpType%d" %
                    (i + debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE))
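
The arithmetic behind the final assertion: only the last `DEFAULT_CIRCULAR_BUFFER_SIZE` of the `2 * DEFAULT_CIRCULAR_BUFFER_SIZE` written events survive in the circular buffer, so the first execution read back has op type `"OpType%d" % DEFAULT_CIRCULAR_BUFFER_SIZE`. A sketch of the expected read-back:

buffer_size = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE
expected_op_types = [
    "OpType%d" % i for i in range(buffer_size, 2 * buffer_size)
]
# After FlushExecutionFiles(), reader.executions() should yield exactly
# these op types, in order.
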
    def testOnGraphExecutionTraceIsCalled(self, tensor_debug_mode):
        xs = constant_op.constant([2., 6., 8., 1., 2.], dtype=dtypes.float32)
        writer = dumping_callback.enable_dump_debug_info(
            self.dump_root, tensor_debug_mode=tensor_debug_mode)

        @def_function.function
        def unique_sum(xs):
            """Sum over the unique values, for testing."""
            unique_xs, indices = array_ops.unique(xs)
            return math_ops.reduce_sum(unique_xs), indices

        unique_sum(xs)
        writer.FlushNonExecutionFiles()
        writer.FlushExecutionFiles()

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            test_monitor = TestMonitor(reader)
            reader.update()
            self.assertLen(test_monitor.executions, 1)

            execution = test_monitor.executions[0]
            self.assertTrue(execution.wall_time)
            self.assertStartsWith(execution.op_type, "__inference_unique_sum")
            self.assertLen(execution.output_tensor_device_ids, 2)
            self.assertLen(execution.input_tensor_ids, 1)
            self.assertLen(execution.output_tensor_ids, 2)
            self.assertEqual(execution.num_outputs, 2)
            self.assertTrue(execution.graph_id)

            traces = test_monitor.graph_execution_traces
            if tensor_debug_mode == "CONCISE_HEALTH":
                self.assertLen(traces, 3)  # [Placeholder:0, Unique:0, Sum:0].
                self.assertEqual(traces[0].op_type, "Placeholder")
                self.assertEqual(traces[0].output_slot, 0)
                self.assertEqual(traces[1].op_type, "Unique")
                self.assertEqual(traces[1].output_slot, 0)
                # Unique:1 is not traced under CONCISE_HEALTH mode, as it's int-dtype.
                self.assertEqual(traces[2].op_type, "Sum")
                self.assertEqual(traces[2].output_slot, 0)
                # [tensor_id, element_count, neg_inf_count, pos_inf_count, nan_count].
                self.assertLen(traces[0].debug_tensor_value, 5)
                self.assertLen(traces[1].debug_tensor_value, 5)
                self.assertLen(traces[2].debug_tensor_value, 5)
            elif tensor_debug_mode == "FULL_HEALTH":
                self.assertLen(traces, 3)  # [Placeholder:0, Unique:0, Sum:0].
                self.assertEqual(traces[0].op_type, "Placeholder")
                self.assertEqual(traces[0].output_slot, 0)
                self.assertEqual(traces[1].op_type, "Unique")
                self.assertEqual(traces[1].output_slot, 0)
                # Unique:1 is not traced under FULL_HEALTH mode, as it's int-dtype.
                self.assertEqual(traces[2].op_type, "Sum")
                self.assertEqual(traces[2].output_slot, 0)
                # [tensor_id, device_id, dtype, rank, element_count,
                #  neg_inf_count, pos_inf_count, nan_count,
                #  neg_finite_count, zero_count, pos_finite_count].
                self.assertLen(traces[0].debug_tensor_value, 11)
                self.assertLen(traces[1].debug_tensor_value, 11)
                self.assertLen(traces[2].debug_tensor_value, 11)
            elif tensor_debug_mode == "FULL_TENSOR":
                # [Placeholder:0, Unique:0, Unique:1, Const:0, Sum:0].
                self.assertLen(traces, 5)
                self.assertEqual(traces[0].op_type, "Placeholder")
                self.assertEqual(traces[0].output_slot, 0)
                self.assertIsNone(traces[0].debug_tensor_value)
                self.assertAllEqual(
                    reader.graph_execution_trace_to_tensor_value(traces[0]),
                    [2., 6., 8., 1., 2.])
                self.assertEqual(traces[1].op_type, "Unique")
                self.assertEqual(traces[1].output_slot, 0)
                self.assertIsNone(traces[1].debug_tensor_value)
                self.assertAllEqual(
                    reader.graph_execution_trace_to_tensor_value(traces[1]),
                    [2., 6., 8., 1.])
                self.assertEqual(traces[2].op_type, "Unique")
                self.assertEqual(traces[2].output_slot, 1)
                self.assertIsNone(traces[2].debug_tensor_value)
                self.assertAllEqual(
                    reader.graph_execution_trace_to_tensor_value(traces[2]),
                    [0, 1, 2, 3, 0])
                self.assertEqual(traces[3].op_type, "Const")
                self.assertEqual(traces[3].output_slot, 0)
                self.assertIsNone(traces[3].debug_tensor_value)
                self.assertAllClose(
                    reader.graph_execution_trace_to_tensor_value(traces[3]),
                    [0])
                self.assertEqual(traces[4].op_type, "Sum")
                self.assertEqual(traces[4].output_slot, 0)
                self.assertIsNone(traces[4].debug_tensor_value)
                self.assertAllClose(
                    reader.graph_execution_trace_to_tensor_value(traces[4]),
                    17.)
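
The health-mode branches above depend on fixed `debug_tensor_value` layouts. Below is a small helper sketch for unpacking the 5-element CONCISE_HEALTH vector documented in the comments; the function and its dict keys are illustrative names, not a TensorFlow API:

def unpack_concise_health(debug_tensor_value):
    # Layout per the comments above:
    # [tensor_id, element_count, neg_inf_count, pos_inf_count, nan_count]
    tensor_id, num_elements, neg_inf, pos_inf, nan_count = debug_tensor_value
    return {
        "tensor_id": tensor_id,
        "element_count": num_elements,
        "neg_inf_count": neg_inf,
        "pos_inf_count": pos_inf,
        "nan_count": nan_count,
    }
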
Example #19
    def testKerasModelFitOnOneOrTwoDevices(self, distribution,
                                           tensor_debug_mode):
        writer = dumping_callback.enable_dump_debug_info(
            self.dump_root, tensor_debug_mode=tensor_debug_mode)

        with distribution.scope():
            model = keras.Sequential()
            model.add(
                keras.layers.Dense(units=10,
                                   input_shape=[5],
                                   activation="relu"))
            model.add(keras.layers.Dense(units=1))
            model.compile(loss="mse", optimizer="sgd")

            batch_size = 20
            x = np.ones([batch_size, 5])
            y = np.ones([batch_size, 1])
            epochs = 1
            history = model.fit(x, y, epochs=epochs, verbose=0)
            self.assertLen(history.history["loss"], epochs)

            writer.FlushNonExecutionFiles()
            writer.FlushExecutionFiles()

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            reader.update()
            executions = reader.executions()
            fit_executions = [
                execution.op_type for execution in executions
                if "_distributed_function" in execution.op_type
            ]
            self.assertLen(fit_executions, epochs)

            traces = reader.graph_execution_traces()
            num_devices = len(distribution.extended.worker_devices)
            device_name_0 = distribution.extended.worker_devices[0]
            if num_devices > 1:
                device_name_1 = distribution.extended.worker_devices[1]
            device_0_executed_op_types = [
                trace.op_type for trace in traces
                if trace.device_name.endswith(device_name_0)
            ]
            if num_devices > 1:
                device_1_executed_op_types = [
                    trace.op_type for trace in traces
                    if trace.device_name.endswith(device_name_1)
                ]

            self.assertIn("MatMul", device_0_executed_op_types)
            self.assertIn("BiasAdd", device_0_executed_op_types)
            self.assertIn("Relu", device_0_executed_op_types)
            self.assertIn("ReluGrad", device_0_executed_op_types)
            if num_devices > 1:
                # If two devices are involved, assert that the ops inside
                # tf.functions are executed and recorded the same number of
                # times on each device by the dumping op-callback.
                self.assertEqual(device_0_executed_op_types.count("MatMul"),
                                 device_1_executed_op_types.count("MatMul"))
                self.assertEqual(device_0_executed_op_types.count("BiasAdd"),
                                 device_1_executed_op_types.count("BiasAdd"))
                self.assertEqual(device_0_executed_op_types.count("Relu"),
                                 device_1_executed_op_types.count("Relu"))
                self.assertEqual(device_0_executed_op_types.count("ReluGrad"),
                                 device_1_executed_op_types.count("ReluGrad"))

            if tensor_debug_mode == "NO_TENSOR":
                for trace in traces:
                    self.assertEqual(trace.debug_tensor_value, [])
            elif tensor_debug_mode == "FULL_TENSOR":
                gpu_0_relu_values = [
                    reader.graph_execution_trace_to_tensor_value(trace)
                    for trace in traces if trace.op_type == "Relu"
                    and trace.device_name.endswith(device_name_0)
                ]
                self.assertTrue(gpu_0_relu_values)
                gpu_0_relu_grad_values = [
                    reader.graph_execution_trace_to_tensor_value(trace)
                    for trace in traces if trace.op_type == "ReluGrad"
                    and trace.device_name.endswith(device_name_0)
                ]
                self.assertTrue(gpu_0_relu_grad_values)
                if num_devices > 1:
                    gpu_1_relu_values = [
                        reader.graph_execution_trace_to_tensor_value(trace)
                        for trace in traces if trace.op_type == "Relu"
                        and trace.device_name.endswith(device_name_1)
                    ]
                    self.assertTrue(gpu_1_relu_values)
                    for i in range(len(gpu_0_relu_values)):
                        self.assertEqual(gpu_0_relu_values[i].shape,
                                         gpu_1_relu_values[i].shape)
                    gpu_1_relu_grad_values = [
                        reader.graph_execution_trace_to_tensor_value(trace)
                        for trace in traces if trace.op_type == "ReluGrad"
                        and trace.device_name.endswith(device_name_1)
                    ]
                    self.assertTrue(gpu_1_relu_grad_values)
                    for i in range(len(gpu_0_relu_grad_values)):
                        self.assertEqual(gpu_0_relu_grad_values[i].shape,
                                         gpu_1_relu_grad_values[i].shape)
Example #20
    def testDumpingMiniModel(self, distribution, tensor_debug_mode):
        with distribution.scope():
            writer = dumping_callback.enable_dump_debug_info(
                self.dump_root, tensor_debug_mode=tensor_debug_mode)

            mini_model = MiniModel()
            optimizer = gradient_descent.GradientDescentOptimizer(0.25)

            def train_step():
                with backprop.GradientTape() as tape:
                    loss = mini_model(array_ops.ones([1, 10]))
                    grads = tape.gradient(loss, mini_model.weights)
                    grads_and_vars = zip(grads, mini_model.weights)
                    optimizer.apply_gradients(grads_and_vars)

            distribution.experimental_run_v2(train_step)

            updated_var_values = self.evaluate(mini_model.variables)
            num_devices = len(distribution.extended.worker_devices)
            assert num_devices in (1, 2)
            if num_devices == 1:
                self.assertAllEqual(0.75 * np.ones([10, 1]),
                                    updated_var_values[0])
                self.assertAllEqual([0.75], updated_var_values[1])
            else:
                self.assertAllEqual(0.5 * np.ones([10, 1]),
                                    updated_var_values[0])
                self.assertAllEqual([0.5], updated_var_values[1])

            writer.FlushNonExecutionFiles()
            writer.FlushExecutionFiles()

        device_name_0 = distribution.extended.worker_devices[0]
        logging.info("device_name_0 = %s", device_name_0)
        if num_devices > 1:
            device_name_1 = distribution.extended.worker_devices[1]
            logging.info("device_name_1 = %s", device_name_1)

        with debug_events_reader.DebugDataReader(self.dump_root) as reader:
            reader.update()
            traces = reader.graph_execution_traces()

            # Verify graph-execution traces are available for both devices.
            # We don't assert MatMul occurs exactly once because the gradient of
            # MatMul involves MatMul.
            device_0_executed_op_types = [
                trace.op_type for trace in traces
                if trace.device_name.endswith(device_name_0)
            ]
            if num_devices > 1:
                device_1_executed_op_types = [
                    trace.op_type for trace in traces
                    if trace.device_name.endswith(device_name_1)
                ]
            self.assertIn("MatMul", device_0_executed_op_types)
            self.assertEqual(device_0_executed_op_types.count("BiasAdd"), 1)
            if num_devices > 1:
                self.assertIn("MatMul", device_1_executed_op_types)
                self.assertEqual(device_1_executed_op_types.count("BiasAdd"),
                                 1)

            if tensor_debug_mode == "NO_TENSOR":
                for trace in traces:
                    self.assertEqual(trace.debug_tensor_value, [])
            elif tensor_debug_mode == "FULL_TENSOR":
                device_0_matmul_values = [
                    reader.graph_execution_trace_to_tensor_value(trace)
                    for trace in traces if trace.op_type == "MatMul"
                    and trace.device_name.endswith(device_name_0)
                ]
                device_0_bias_add_values = [
                    reader.graph_execution_trace_to_tensor_value(trace)
                    for trace in traces if trace.op_type == "BiasAdd"
                    and trace.device_name.endswith(device_name_0)
                ]
                self.assertAllClose(device_0_matmul_values[0], [[10.0]])
                self.assertAllClose(device_0_bias_add_values[0], [[11.0]])
                if num_devices > 1:
                    device_1_matmul_values = [
                        reader.graph_execution_trace_to_tensor_value(trace)
                        for trace in traces if trace.op_type == "MatMul"
                        and trace.device_name.endswith(device_name_1)
                    ]
                    device_1_bias_add_values = [
                        reader.graph_execution_trace_to_tensor_value(trace)
                        for trace in traces if trace.op_type == "BiasAdd"
                        and trace.device_name.endswith(device_name_1)
                    ]
                    self.assertAllClose(device_1_matmul_values[0], [[10.0]])
                    self.assertAllClose(device_1_bias_add_values[0], [[11.0]])