def testConcurrentExecutionUpdateAndRandomRead(self):
  """Reads the first Execution while a sub-thread keeps writing and updating.

  A sub-thread repeatedly writes one Execution proto, flushes the execution
  files and calls `reader.update()`. The main thread polls the reader until at
  least one execution digest is available, then reads the first one and checks
  its op type.
  """
  circular_buffer_size = -1
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)

  # Shared between the two threads: "counter" numbers the op types and "done"
  # tells the writer thread to stop.
  writer_state = {"counter": 0, "done": False}

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:

    def write_and_update_job():
      while not writer_state["done"]:
        execution = debug_event_pb2.Execution()
        execution.op_type = "OpType%d" % writer_state["counter"]
        writer_state["counter"] += 1
        writer.WriteExecution(execution)
        writer.FlushExecutionFiles()
        reader.update()

    # On the sub-thread, keep writing and reading new Execution protos.
    write_and_update_thread = threading.Thread(target=write_and_update_job)
    write_and_update_thread.start()
    try:
      # On the main thread, do concurrent random read.
      while True:
        exec_digests = reader.executions(digest=True)
        if exec_digests:
          exec_0 = reader.read_execution(exec_digests[0])
          self.assertEqual(exec_0.op_type, "OpType0")
          break
        time.sleep(0.1)
    finally:
      # Stop and join the writer thread even when the read or the assertion
      # above raises; otherwise the thread would loop forever and the test
      # process would hang instead of reporting the failure.
      writer_state["done"] = True
      write_and_update_thread.join()
def testOnExecutionIsCalled(self, tensor_debug_mode):
  """Checks the Execution data a monitor collects for one eager MatMul.

  Args:
    tensor_debug_mode: Mode string forwarded to `enable_dump_debug_info()`.
      The mode-specific checks below cover "NO_TENSOR", "CONCISE_HEALTH" and
      "FULL_TENSOR".
  """
  dump_writer = dumping_callback.enable_dump_debug_info(
      self.dump_root, tensor_debug_mode=tensor_debug_mode)
  lhs = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float32)
  rhs = constant_op.constant([[-1], [1]], dtype=dtypes.float32)
  math_ops.matmul(lhs, rhs)
  dump_writer.FlushNonExecutionFiles()
  dump_writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    # The monitor is constructed on the reader before update() is called.
    monitor = TestMonitor(reader)
    reader.update()

    self.assertLen(monitor.executions, 1)
    self.assertEmpty(monitor.graph_execution_traces)

    matmul_execution = monitor.executions[0]
    self.assertTrue(matmul_execution.wall_time)
    self.assertEqual(matmul_execution.op_type, "MatMul")
    self.assertLen(matmul_execution.output_tensor_device_ids, 1)
    self.assertLen(matmul_execution.input_tensor_ids, 2)
    self.assertLen(matmul_execution.output_tensor_ids, 1)
    self.assertEqual(matmul_execution.num_outputs, 1)
    self.assertEqual(matmul_execution.graph_id, "")

    if tensor_debug_mode == "CONCISE_HEALTH":
      self.assertLen(matmul_execution.debug_tensor_values, 1)
      # [tensor_id, element_count, neg_inf_count, pos_inf_count, nan_count].
      self.assertLen(matmul_execution.debug_tensor_values[0], 5)
    elif tensor_debug_mode in ("NO_TENSOR", "FULL_TENSOR"):
      # Under FULL_TENSOR, full tensor values are not stored in the
      # debug_tensor_values field; they are fetched through the reader.
      self.assertIsNone(matmul_execution.debug_tensor_values)
      if tensor_debug_mode == "FULL_TENSOR":
        self.assertAllClose(
            reader.execution_to_tensor_values(matmul_execution),
            [[[1.], [1.]]])
def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin,
                                         expected_end):
  """Checks `graph_execution_traces(begin=..., end=...)` on five traces.

  Five traces named "Op_0" .. "Op_4" are written; the slice returned for
  `begin`/`end` must span "Op_<expected_begin>" up to (but excluding)
  "Op_<expected_end>".

  Args:
    begin: `begin` argument passed to the reader.
    end: `end` argument passed to the reader.
    expected_begin: Index of the first trace expected in the result.
    expected_end: One past the index of the last trace expected in the result.
  """
  graph_id = "graph1"
  events_writer = debug_events_writer.DebugEventsWriter(
      self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
  events_writer.WriteDebuggedGraph(
      debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1"))

  for op_index in range(5):
    name = "Op_%d" % op_index
    # Each op's creation record is written right before its trace.
    events_writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(op_name=name, graph_id=graph_id))
    events_writer.WriteGraphExecutionTrace(
        debug_event_pb2.GraphExecutionTrace(
            op_name=name, tfdbg_context_id=graph_id))
  events_writer.FlushNonExecutionFiles()
  events_writer.FlushExecutionFiles()
  events_writer.Close()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    selected = reader.graph_execution_traces(begin=begin, end=end)

  expected_count = expected_end - expected_begin
  self.assertLen(selected, expected_count)
  self.assertEqual(selected[0].op_name, "Op_%d" % expected_begin)
  self.assertEqual(selected[-1].op_name, "Op_%d" % (expected_end - 1))
def Runs(self):
    """Return all the run names in the `EventMultiplexer`.

    The `Runs()` method of this class is specialized for the tfdbg2-format
    DebugEvent files. It reports a run only when a `DebugDataReader` can be
    constructed on the `logdir` of this object.

    Returns:
      If tfdbg2-format data exists in the `logdir` of this object, returns:
          ```
          {runName: { "debugger-v2": [tag1, tag2, tag3] } }
          ```
          where `runName` is the hard-coded string
          `DEFAULT_DEBUGGER_RUN_NAME`. This is related to the fact that tfdbg2
          currently contains at most one DebugEvent file set per directory.
          The tag list is currently always empty.
      If no tfdbg2-format data exists in the `logdir`, an empty `dict`.
    """
    from tensorflow.python.debug.lib import debug_events_reader

    try:
        reader = debug_events_reader.DebugDataReader(self._logdir)
        # NOTE(cais): Currently each logdir is enforced to have only one
        # DebugEvent file set. So we add hard-coded default run name.
    except ValueError:
        # When no DebugEvent file set is found in the logdir, a `ValueError`
        # is thrown.
        return {}
    # The reader is only used to probe for data; the `with` block exits it
    # before this method returns.
    with reader:
        return {
            DEFAULT_DEBUGGER_RUN_NAME: {
                # TODO(cais): Add the semantically meaningful tag names such
                # as 'execution_digests_book', 'alerts_book'
                "debugger-v2": []
            }
        }
def testConcurrentSourceFileRandomReads(self):
  """Reads the lines of 100 source files from two threads at once.

  One thread reads files 49..0 and the other 99..50, each in descending
  order; every file must yield exactly the single line written for it.
  """
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id)

  for i in range(100):
    source_file = debug_event_pb2.SourceFile(
        host_name="localhost", file_path="/tmp/file_%d.py" % i)
    source_file.lines.append("# File %d" % i)
    writer.WriteSourceFile(source_file)
  writer.FlushNonExecutionFiles()

  lines = [None] * 100
  # Use the reader as a context manager so that it is exited even when a read
  # or a thread operation below raises.
  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()

    def read_job_1():
      # Read in the reverse order to enhance randomness of the read access.
      for i in range(49, -1, -1):
        lines[i] = reader.source_lines("localhost", "/tmp/file_%d.py" % i)

    def read_job_2():
      for i in range(99, 49, -1):
        lines[i] = reader.source_lines("localhost", "/tmp/file_%d.py" % i)

    thread_1 = threading.Thread(target=read_job_1)
    thread_2 = threading.Thread(target=read_job_2)
    thread_1.start()
    thread_2.start()
    # Both threads are joined before the reader is exited.
    thread_1.join()
    thread_2.join()

  for i in range(100):
    self.assertEqual(lines[i], ["# File %d" % i])
def testReadingTwoFileSetsWithTheDifferentRootsLeadsToError(self):
  """Checks that mixing file sets with different run ids raises ValueError."""
  # To simulate a multi-host data dump, we first generate file sets in two
  # different directories, with different tfdbg_run_ids, and then combine
  # them.
  for i in range(2):
    writer = debug_events_writer.DebugEventsWriter(
        os.path.join(self.dump_root, str(i)),
        "run_id_%d" % i,
        circular_buffer_size=-1)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

  # Move all files from the subdirectory /1 to subdirectory /0.
  dump_root_0 = os.path.join(self.dump_root, "0")
  src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
  for src_path in src_paths:
    dst_path = os.path.join(
        dump_root_0,
        # Rename the file set to avoid file name collision.
        re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path)))
    os.rename(src_path, dst_path)

  # `assertRaisesRegexp` is a deprecated alias that newer Python versions no
  # longer provide; `assertRaisesRegex` is the supported spelling.
  with self.assertRaisesRegex(ValueError,
                              r"Found multiple \(2\) tfdbg2 runs"):
    debug_events_reader.DebugDataReader(dump_root_0)
def testConcurrentExecutionRandomReads(self):
  """Reads 100 Execution protos from two threads at once.

  One thread reads executions 49..0 and the other 99..50, each in descending
  order; every execution read must carry the op type written at its index.
  """
  circular_buffer_size = -1
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)

  for i in range(100):
    execution = debug_event_pb2.Execution()
    execution.op_type = "OpType%d" % i
    writer.WriteExecution(execution)
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  executions = [None] * 100
  # Use the reader as a context manager so that it is exited even when a read
  # or a thread operation below raises.
  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()

    def read_job_1():
      execution_digests = reader.executions(digest=True)
      # Read in the reverse order to enhance randomness of the read access.
      for i in range(49, -1, -1):
        executions[i] = reader.read_execution(execution_digests[i])

    def read_job_2():
      execution_digests = reader.executions(digest=True)
      for i in range(99, 49, -1):
        executions[i] = reader.read_execution(execution_digests[i])

    thread_1 = threading.Thread(target=read_job_1)
    thread_2 = threading.Thread(target=read_job_2)
    thread_1.start()
    thread_2.start()
    # Both threads are joined before the reader is exited.
    thread_1.join()
    thread_2.join()

  for i in range(100):
    self.assertEqual(executions[i].op_type, "OpType%d" % i)
def testConcurrentWritesToExecutionFiles(self):
  """Writes executions and graph traces from many threads, then reads back.

  With a circular buffer of size 5, twenty threads alternately write one
  Execution or one GraphOpCreation + GraphExecutionTrace pair. After
  flushing, each kind must contain exactly 5 records with distinct names.
  """
  circular_buffer_size = 5
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)
  writer.WriteDebuggedGraph(
      debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1"))

  # Each kind of record has its own counter; the lock only guards the counter
  # read-and-increment so that every record gets a unique name.
  execution_state = {"counter": 0, "lock": threading.Lock()}
  graph_execution_trace_state = {"counter": 0, "lock": threading.Lock()}

  def write_execution():
    execution = debug_event_pb2.Execution()
    with execution_state["lock"]:
      execution.op_type = "OpType%d" % execution_state["counter"]
      execution_state["counter"] += 1
    # The write itself is issued outside the lock.
    writer.WriteExecution(execution)

  def write_graph_execution_trace():
    with graph_execution_trace_state["lock"]:
      op_name = "Op%d" % graph_execution_trace_state["counter"]
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type="FooOp", op_name=op_name, graph_id="graph1")
      trace = debug_event_pb2.GraphExecutionTrace(
          op_name=op_name, tfdbg_context_id="graph1")
      graph_execution_trace_state["counter"] += 1
    # The writes themselves are issued outside the lock.
    writer.WriteGraphOpCreation(graph_op_creation)
    writer.WriteGraphExecutionTrace(trace)

  # Even thread indices write executions, odd indices write graph traces.
  jobs = (write_execution, write_graph_execution_trace)
  workers = []
  for index in range(circular_buffer_size * 4):
    worker = threading.Thread(target=jobs[index % 2])
    worker.start()
    workers.append(worker)
  for worker in workers:
    worker.join()

  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()

    # Verify the content of the .execution file.
    executed_op_types = [item.op_type for item in reader.executions()]
    self.assertLen(executed_op_types, circular_buffer_size)
    self.assertLen(executed_op_types, len(set(executed_op_types)))

    # Verify the content of the .graph_execution_traces file.
    op_names = [item.op_name for item in reader.graph_execution_traces()]
    self.assertLen(op_names, circular_buffer_size)
    self.assertLen(op_names, len(set(op_names)))
def testWriteAndReadMetadata(self):
  """Checks the metadata a reader reports for a freshly created dump.

  The starting wall time must be a float no earlier than the moment just
  before the writer was created, and the reported TensorFlow version must
  equal `versions.__version__`.
  """
  time_before_write = time.time()
  metadata_writer = debug_events_writer.DebugEventsWriter(self.dump_root)
  metadata_writer.Close()

  with debug_events_reader.DebugDataReader(self.dump_root) as data_reader:
    self.assertIsInstance(data_reader.starting_wall_time(), float)
    self.assertGreaterEqual(data_reader.starting_wall_time(),
                            time_before_write)
    self.assertEqual(data_reader.tensorflow_version(), versions.__version__)
def Runs(self):
    """Return the run names known to this debugger-v2 multiplexer.

    On the first successful call a `DebugDataReader` is created on the
    `logdir` of this object and kept on `self._reader`, together with an
    `InfNanMonitor` stored in `self._monitors`, and `self._reader.update` is
    handed to `run_in_background`. Later calls reuse the existing reader.

    Returns:
      If the reader exists or could be created:
          ```
          {runName: { "debugger-v2": [tag1, tag2, tag3] } }
          ```
          where `runName` is the hard-coded `DEFAULT_DEBUGGER_RUN_NAME`
          (tfdbg2 currently holds at most one DebugEvent file set per
          directory). The tag list is currently always empty.
      An empty `dict` if setup raised `ImportError`, `AttributeError` or
      `ValueError`.
    """
    if self._reader is None:
        try:
            from tensorflow.python.debug.lib import debug_events_reader
            from tensorflow.python.debug.lib import debug_events_monitors

            self._reader = debug_events_reader.DebugDataReader(self._logdir)
            inf_nan_monitor = debug_events_monitors.InfNanMonitor(
                self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT
            )
            self._monitors = [inf_nan_monitor]
            # NOTE(cais): Currently each logdir is enforced to have only one
            # DebugEvent file set. So we add hard-coded default run name.
            # TODO(cais): Start off a reading thread here, instead of being
            # called only once here.
            run_in_background(self._reader.update)
        except (ImportError, AttributeError, ValueError):
            # ImportError: ensures graceful behavior when tensorflow install
            #   is unavailable.
            # AttributeError: gracefully fail for users without the required
            #   API changes to debug_events_reader.DebugDataReader introduced
            #   in TF 2.1.0.dev20200103. This should be safe to remove when
            #   TF 2.2 is released.
            # ValueError: thrown when no DebugEvent file set is found in the
            #   logdir.
            return {}

    # TODO(cais): Add the semantically meaningful tag names such as
    # 'execution_digests_book', 'alerts_book'
    tags_by_plugin = {"debugger-v2": []}
    return {DEFAULT_DEBUGGER_RUN_NAME: tags_by_plugin}
def testReadingTwoFileSetsWithTheSameDumpRootSucceeds(self):
  """Checks that two file sets sharing one run id can be read together.

  File set 0 holds the graph, both op creations and ten "Op_0" traces; file
  set 1 holds ten "Op_1" traces. After merging the files into one directory
  the reader must expose all twenty traces, "Op_0" first.
  """
  # To simulate a multi-host data dump, we first generate file sets in two
  # different directories, with the same tfdbg_run_id, and then combine them.
  tfdbg_run_id = "foo"
  for i in range(2):
    writer = debug_events_writer.DebugEventsWriter(
        os.path.join(self.dump_root, str(i)),
        tfdbg_run_id,
        circular_buffer_size=-1)
    if i == 0:
      debugged_graph = debug_event_pb2.DebuggedGraph(
          graph_id="graph1", graph_name="graph1")
      writer.WriteDebuggedGraph(debugged_graph)
      op_name = "Op_0"
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type="FooOp", op_name=op_name, graph_id="graph1")
      writer.WriteGraphOpCreation(graph_op_creation)
      op_name = "Op_1"
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type="FooOp", op_name=op_name, graph_id="graph1")
      writer.WriteGraphOpCreation(graph_op_creation)
    for _ in range(10):
      trace = debug_event_pb2.GraphExecutionTrace(
          op_name="Op_%d" % i, tfdbg_context_id="graph1")
      writer.WriteGraphExecutionTrace(trace)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

  # Move all files from the subdirectory /1 to subdirectory /0.
  dump_root_0 = os.path.join(self.dump_root, "0")
  src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
  for src_path in src_paths:
    dst_path = os.path.join(
        dump_root_0,
        # Rename the file set to avoid file name collision.
        re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path)))
    os.rename(src_path, dst_path)

  with debug_events_reader.DebugDataReader(dump_root_0) as reader:
    reader.update()
    # Verify the content of the .graph_execution_traces file.
    trace_digests = reader.graph_execution_traces(digest=True)
    self.assertLen(trace_digests, 20)
    # Index with the loop variable itself so that every one of the twenty
    # digests is checked. Previously a stale `i` left over from the writer
    # loop was used, so only digests 1 and 11 were ever read.
    for i in range(10):
      trace = reader.read_graph_execution_trace(trace_digests[i])
      self.assertEqual(trace.op_name, "Op_0")
    for i in range(10):
      trace = reader.read_graph_execution_trace(trace_digests[i + 10])
      self.assertEqual(trace.op_name, "Op_1")
def _tryCreateReader(self):
    """Create the tfdbg2 reader for the logdir if it does not exist yet.

    When `self._reader` is already set this is a no-op. Otherwise, under
    `self._reader_lock`, a `DebugDataReader` is created on `self._logdir`, an
    `InfNanMonitor` is stored in `self._monitors`, and `self._reader.update`
    is scheduled through `run_repeatedly_in_background` every
    `DEFAULT_RELOAD_INTERVAL_SEC`; the value returned by that call is stored
    in `self._reload_needed_event`.

    The method returns silently, leaving the reader unset, if the tensorflow
    debug modules cannot be imported or if constructing the reader raises
    `AttributeError` or `ValueError`.
    """
    # Cheap check without taking the lock.
    if self._reader:
        return

    with self._reader_lock:
        # Check again now that the lock is held: the reader may have been
        # set between the check above and acquiring the lock.
        if self._reader:
            return

        try:
            # TODO(cais): Avoid conditional imports and instead use
            # plugin loader to gate the loading of this entire plugin.
            from tensorflow.python.debug.lib import debug_events_reader
            from tensorflow.python.debug.lib import (
                debug_events_monitors,
            )
        except ImportError:
            # This ensures graceful behavior when tensorflow install is
            # unavailable or when the installed tensorflow version does not
            # contain the required modules.
            return

        try:
            new_reader = debug_events_reader.DebugDataReader(self._logdir)
        except (AttributeError, ValueError):
            # AttributeError: gracefully fail for users without the required
            #   API changes to debug_events_reader.DebugDataReader introduced
            #   in TF 2.1.0.dev20200103. This should be safe to remove when
            #   TF 2.2 is released.
            # ValueError: thrown when no DebugEvent file set is found in the
            #   logdir.
            return

        self._reader = new_reader
        self._monitors = [
            debug_events_monitors.InfNanMonitor(
                self._reader, limit=DEFAULT_PER_TYPE_ALERT_LIMIT
            )
        ]
        self._reload_needed_event = run_repeatedly_in_background(
            self._reader.update, DEFAULT_RELOAD_INTERVAL_SEC
        )
def testRangeReadingExecutions(self, begin, end, expected_begin,
                               expected_end):
  """Checks `executions(begin=..., end=...)` on five written executions.

  Five executions with op types "OpType0" .. "OpType4" are written; the
  slice returned for `begin`/`end` must span "OpType<expected_begin>" up to
  (but excluding) "OpType<expected_end>".

  Args:
    begin: `begin` argument passed to the reader.
    end: `end` argument passed to the reader.
    expected_begin: Index of the first execution expected in the result.
    expected_end: One past the index of the last execution expected.
  """
  events_writer = debug_events_writer.DebugEventsWriter(
      self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
  for op_index in range(5):
    events_writer.WriteExecution(
        debug_event_pb2.Execution(op_type="OpType%d" % op_index))
  events_writer.FlushExecutionFiles()
  events_writer.Close()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    selected = reader.executions(begin=begin, end=end)

  expected_count = expected_end - expected_begin
  self.assertLen(selected, expected_count)
  self.assertEqual(selected[0].op_type, "OpType%d" % expected_begin)
  self.assertEqual(selected[-1].op_type, "OpType%d" % (expected_end - 1))
def testWriteExecutionEventsWithoutCircularBufferBehavior(self):
  """Checks that every execution is kept when the circular buffer is off.

  Twice `DEFAULT_CIRCULAR_BUFFER_SIZE` executions are written; after a flush
  the reader must return all of them, in write order.
  """
  # A circular buffer size of 0 abolishes the circular buffer behavior.
  # NOTE(review): the 0 is passed as the second positional argument; other
  # call sites pass a run id second and the buffer size third. Confirm this
  # matches the writer's signature in this file's TensorFlow version.
  unbuffered_writer = debug_events_writer.DebugEventsWriter(self.dump_root, 0)
  num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
  for event_index in range(num_execution_events):
    proto = debug_event_pb2.Execution()
    proto.op_type = "OpType%d" % event_index
    unbuffered_writer.WriteExecution(proto)
  unbuffered_writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    read_back = reader.executions()
    self.assertLen(read_back, num_execution_events)
    for position, item in enumerate(read_back):
      self.assertEqual(item.op_type, "OpType%d" % position)
def testConcurrentGraphExecutionTraceUpdateAndRandomRead(self):
  """Reads the first graph trace while a sub-thread keeps writing/updating.

  A sub-thread repeatedly writes one GraphOpCreation + GraphExecutionTrace
  pair, flushes the files and calls `reader.update()`. The main thread polls
  the reader until at least one trace digest is available, then reads the
  first one and checks its op name.
  """
  circular_buffer_size = -1
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)
  debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                 graph_name="graph1")
  writer.WriteDebuggedGraph(debugged_graph)

  # Shared between the two threads: "counter" numbers the op names and "done"
  # tells the writer thread to stop.
  writer_state = {"counter": 0, "done": False}

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:

    def write_and_update_job():
      while not writer_state["done"]:
        op_name = "Op%d" % writer_state["counter"]
        graph_op_creation = debug_event_pb2.GraphOpCreation(
            op_type="FooOp", op_name=op_name, graph_id="graph1")
        writer.WriteGraphOpCreation(graph_op_creation)
        trace = debug_event_pb2.GraphExecutionTrace(
            op_name=op_name, tfdbg_context_id="graph1")
        writer.WriteGraphExecutionTrace(trace)
        writer_state["counter"] += 1
        writer.FlushNonExecutionFiles()
        writer.FlushExecutionFiles()
        reader.update()

    # On the sub-thread, keep writing and reading new GraphExecutionTraces.
    write_and_update_thread = threading.Thread(target=write_and_update_job)
    write_and_update_thread.start()
    try:
      # On the main thread, do concurrent random read.
      while True:
        digests = reader.graph_execution_traces(digest=True)
        if digests:
          trace_0 = reader.read_graph_execution_trace(digests[0])
          self.assertEqual(trace_0.op_name, "Op0")
          break
        time.sleep(0.1)
    finally:
      # Stop and join the writer thread even when the read or the assertion
      # above raises; otherwise the thread would loop forever and the test
      # process would hang instead of reporting the failure.
      writer_state["done"] = True
      write_and_update_thread.join()
def testConcurrentGraphExecutionTraceRandomReads(self):
  """Reads 100 graph execution traces from two threads at once.

  One thread reads traces 49..0 and the other 99..50, each in descending
  order; every trace read must carry the op name written at its index.
  """
  circular_buffer_size = -1
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)
  debugged_graph = debug_event_pb2.DebuggedGraph(graph_id="graph1",
                                                 graph_name="graph1")
  writer.WriteDebuggedGraph(debugged_graph)

  for i in range(100):
    op_name = "Op%d" % i
    graph_op_creation = debug_event_pb2.GraphOpCreation(
        op_type="FooOp", op_name=op_name, graph_id="graph1")
    writer.WriteGraphOpCreation(graph_op_creation)
    trace = debug_event_pb2.GraphExecutionTrace(
        op_name=op_name, tfdbg_context_id="graph1")
    writer.WriteGraphExecutionTrace(trace)
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  traces = [None] * 100
  # Use the reader as a context manager so that it is exited even when a read
  # or a thread operation below raises.
  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()

    def read_job_1():
      digests = reader.graph_execution_traces(digest=True)
      for i in range(49, -1, -1):
        traces[i] = reader.read_graph_execution_trace(digests[i])

    def read_job_2():
      digests = reader.graph_execution_traces(digest=True)
      for i in range(99, 49, -1):
        traces[i] = reader.read_graph_execution_trace(digests[i])

    thread_1 = threading.Thread(target=read_job_1)
    thread_2 = threading.Thread(target=read_job_2)
    thread_1.start()
    thread_2.start()
    # Both threads are joined before the reader is exited.
    thread_1.join()
    thread_2.join()

  for i in range(100):
    self.assertEqual(traces[i].op_name, "Op%d" % i)
def testWriteExecutionEventsWithCircularBuffer(self):
  """Checks that only the newest executions survive the circular buffer.

  Twice `DEFAULT_CIRCULAR_BUFFER_SIZE` executions are written. Nothing may
  be readable before the flush; after it, the reader must return the second
  half of the executions, in write order.
  """
  writer = debug_events_writer.DebugEventsWriter(self.dump_root)
  num_execution_events = debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE * 2
  for i in range(num_execution_events):
    execution = debug_event_pb2.Execution()
    execution.op_type = "OpType%d" % i
    writer.WriteExecution(execution)

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    # Before FlushExecutionFiles() is called. No data should have been written
    # to the file.
    reader.update()
    self.assertFalse(reader.executions())

    writer.FlushExecutionFiles()
    reader.update()
    executions = reader.executions()
    # Without this length check the loop below would pass vacuously if the
    # reader returned no executions at all after the flush.
    self.assertLen(executions,
                   debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE)
    for i, execution in enumerate(executions):
      self.assertEqual(
          execution.op_type,
          "OpType%d" % (i + debug_events_writer.DEFAULT_CIRCULAR_BUFFER_SIZE))
def testOnGraphExecutionTraceIsCalled(self, tensor_debug_mode):
  """Checks the executions and graph traces a monitor sees for a tf.function.

  Args:
    tensor_debug_mode: Mode string forwarded to `enable_dump_debug_info()`.
      The mode-specific checks below cover "CONCISE_HEALTH", "FULL_HEALTH"
      and "FULL_TENSOR".
  """
  xs = constant_op.constant([2., 6., 8., 1., 2.], dtype=dtypes.float32)
  writer = dumping_callback.enable_dump_debug_info(
      self.dump_root, tensor_debug_mode=tensor_debug_mode)

  @def_function.function
  def unique_sum(xs):
    """Sum over the unique values, for testing."""
    unique_xs, indices = array_ops.unique(xs)
    return math_ops.reduce_sum(unique_xs), indices

  unique_sum(xs)
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    test_monitor = TestMonitor(reader)
    reader.update()

    self.assertLen(test_monitor.executions, 1)
    function_execution = test_monitor.executions[0]
    self.assertTrue(function_execution.wall_time)
    self.assertStartsWith(function_execution.op_type,
                          "__inference_unique_sum")
    self.assertLen(function_execution.output_tensor_device_ids, 2)
    self.assertLen(function_execution.input_tensor_ids, 1)
    self.assertLen(function_execution.output_tensor_ids, 2)
    self.assertEqual(function_execution.num_outputs, 2)
    self.assertTrue(function_execution.graph_id)

    traces = test_monitor.graph_execution_traces
    if tensor_debug_mode in ("CONCISE_HEALTH", "FULL_HEALTH"):
      # Both health modes trace [Placeholder:0, Unique:0 , Sum:0]. Unique:1
      # is not traced under these modes, as it's int-dtype. They differ only
      # in the length of each debug tensor value:
      #   CONCISE_HEALTH: [tensor_id, element_count, neg_inf_count,
      #                    pos_inf_count, nan_count].
      #   FULL_HEALTH: [tensor_id, device_id, dtype, rank, element_count,
      #                 neg_inf_count, pos_inf_count, nan_count,
      #                 neg_finite_count, zero_count, pos_finite_count].
      value_length = 5 if tensor_debug_mode == "CONCISE_HEALTH" else 11
      expected_op_types = ("Placeholder", "Unique", "Sum")
      self.assertLen(traces, 3)
      for index, op_type in enumerate(expected_op_types):
        self.assertEqual(traces[index].op_type, op_type)
        self.assertEqual(traces[index].output_slot, 0)
      for index in range(3):
        self.assertLen(traces[index].debug_tensor_value, value_length)
    elif tensor_debug_mode == "FULL_TENSOR":
      # [Placeholder:0, Unique:0, Unique:1, Const:0, Sum:0].
      # Each row: (op type, output slot, comparison method, expected value).
      expected_traces = (
          ("Placeholder", 0, self.assertAllEqual, [2., 6., 8., 1., 2.]),
          ("Unique", 0, self.assertAllEqual, [2., 6., 8., 1.]),
          ("Unique", 1, self.assertAllEqual, [0, 1, 2, 3, 0]),
          ("Const", 0, self.assertAllClose, [0]),
          ("Sum", 0, self.assertAllClose, 17.),
      )
      self.assertLen(traces, 5)
      for index, row in enumerate(expected_traces):
        op_type, output_slot, compare, expected_value = row
        trace = traces[index]
        self.assertEqual(trace.op_type, op_type)
        self.assertEqual(trace.output_slot, output_slot)
        # The trace itself carries no value here; it is read via the reader.
        self.assertIsNone(trace.debug_tensor_value)
        compare(
            reader.graph_execution_trace_to_tensor_value(trace),
            expected_value)
def testKerasModelFitOnOneOrTwoDevices(self, distribution, tensor_debug_mode):
  """Fits a small Keras model under a strategy and inspects the dump.

  Args:
    distribution: Distribution strategy under whose scope the model is built
      and fitted; its `extended.worker_devices` supplies the device names.
    tensor_debug_mode: Mode string forwarded to `enable_dump_debug_info()`.
      The mode-specific checks below cover "NO_TENSOR" and "FULL_TENSOR".
  """
  writer = dumping_callback.enable_dump_debug_info(
      self.dump_root, tensor_debug_mode=tensor_debug_mode)

  with distribution.scope():
    model = keras.Sequential()
    model.add(
        keras.layers.Dense(units=10, input_shape=[5], activation="relu"))
    model.add(keras.layers.Dense(units=1))
    model.compile(loss="mse", optimizer="sgd")

    batch_size = 20
    epochs = 1
    features = np.ones([batch_size, 5])
    labels = np.ones([batch_size, 1])
    history = model.fit(features, labels, epochs=epochs, verbose=0)
    self.assertLen(history.history["loss"], epochs)

    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    fit_executions = [
        item.op_type
        for item in reader.executions()
        if "_distributed_function" in item.op_type
    ]
    self.assertLen(fit_executions, epochs)

    traces = reader.graph_execution_traces()
    worker_devices = distribution.extended.worker_devices
    num_devices = len(worker_devices)
    multi_device = num_devices > 1
    device_name_0 = worker_devices[0]
    if multi_device:
      device_name_1 = worker_devices[1]

    def op_types_on(device_name):
      # Op types of all traces whose device name ends with `device_name`.
      return [
          trace.op_type
          for trace in traces
          if trace.device_name.endswith(device_name)
      ]

    def tensor_values_on(op_type, device_name):
      # Tensor values of all `op_type` traces recorded on `device_name`.
      return [
          reader.graph_execution_trace_to_tensor_value(trace)
          for trace in traces
          if trace.op_type == op_type and
          trace.device_name.endswith(device_name)
      ]

    checked_op_types = ("MatMul", "BiasAdd", "Relu", "ReluGrad")
    device_0_executed_op_types = op_types_on(device_name_0)
    if multi_device:
      device_1_executed_op_types = op_types_on(device_name_1)

    for op_type in checked_op_types:
      self.assertIn(op_type, device_0_executed_op_types)
    if multi_device:
      # If there are two devices involved, assert the ops inside tf.functions
      # are executed and recorded for the equal numbers of times by the
      # dumping op-callback.
      for op_type in checked_op_types:
        self.assertEqual(device_0_executed_op_types.count(op_type),
                         device_1_executed_op_types.count(op_type))

    if tensor_debug_mode == "NO_TENSOR":
      for trace in traces:
        self.assertEqual(trace.debug_tensor_value, [])
    elif tensor_debug_mode == "FULL_TENSOR":
      gpu_0_relu_values = tensor_values_on("Relu", device_name_0)
      self.assertTrue(gpu_0_relu_values)
      gpu_0_relu_grad_values = tensor_values_on("ReluGrad", device_name_0)
      self.assertTrue(gpu_0_relu_grad_values)
      if multi_device:
        # Values on the second device must match the first device's shapes,
        # position by position.
        gpu_1_relu_values = tensor_values_on("Relu", device_name_1)
        self.assertTrue(gpu_1_relu_values)
        for index, value_0 in enumerate(gpu_0_relu_values):
          self.assertEqual(value_0.shape, gpu_1_relu_values[index].shape)
        gpu_1_relu_grad_values = tensor_values_on("ReluGrad", device_name_1)
        self.assertTrue(gpu_1_relu_grad_values)
        for index, value_0 in enumerate(gpu_0_relu_grad_values):
          self.assertEqual(value_0.shape,
                           gpu_1_relu_grad_values[index].shape)
def testDumpingMiniModel(self, distribution, tensor_debug_mode):
  """Runs one training step of `MiniModel` under a strategy and checks dumps.

  After the step the variable values must be 0.75 on one device or 0.5 on
  two devices, and the dumped graph execution traces must contain MatMul and
  exactly one BiasAdd per device.

  Args:
    distribution: Distribution strategy with one or two worker devices.
    tensor_debug_mode: Mode string forwarded to `enable_dump_debug_info()`.
      The mode-specific checks below cover "NO_TENSOR" and "FULL_TENSOR".
  """
  with distribution.scope():
    writer = dumping_callback.enable_dump_debug_info(
        self.dump_root, tensor_debug_mode=tensor_debug_mode)

    mini_model = MiniModel()
    optimizer = gradient_descent.GradientDescentOptimizer(0.25)

    def train_step():
      with backprop.GradientTape() as tape:
        loss = mini_model(array_ops.ones([1, 10]))
      grads = tape.gradient(loss, mini_model.weights)
      grads_and_vars = zip(grads, mini_model.weights)
      optimizer.apply_gradients(grads_and_vars)

    distribution.experimental_run_v2(train_step)

    updated_var_values = self.evaluate(mini_model.variables)
    num_devices = len(distribution.extended.worker_devices)
    # Use a unittest assertion rather than a bare `assert`, which is stripped
    # when Python runs with optimizations enabled.
    self.assertIn(num_devices, (1, 2))
    if num_devices == 1:
      self.assertAllEqual(0.75 * np.ones([10, 1]), updated_var_values[0])
      self.assertAllEqual([0.75], updated_var_values[1])
    else:
      self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
      self.assertAllEqual([0.5], updated_var_values[1])

    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

  device_name_0 = distribution.extended.worker_devices[0]
  logging.info("device_name_0 = %s", device_name_0)
  if num_devices > 1:
    device_name_1 = distribution.extended.worker_devices[1]
    logging.info("device_name_1 = %s", device_name_1)

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    traces = reader.graph_execution_traces()

    # Verify graph-execution traces are available for both devices.
    # We don't assert MatMul occurs exactly once because the gradient of
    # MatMul involves MatMul.
    device_0_executed_op_types = [
        trace.op_type
        for trace in traces
        if trace.device_name.endswith(device_name_0)
    ]
    if num_devices > 1:
      device_1_executed_op_types = [
          trace.op_type
          for trace in traces
          if trace.device_name.endswith(device_name_1)
      ]
    self.assertIn("MatMul", device_0_executed_op_types)
    self.assertEqual(device_0_executed_op_types.count("BiasAdd"), 1)
    if num_devices > 1:
      self.assertIn("MatMul", device_1_executed_op_types)
      self.assertEqual(device_1_executed_op_types.count("BiasAdd"), 1)

    if tensor_debug_mode == "NO_TENSOR":
      for trace in traces:
        self.assertEqual(trace.debug_tensor_value, [])
    elif tensor_debug_mode == "FULL_TENSOR":
      device_0_matmul_values = [
          reader.graph_execution_trace_to_tensor_value(trace)
          for trace in traces
          if trace.op_type == "MatMul" and
          trace.device_name.endswith(device_name_0)
      ]
      device_0_bias_add_values = [
          reader.graph_execution_trace_to_tensor_value(trace)
          for trace in traces
          if trace.op_type == "BiasAdd" and
          trace.device_name.endswith(device_name_0)
      ]
      # Only the first recorded value of each op type is compared.
      self.assertAllClose(device_0_matmul_values[0], [[10.0]])
      self.assertAllClose(device_0_bias_add_values[0], [[11.0]])
      if num_devices > 1:
        device_1_matmul_values = [
            reader.graph_execution_trace_to_tensor_value(trace)
            for trace in traces
            if trace.op_type == "MatMul" and
            trace.device_name.endswith(device_name_1)
        ]
        device_1_bias_add_values = [
            reader.graph_execution_trace_to_tensor_value(trace)
            for trace in traces
            if trace.op_type == "BiasAdd" and
            trace.device_name.endswith(device_name_1)
        ]
        self.assertAllClose(device_1_matmul_values[0], [[10.0]])
        self.assertAllClose(device_1_bias_add_values[0], [[11.0]])