def testReadingTwoFileSetsWithTheSameDumpRootSucceeds(self):
  """Two file sets merged into one dump root can be read back together.

  Simulates a multi-host data dump: generates file sets in two different
  directories with the same tfdbg_run_id, moves the second set into the
  first directory (renaming to avoid collisions), and verifies that the
  reader sees the union of both sets' graph-execution traces.
  """
  # To simulate a multi-host data dump, we first generate file sets in two
  # different directories, with the same tfdbg_run_id, and then combine them.
  tfdbg_run_id = "foo"
  for i in range(2):
    writer = debug_events_writer.DebugEventsWriter(
        os.path.join(self.dump_root, str(i)),
        tfdbg_run_id,
        circular_buffer_size=-1)
    if i == 0:
      # Only the first file set carries the graph metadata; the merged dump
      # must still resolve traces from both sets against it.
      debugged_graph = debug_event_pb2.DebuggedGraph(
          graph_id="graph1", graph_name="graph1")
      writer.WriteDebuggedGraph(debugged_graph)
      op_name = "Op_0"
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type="FooOp", op_name=op_name, graph_id="graph1")
      writer.WriteGraphOpCreation(graph_op_creation)
      op_name = "Op_1"
      graph_op_creation = debug_event_pb2.GraphOpCreation(
          op_type="FooOp", op_name=op_name, graph_id="graph1")
      writer.WriteGraphOpCreation(graph_op_creation)
    # File set i traces "Op_i" ten times.
    for _ in range(10):
      trace = debug_event_pb2.GraphExecutionTrace(
          op_name="Op_%d" % i, tfdbg_context_id="graph1")
      writer.WriteGraphExecutionTrace(trace)
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()

  # Move all files from the subdirectory /1 to subdirectory /0.
  dump_root_0 = os.path.join(self.dump_root, "0")
  src_paths = glob.glob(os.path.join(self.dump_root, "1", "*"))
  for src_path in src_paths:
    dst_path = os.path.join(
        dump_root_0,
        # Rename the file set to avoid file name collision.
        re.sub(r"(tfdbg_events\.\d+)", r"\g<1>1", os.path.basename(src_path)))
    os.rename(src_path, dst_path)

  with debug_events_reader.DebugDataReader(dump_root_0) as reader:
    reader.update()
    # Verify the content of the .graph_execution_traces file.
    trace_digests = reader.graph_execution_traces(digest=True)
    self.assertLen(trace_digests, 20)
    # Bug fix: the original loops were `for _ in range(10)` but indexed
    # trace_digests with the stale outer loop variable `i` (== 1 after the
    # writer loop), so each loop re-read a single digest ten times. Index
    # with a fresh loop variable to actually cover digests 0-9 and 10-19.
    for j in range(10):
      trace = reader.read_graph_execution_trace(trace_digests[j])
      self.assertEqual(trace.op_name, "Op_0")
    for j in range(10):
      trace = reader.read_graph_execution_trace(trace_digests[j + 10])
      self.assertEqual(trace.op_name, "Op_1")
def callback(self, op_type, inputs, attrs, outputs, op_name=None, graph=None):
  """Op callback for tracing (dumping) a TF program's execution."""
  del attrs  # Unused
  writer = self.get_writer()
  if not graph:
    # Eager execution: dump the already-computed output tensors directly.
    eager_input_ids = [t._id for t in inputs]  # pylint:disable=protected-access
    writer.WriteExecution(
        self._dump_eager_tensors(outputs, op_type, eager_input_ids))
    return None
  # Graph construction: record the op's creation event, then (unless the
  # op type is on the skip list) instrument its symbolic output tensors.
  context_id = self._get_context_id(graph)
  assert op_name is not None
  output_tensor_ids = self._get_symbolic_tensor_ids(len(outputs))
  creation_proto = debug_event_pb2.GraphOpCreation(
      op_type=op_type,
      op_name=op_name,
      graph_name=graph.name if hasattr(graph, "name") else None,
      graph_id=context_id,
      input_names=[tensor.name for tensor in inputs],
      num_outputs=len(outputs),
      output_tensor_ids=output_tensor_ids,
      code_location=self._process_stack_frames())
  writer.WriteGraphOpCreation(creation_proto)
  skip_ops = op_callbacks_common.OP_CALLBACK_SKIP_OPS
  if outputs and compat.as_bytes(op_type) not in skip_ops:
    return self._instrument_symbolic_tensors(
        outputs, op_type, op_name, context_id, output_tensor_ids)
  return None
def testWriteGraphOpCreationAndDebuggedGraphs(self):
  """Op-creation and debugged-graph events round-trip through the .graphs file."""
  writer = debug_events_writer.DebugEventsWriter(self.dump_root)
  num_op_creations = 10
  for index in range(num_op_creations):
    op_creation = debug_event_pb2.GraphOpCreation()
    op_creation.op_type = "Conv2D"
    op_creation.op_name = "Conv2D_%d" % index
    writer.WriteGraphOpCreation(op_creation)
  graph_proto = debug_event_pb2.DebuggedGraph()
  graph_proto.graph_id = "deadbeaf"
  graph_proto.graph_name = "MyGraph1"
  writer.WriteDebuggedGraph(graph_proto)
  writer.FlushNonExecutionFiles()

  # Exactly one .graphs file should exist under the dump root.
  graphs_file_paths = glob.glob(os.path.join(self.dump_root, "*.graphs"))
  self.assertEqual(len(graphs_file_paths), 1)
  actuals = ReadDebugEvents(graphs_file_paths[0])
  self.assertEqual(len(actuals), num_op_creations + 1)
  # The op-creation events come first, in write order; the debugged-graph
  # event is last.
  for index in range(num_op_creations):
    self.assertEqual(actuals[index].graph_op_creation.op_type, "Conv2D")
    self.assertEqual(actuals[index].graph_op_creation.op_name,
                     "Conv2D_%d" % index)
  self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id,
                   "deadbeaf")
def testRangeReadingGraphExecutionTraces(self, begin, end, expected_begin,
                                         expected_end):
  """Range reads of graph-execution traces honor begin/end bounds."""
  writer = debug_events_writer.DebugEventsWriter(
      self.dump_root, self.tfdbg_run_id, circular_buffer_size=-1)
  writer.WriteDebuggedGraph(
      debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1"))
  # Write five op creations, each immediately followed by one trace.
  for index in range(5):
    name = "Op_%d" % index
    writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(op_name=name, graph_id="graph1"))
    writer.WriteGraphExecutionTrace(
        debug_event_pb2.GraphExecutionTrace(
            op_name=name, tfdbg_context_id="graph1"))
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()
  writer.Close()

  with debug_events_reader.DebugDataReader(self.dump_root) as reader:
    reader.update()
    traces = reader.graph_execution_traces(begin=begin, end=end)
    self.assertLen(traces, expected_end - expected_begin)
    self.assertEqual(traces[0].op_name, "Op_%d" % expected_begin)
    self.assertEqual(traces[-1].op_name, "Op_%d" % (expected_end - 1))
def WriteGraphOpCreation():
  """Writes one GraphOpCreation with a unique, lock-protected op name."""
  op_creation = debug_event_pb2.GraphOpCreation()
  # Hold the lock only while reading and bumping the shared counter.
  with graph_op_state["lock"]:
    op_creation.op_name = "Op%d" % graph_op_state["counter"]
    graph_op_state["counter"] += 1
  writer.WriteGraphOpCreation(op_creation)
  # More-frequent-than-necessary concurrent flushing is not recommended,
  # but tolerated.
  writer.FlushNonExecutionFiles()
def callback(self, op_type, inputs, attrs, outputs, op_name=None, graph=None):
  """Op callback for tracing (dumping) a TF program's execution.

  Handles two modes: symbolic (graph-construction) instrumentation when
  `graph` is provided, and eager-execution dumping otherwise.
  """
  del attrs  # Unused
  writer = self.get_writer()
  if graph:
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    context_id = self._get_context_id(graph)  # Innermost context ID.
    output_tensor_ids = self._get_symbolic_tensor_ids(len(outputs))
    if op_type in ("Const", "Placeholder", "PlaceholderWithDefault"):
      # In some cases, the op name of a Const or Placeholder op in a graph
      # can be duplicate (e.g., `None` or "resource").
      # When this happens, we use the output tensor name to infer
      # the non-duplicated tensor name.
      op_name = outputs[0].name.split(":")[0]
    if is_v1_graph_mode:
      # In v1 graph mode, add control edges from previously created debug
      # tensors to this op's first output, so the debug ops execute even
      # though their outputs are not consumed by the graph.
      for input_tensor in inputs:
        if input_tensor in self._placeholder_to_debug_tensor and outputs:
          outputs[0].op._add_control_input(  # pylint: disable=protected-access
              self._placeholder_to_debug_tensor[input_tensor].op)
    graph_op_creation = debug_event_pb2.GraphOpCreation(
        op_type=op_type,
        op_name=op_name,
        graph_name=graph.name if hasattr(graph, "name") else None,
        graph_id=context_id,
        input_names=[
            self._lookup_tensor_name(input_tensor) for input_tensor in inputs
        ],
        num_outputs=len(outputs),
        output_tensor_ids=output_tensor_ids,
        code_location=self._process_stack_frames())
    writer.WriteGraphOpCreation(graph_op_creation)
    # Only instrument ops that have outputs and are not on the skip list
    # (ops whose instrumentation would be invalid or cause recursion).
    if outputs and compat.as_bytes(
        op_type) not in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
      return self._instrument_symbolic_tensors(
          outputs, op_type, op_name, context_id, output_tensor_ids)
  else:
    op_type_bytes = compat.as_bytes(op_type)
    if op_type_bytes == b"DebugNumericSummaryV2":
      # TODO(b/140334369): Remove this special casing logic once op_callback.
      # automatically prevents infinite recursion in eager mode.
      return None
    if op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
      return None
    context_id = self._func_graph_id_from_func_name(op_type)
    input_ids = [t._id for t in inputs]  # pylint:disable=protected-access
    # Register each output tensor's device with the writer; an empty list
    # when the op has no outputs.
    output_tensor_device_ids = [writer.RegisterDeviceAndGetId(output.device)
                                for output in outputs] if outputs else []
    writer.WriteExecution(self._dump_eager_tensors(
        outputs, op_type, input_ids, output_tensor_device_ids,
        graph_id=context_id))
def write_graph_execution_trace():
  """Writes a paired GraphOpCreation and GraphExecutionTrace atomically named."""
  # Build both protos under the lock so the shared counter yields a
  # consistent op name for the pair; perform the writes outside the lock.
  with graph_execution_trace_state["lock"]:
    name = "Op%d" % graph_execution_trace_state["counter"]
    op_creation = debug_event_pb2.GraphOpCreation(
        op_type="FooOp", op_name=name, graph_id="graph1")
    trace = debug_event_pb2.GraphExecutionTrace(
        op_name=name, tfdbg_context_id="graph1")
    graph_execution_trace_state["counter"] += 1
  writer.WriteGraphOpCreation(op_creation)
  writer.WriteGraphExecutionTrace(trace)
def write_and_update_job():
  """Writes op creations/traces and refreshes the reader until told to stop."""
  while not writer_state["done"]:
    name = "Op%d" % writer_state["counter"]
    writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(
            op_type="FooOp", op_name=name, graph_id="graph1"))
    writer.WriteGraphExecutionTrace(
        debug_event_pb2.GraphExecutionTrace(
            op_name=name, tfdbg_context_id="graph1"))
    writer_state["counter"] += 1
    writer.FlushNonExecutionFiles()
    writer.FlushExecutionFiles()
    # Concurrently refresh the reader while writes are in flight.
    reader.update()
def callback(self, op_type, inputs, attrs, outputs, op_name=None, graph=None):
  """Op callback for tracing (dumping) a TF program's execution."""
  del attrs  # Unused
  writer = self.get_writer()
  if graph:
    # Symbolic (graph-construction) mode: record the op creation, then
    # instrument outputs unless the op type is on the skip list.
    context_id = self._get_context_id(graph)  # Innermost context ID.
    assert op_name is not None
    output_tensor_ids = self._get_symbolic_tensor_ids(len(outputs))
    writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(
            op_type=op_type,
            op_name=op_name,
            graph_name=graph.name if hasattr(graph, "name") else None,
            graph_id=context_id,
            input_names=[tensor.name for tensor in inputs],
            num_outputs=len(outputs),
            output_tensor_ids=output_tensor_ids,
            code_location=self._process_stack_frames()))
    if outputs and compat.as_bytes(
        op_type) not in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
      return self._instrument_symbolic_tensors(
          outputs, op_type, op_name, context_id, output_tensor_ids)
    return None
  # Eager mode: dump the concrete output tensors.
  op_type_bytes = compat.as_bytes(op_type)
  if op_type_bytes == b"DebugNumericSummaryV2":
    # TODO(b/140334369): Remove this special casing logic once op_callback.
    # automatically prevents infinite recursion in eager mode.
    return None
  if op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS:
    return None
  context_id = self._func_graph_id_from_func_name(op_type)
  input_ids = [t._id for t in inputs]  # pylint:disable=protected-access
  writer.WriteExecution(
      self._dump_eager_tensors(outputs, op_type, input_ids,
                               graph_id=context_id))
  return None
def testConcurrentGraphExecutionTraceRandomReads(self):
  """Two threads randomly reading disjoint trace halves see consistent data."""
  circular_buffer_size = -1
  writer = debug_events_writer.DebugEventsWriter(self.dump_root,
                                                 self.tfdbg_run_id,
                                                 circular_buffer_size)
  writer.WriteDebuggedGraph(
      debug_event_pb2.DebuggedGraph(graph_id="graph1", graph_name="graph1"))
  for index in range(100):
    name = "Op%d" % index
    writer.WriteGraphOpCreation(
        debug_event_pb2.GraphOpCreation(
            op_type="FooOp", op_name=name, graph_id="graph1"))
    writer.WriteGraphExecutionTrace(
        debug_event_pb2.GraphExecutionTrace(
            op_name=name, tfdbg_context_id="graph1"))
  writer.FlushNonExecutionFiles()
  writer.FlushExecutionFiles()

  reader = debug_events_reader.DebugDataReader(self.dump_root)
  reader.update()
  traces = [None] * 100

  def read_lower_half():
    # Reads traces 49..0 in reverse order.
    digests = reader.graph_execution_traces(digest=True)
    for index in range(49, -1, -1):
      traces[index] = reader.read_graph_execution_trace(digests[index])

  def read_upper_half():
    # Reads traces 99..50 in reverse order.
    digests = reader.graph_execution_traces(digest=True)
    for index in range(99, 49, -1):
      traces[index] = reader.read_graph_execution_trace(digests[index])

  threads = [
      threading.Thread(target=read_lower_half),
      threading.Thread(target=read_upper_half),
  ]
  for thread in threads:
    thread.start()
  for thread in threads:
    thread.join()
  for index in range(100):
    self.assertEqual(traces[index].op_name, "Op%d" % index)
def testWriteGraphOpCreationAndDebuggedGraphs(self):
  """Op-creation and debugged-graph events round-trip via graphs_iterator()."""
  writer = debug_events_writer.DebugEventsWriter(self.dump_root)
  num_op_creations = 10
  for index in range(num_op_creations):
    op_creation = debug_event_pb2.GraphOpCreation()
    op_creation.op_type = "Conv2D"
    op_creation.op_name = "Conv2D_%d" % index
    writer.WriteGraphOpCreation(op_creation)
  graph_proto = debug_event_pb2.DebuggedGraph()
  graph_proto.graph_id = "deadbeaf"
  graph_proto.graph_name = "MyGraph1"
  writer.WriteDebuggedGraph(graph_proto)
  writer.FlushNonExecutionFiles()

  reader = debug_events_reader.DebugEventsReader(self.dump_root)
  actuals = list(reader.graphs_iterator())
  self.assertLen(actuals, num_op_creations + 1)
  # Events come back in write order: op creations first, graph last.
  for index in range(num_op_creations):
    self.assertEqual(actuals[index].graph_op_creation.op_type, "Conv2D")
    self.assertEqual(actuals[index].graph_op_creation.op_name,
                     "Conv2D_%d" % index)
  self.assertEqual(actuals[num_op_creations].debugged_graph.graph_id,
                   "deadbeaf")