def testLimitingInfNanMonitorAlertCountWorks(self):
  """Checks that `limit=3` caps the number of alerts InfNanMonitor keeps.

  Ten CURT_HEALTH executions, each flagging its output slot 1, are fed to
  the monitor; the alerts are expected to match the first three executions.
  """
  reader = test.mock.MagicMock()
  inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader, limit=3)
  for execution_index in range(10):
    digest = debug_events_reader.ExecutionDigest(
        execution_index * 1000, 1, "FooOp", output_tensor_device_ids=[0, 1])
    # Each entry is [tensor_id, any_inf_nan].
    curt_health_values = [[-1, 0], [-1, 1]]
    flagged_execution = debug_events_reader.Execution(
        digest,
        "worker01",
        ["a1", "b2", "e3"],
        debug_event_pb2.TensorDebugMode.CURT_HEALTH,
        graph_id=None,
        input_tensor_ids=[12, 34],
        output_tensor_ids=[56, 78],
        debug_tensor_values=curt_health_values)
    inf_nan_monitor.on_execution(execution_index, flagged_execution)

  collected = inf_nan_monitor.alerts()
  self.assertLen(collected, 3)
  for position, alert in enumerate(collected):
    self.assertEqual(alert.wall_time, position * 1000)
    self.assertEqual(alert.op_type, "FooOp")
    self.assertEqual(alert.output_slot, 1)
    # CURT_HEALTH mode does not provide these four fields, by design.
    for unavailable in ("size", "num_neg_inf", "num_pos_inf", "num_nan"):
      self.assertIsNone(getattr(alert, unavailable))
    self.assertEqual(alert.execution_index, position)
    self.assertIsNone(alert.graph_execution_trace_index)
def testInfNanMonitorOnExecutionUnderHealthMode(self,
                                                tensor_debug_mode,
                                                debug_tensor_values):
  """Checks the alert produced for one execution with inf/nan counts.

  Args:
    tensor_debug_mode: Debug mode passed to the `Execution` under test.
    debug_tensor_values: Debug tensor values passed to the `Execution`.
      Both arguments are presumably injected by a parameterized decorator
      that is outside this view -- confirm against the class definition.
  """
  reader = test.mock.MagicMock()
  inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
  execution = debug_events_reader.Execution(
      debug_events_reader.ExecutionDigest(
          1234, 1, "BarOp", output_tensor_device_ids=[0, 1]),
      "worker01",
      ["a1", "b2", "e3"],
      tensor_debug_mode,
      graph_id=None,
      input_tensor_ids=[12, 34],
      output_tensor_ids=[56, 78],
      debug_tensor_values=debug_tensor_values)
  inf_nan_monitor.on_execution(60, execution)

  collected = inf_nan_monitor.alerts()
  self.assertLen(collected, 1)
  alert = collected[0]
  # (attribute name, expected value), checked in this order.
  expected_fields = (
      ("wall_time", 1234),
      ("op_type", "BarOp"),
      ("output_slot", 0),
      ("size", 10),
      ("num_neg_inf", 1),
      ("num_pos_inf", 2),
      ("num_nan", 3),
      ("execution_index", 60),
  )
  for field_name, expected in expected_fields:
    self.assertEqual(getattr(alert, field_name), expected)
  self.assertIsNone(alert.graph_execution_trace_index)
def testInfNanMonitorOnExecutionUnderCurtHealthMode(self):
  """Checks the alert produced for one CURT_HEALTH execution.

  The execution flags its output slot 1; exactly one alert is expected,
  with the count fields left as None.
  """
  reader = test.mock.MagicMock()
  inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
  digest = debug_events_reader.ExecutionDigest(
      1234, 1, "FooOp", output_tensor_device_ids=[0, 1])
  # Each entry is [tensor_id, any_inf_nan].
  curt_health_values = [[-1, 0], [-1, 1]]
  flagged_execution = debug_events_reader.Execution(
      digest,
      "worker01",
      ["a1", "b2", "e3"],
      debug_event_pb2.TensorDebugMode.CURT_HEALTH,
      graph_id=None,
      input_tensor_ids=[12, 34],
      output_tensor_ids=[56, 78],
      debug_tensor_values=curt_health_values)
  inf_nan_monitor.on_execution(50, flagged_execution)

  collected = inf_nan_monitor.alerts()
  self.assertLen(collected, 1)
  alert = collected[0]
  self.assertEqual(alert.wall_time, 1234)
  self.assertEqual(alert.op_type, "FooOp")
  self.assertEqual(alert.output_slot, 1)
  # CURT_HEALTH mode does not provide these four fields, by design.
  for unavailable in ("size", "num_neg_inf", "num_pos_inf", "num_nan"):
    self.assertIsNone(getattr(alert, unavailable))
  self.assertEqual(alert.execution_index, 50)
  self.assertIsNone(alert.graph_execution_trace_index)
def testExecutionNoGraphNoInputButWithOutputToJson(self):
  """Checks `Execution.to_json()` when output-related fields are None.

  NOTE(review): the method name says "NoGraphNoInput...WithOutput", but the
  Execution built here has a graph_id and input_tensor_ids while
  output_tensor_ids and debug_tensor_values are None -- confirm which
  scenario is intended.
  """
  full_health = debug_event_pb2.TensorDebugMode.FULL_HEALTH
  digest = debug_events_reader.ExecutionDigest(
      1234, 5678, "FooOp", output_tensor_device_ids=[1357])
  execution = debug_events_reader.Execution(
      digest,
      "localhost",
      ("a1", "b2"),
      full_health,
      graph_id="abcd",
      input_tensor_ids=[13, 37],
      output_tensor_ids=None,
      debug_tensor_values=None)

  serialized = execution.to_json()
  self.jsonRoundTripCheck(serialized)
  # (key, expected value), checked in this order.
  expected_items = (
      ("wall_time", 1234),
      ("op_type", "FooOp"),
      ("output_tensor_device_ids", (1357,)),
      ("host_name", "localhost"),
      ("stack_frame_ids", ("a1", "b2")),
      ("tensor_debug_mode", full_health),
      ("graph_id", "abcd"),
      ("input_tensor_ids", (13, 37)),
  )
  for key, expected in expected_items:
    self.assertEqual(serialized[key], expected)
  self.assertIsNone(serialized["output_tensor_ids"])
  self.assertIsNone(serialized["debug_tensor_values"])
def testExecutionDigestWithTwoOutputsToJson(self):
  """Checks `ExecutionDigest.to_json()` for a digest with two output devices."""
  digest = debug_events_reader.ExecutionDigest(
      1234, 5678, "FooOp", output_tensor_device_ids=[1357, 2468])
  serialized = digest.to_json()
  self.jsonRoundTripCheck(serialized)
  # (key, expected value), checked in this order.
  expected_items = (
      ("wall_time", 1234),
      ("op_type", "FooOp"),
      ("output_tensor_device_ids", (1357, 2468)),
  )
  for key, expected in expected_items:
    self.assertEqual(serialized[key], expected)
def testExecutionWithNoOutputTensorsReturnsZeroForNumOutputs(
    self, output_tensor_ids):
  """Checks that `Execution.num_outputs` is 0 for the given output ids.

  Args:
    output_tensor_ids: Value forwarded as the Execution's
      `output_tensor_ids`; presumably supplied by a parameterized decorator
      outside this view -- confirm against the class definition.
  """
  digest = debug_events_reader.ExecutionDigest(1234, 5678, "FooOp")
  full_health = debug_event_pb2.TensorDebugMode.FULL_HEALTH
  execution = debug_events_reader.Execution(
      digest,
      "localhost",
      ("a1", "b2"),
      full_health,
      graph_id="abcd",
      input_tensor_ids=[13, 37],
      output_tensor_ids=output_tensor_ids,
      debug_tensor_values=None)
  self.assertEqual(execution.num_outputs, 0)
def testInfNanMonitorOnExecutionUnderModeWithNoInfNanInfo(
    self, tensor_debug_mode, debug_tensor_values):
  """Checks that no alert is raised for the given mode and debug values.

  Args:
    tensor_debug_mode: Debug mode passed to the `Execution` under test.
    debug_tensor_values: Debug tensor values passed to the `Execution`.
      Both arguments are presumably injected by a parameterized decorator
      that is outside this view -- confirm against the class definition.
  """
  reader = test.mock.MagicMock()
  inf_nan_monitor = debug_events_monitors.InfNanMonitor(reader)
  execution = debug_events_reader.Execution(
      debug_events_reader.ExecutionDigest(
          1234, 1, "BarOp", output_tensor_device_ids=[0, 1]),
      "worker01",
      ["a1", "b2", "e3"],
      tensor_debug_mode,
      graph_id=None,
      input_tensor_ids=[12, 34],
      output_tensor_ids=[56, 78],
      debug_tensor_values=debug_tensor_values)
  inf_nan_monitor.on_execution(60, execution)
  self.assertEmpty(inf_nan_monitor.alerts())
def testInfNanMonitorOnExecutionUnderFullTensorModeWorks(
    self, tensor_value, dtype, expected_size, expected_num_neg_inf,
    expected_num_pos_inf, expected_num_nan):
  """Checks InfNanMonitor alerts for a FULL_TENSOR-mode execution.

  The mocked reader returns two tensors for the execution: a fixed finite
  one, and one built from `tensor_value`/`dtype`. An alert for output slot 1
  is expected only when any of the expected inf/nan counts is non-zero.

  Args:
    tensor_value: Data for the second output tensor returned by the reader.
    dtype: NumPy dtype used to build that tensor.
    expected_size: Expected `size` of the alert, if one is raised.
    expected_num_neg_inf: Expected `num_neg_inf` of the alert.
    expected_num_pos_inf: Expected `num_pos_inf` of the alert.
    expected_num_nan: Expected `num_nan` of the alert.
  """
  mock_reader = test.mock.MagicMock()
  mock_reader.execution_to_tensor_values.return_value = [
      np.array([[0.0, -1.0, 1.0]]),
      np.array(tensor_value, dtype=dtype)
  ]
  monitor = debug_events_monitors.InfNanMonitor(mock_reader)
  execution_digest = debug_events_reader.ExecutionDigest(
      1234,
      1,
      "__inference_bar_function_1234",
      output_tensor_device_ids=[0, 1])
  execution = debug_events_reader.Execution(
      execution_digest,
      "worker01",
      ["a1", "b2", "e3"],
      debug_event_pb2.TensorDebugMode.FULL_TENSOR,
      graph_id=None,
      input_tensor_ids=[12, 34],
      output_tensor_ids=[56, 78])
  monitor.on_execution(70, execution)

  if expected_num_neg_inf or expected_num_pos_inf or expected_num_nan:
    self.assertLen(monitor.alerts(), 1)
    alert = monitor.alerts()[0]
    self.assertEqual(alert.wall_time, 1234)
    self.assertEqual(alert.op_type, "__inference_bar_function_1234")
    self.assertEqual(alert.output_slot, 1)
    self.assertEqual(alert.size, expected_size)
    self.assertEqual(alert.num_neg_inf, expected_num_neg_inf)
    self.assertEqual(alert.num_pos_inf, expected_num_pos_inf)
    self.assertEqual(alert.num_nan, expected_num_nan)
    self.assertEqual(alert.execution_index, 70)
    # assertIsNone takes (obj, msg=None); the stray second argument `70`
    # was being treated as the failure message, so it has been dropped.
    self.assertIsNone(alert.graph_execution_trace_index)
  else:
    self.assertEmpty(monitor.alerts())