def testInfNaNMonitorOnGraphExecutionTraceCurtHealthMode(self):
    """Feeds one CURT_HEALTH graph execution trace to an InfNanMonitor.

    The trace's debug tensor value has its inf/nan flag set, so exactly one
    alert is expected. The alert is checked against the values used to build
    the trace digest (1234, "FooOp", 2) and the trace index passed to the
    monitor (55).
    """
    digest = debug_events_reader.GraphExecutionTraceDigest(
        1234, 1, "FooOp", "FooOp_1", 2, "g1")
    # debug_tensor_value layout: [tensor_id, any_inf_nan].
    curt_health_trace = debug_events_reader.GraphExecutionTrace(
        digest,
        ["g0", "g1"],
        debug_event_pb2.TensorDebugMode.CURT_HEALTH,
        debug_tensor_value=[9, 1])
    inf_nan_monitor = debug_events_monitors.InfNanMonitor(
        test.mock.MagicMock())

    inf_nan_monitor.on_graph_execution_trace(55, curt_health_trace)

    self.assertLen(inf_nan_monitor.alerts(), 1)
    only_alert = inf_nan_monitor.alerts()[0]
    self.assertEqual(only_alert.wall_time, 1234)
    self.assertEqual(only_alert.op_type, "FooOp")
    self.assertEqual(only_alert.output_slot, 2)
    # The four fields below are unavailable under CURT_HEALTH mode by design.
    for unavailable_field in ("size", "num_neg_inf", "num_pos_inf", "num_nan"):
        self.assertIsNone(getattr(only_alert, unavailable_field))
    # The alert came from a graph execution trace, so only the trace index
    # is populated.
    self.assertIsNone(only_alert.execution_index)
    self.assertEqual(only_alert.graph_execution_trace_index, 55)
def testInfNaNMonitorOnGraphExecutionTraceConciseHealthMode(self):
    """Feeds one CONCISE_HEALTH graph execution trace to an InfNanMonitor.

    The debug tensor value reports non-zero -inf / +inf / nan counts, so
    exactly one alert is expected, and its size and per-category counts must
    match the values encoded in the debug tensor value.
    """
    mock_reader = test.mock.MagicMock()
    monitor = debug_events_monitors.InfNanMonitor(mock_reader)
    trace_digest = debug_events_reader.GraphExecutionTraceDigest(
        1234, 1, "FooOp", "FooOp_1", 2, "g1")
    trace = debug_events_reader.GraphExecutionTrace(
        trace_digest,
        ["g0", "g1"],
        debug_event_pb2.TensorDebugMode.CONCISE_HEALTH,
        # [tensor_id, size, num_neg_inf, num_pos_inf, num_nan].
        debug_tensor_value=[9, 100, 3, 2, 1])

    monitor.on_graph_execution_trace(55, trace)

    self.assertLen(monitor.alerts(), 1)
    alert = monitor.alerts()[0]
    self.assertEqual(alert.wall_time, 1234)
    self.assertEqual(alert.op_type, "FooOp")
    self.assertEqual(alert.output_slot, 2)
    self.assertEqual(alert.size, 100)
    self.assertEqual(alert.num_neg_inf, 3)
    self.assertEqual(alert.num_pos_inf, 2)
    self.assertEqual(alert.num_nan, 1)
    # The alert came from a graph execution trace, not an eager execution,
    # so the execution index must stay unset (same check as the CURT_HEALTH
    # graph-trace test).
    self.assertIsNone(alert.execution_index)
    self.assertEqual(alert.graph_execution_trace_index, 55)
def testInfNanMonitorOnExecutionUnderFullTensorModeWorks(
        self, tensor_value, dtype, expected_size, expected_num_neg_inf,
        expected_num_pos_inf, expected_num_nan):
    """Checks InfNanMonitor.on_execution for a FULL_TENSOR execution.

    The mocked reader returns two output tensors: a finite one at slot 0 and
    one built from `tensor_value` / `dtype` at slot 1. An alert is expected
    only when at least one of the expected inf/nan counts is non-zero.

    Args:
      tensor_value: Data for the second output tensor (passed to np.array).
      dtype: dtype used to build the second output tensor.
      expected_size: Expected `size` of the alert, if one is raised.
      expected_num_neg_inf: Expected `num_neg_inf` of the alert.
      expected_num_pos_inf: Expected `num_pos_inf` of the alert.
      expected_num_nan: Expected `num_nan` of the alert.

    NOTE(review): the extra arguments are presumably supplied by a
    parameterized-test decorator that is not visible in this chunk.
    """
    mock_reader = test.mock.MagicMock()
    mock_reader.execution_to_tensor_values.return_value = [
        np.array([[0.0, -1.0, 1.0]]),
        np.array(tensor_value, dtype=dtype),
    ]
    monitor = debug_events_monitors.InfNanMonitor(mock_reader)
    execution_digest = debug_events_reader.ExecutionDigest(
        1234,
        1,
        "__inference_bar_function_1234",
        output_tensor_device_ids=[0, 1])
    execution = debug_events_reader.Execution(
        execution_digest,
        "worker01",
        ["a1", "b2", "e3"],
        debug_event_pb2.TensorDebugMode.FULL_TENSOR,
        graph_id=None,
        input_tensor_ids=[12, 34],
        output_tensor_ids=[56, 78])

    monitor.on_execution(70, execution)

    if expected_num_neg_inf or expected_num_pos_inf or expected_num_nan:
        self.assertLen(monitor.alerts(), 1)
        alert = monitor.alerts()[0]
        self.assertEqual(alert.wall_time, 1234)
        self.assertEqual(alert.op_type, "__inference_bar_function_1234")
        # Only the second output tensor (slot 1) can contain inf/nan.
        self.assertEqual(alert.output_slot, 1)
        self.assertEqual(alert.size, expected_size)
        self.assertEqual(alert.num_neg_inf, expected_num_neg_inf)
        self.assertEqual(alert.num_pos_inf, expected_num_pos_inf)
        self.assertEqual(alert.num_nan, expected_num_nan)
        self.assertEqual(alert.execution_index, 70)
        # The alert came from an eager execution, so no graph-trace index
        # is set. assertIsNone's second parameter is the failure message,
        # so no expected value is passed here.
        self.assertIsNone(alert.graph_execution_trace_index)
    else:
        self.assertEmpty(monitor.alerts())
def testInfNanMonitorStartsWithEmptyAlerts(self):
    """A newly constructed InfNanMonitor has no alerts before any event."""
    fresh_monitor = debug_events_monitors.InfNanMonitor(
        test.mock.MagicMock())
    self.assertEmpty(fresh_monitor.alerts())