예제 #1
0
 def testMultipleEventsFromDifferentDevicesAndSameTensorName(self):
     """Alerts for one tensor name on two devices are reported per device.

     Three alerts for "xent/Log:0" are registered, two of them coming from
     the task:0 device. The report is expected to contain one row per
     device, task:0 first, with the task:0 row combining both of its alerts
     (timestamp 1234 and counts 1, 2, 2).
     """
     tensor = "xent/Log:0"
     task0_device = "/job:worker/replica:0/task:0/gpu:0"
     task1_device = "/job:worker/replica:0/task:1/gpu:0"

     # Registered in this order: task:1 first, then the two task:0 alerts.
     pending_alerts = [
         numerics_alert.NumericsAlert(task1_device, tensor, 1434, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_device, tensor, 1234, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_device, tensor, 1634, 1, 1, 1),
     ]
     registry = numerics_alert.NumericsAlertRegistry()
     for pending in pending_alerts:
         registry.register(pending)

     expected_rows = [
         numerics_alert.NumericsAlertReportRow(
             task0_device, tensor, 1234, 1, 2, 2),
         numerics_alert.NumericsAlertReportRow(
             task1_device, tensor, 1434, 0, 1, 1),
     ]
     self.assertEqual(expected_rows, registry.report())
  def testAlertingEventCallback(self):
    """Checks when the stream handler invokes the numerics alert callback.

    A mock callback is handed to a DebuggerDataStreamHandler, which is then
    fed "DebugNumericSummary" events for "Add:0". The callback is expected to
    be called exactly once for each summary whose element at index 2 (NaN),
    3 (-Inf) or 7 (+Inf) is nonzero, and not at all for the other summaries.
    """
    device = "/job:localhost/replica:0/task:0/cpu:0"
    on_alert = tf.test.mock.Mock()
    handler = debugger_server_lib.DebuggerDataStreamHandler(
        events_writer_manager=FakeEventsWriterManager(self.events_written),
        numerics_alert_callback=on_alert)
    handler.on_core_metadata_event(tf.Event())

    def feed(summary):
      # Wrap the summary values in a float-tensor event and pass it on.
      event = self._create_event_with_float_tensor(
          "Add", 0, "DebugNumericSummary", summary)
      handler.on_value_event(event)

    def expect_single_alert(nan_count, neg_inf_count, pos_inf_count):
      # The callback must have been called exactly once, with this alert.
      on_alert.assert_called_once_with(
          numerics_alert.NumericsAlert(
              device, "Add:0", 0, nan_count, neg_inf_count, pos_inf_count))

    # One clean summary, then one that reports a NaN value.
    feed([0] * 14)
    feed([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    # Only the second event should have triggered the callback.
    expect_single_alert(1, 0, 0)

    # A summary that reports a -Inf value.
    on_alert.reset_mock()
    feed([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    expect_single_alert(0, 1, 0)

    # A summary that reports a +Inf value.
    on_alert.reset_mock()
    feed([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])
    expect_single_alert(0, 0, 1)

    # A summary without any pathological values must not raise an alert.
    on_alert.reset_mock()
    feed([0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0])
    # assert_not_called is not available in Python 3.4.
    self.assertFalse(on_alert.called)
예제 #3
0
    def testAddAlertInReverseChronologicalOrder(self):
        """A history stays consistent when the later alert is added first.

        The alert stamped 1234 is added before the one stamped 1220. The
        history is still expected to report 1220 as the first timestamp and
        1234 as the last, with event counts of 2 (NaN), 1 (-Inf), 1 (+Inf).
        """
        device = "/job:worker/replica:0/task:0/gpu:0"
        tensor = "xent/Log:0"
        history = numerics_alert.NumericsAlertHistory()

        later_alert = numerics_alert.NumericsAlert(
            device, tensor, 1234, 10, 0, 10)
        history.add(later_alert)

        earlier_alert = numerics_alert.NumericsAlert(
            device, tensor, 1220, 20, 20, 0)
        history.add(earlier_alert)

        self.assertEqual(1220, history.first_timestamp())
        self.assertEqual(1234, history.last_timestamp())

        expected_counts = (
            (constants.NAN_KEY, 2),
            (constants.NEG_INF_KEY, 1),
            (constants.POS_INF_KEY, 1),
        )
        for key, expected in expected_counts:
            self.assertEqual(expected, history.event_count(key))
예제 #4
0
 def testConstructFromOneAlert(self):
     """A history holding one alert reports that alert's timestamp and counts.

     The single alert carries the values 10, 0, 10; the expected event
     counts are 1 (NaN), 0 (-Inf) and 1 (+Inf).
     """
     sole_alert = numerics_alert.NumericsAlert(
         "/job:worker/replica:0/task:0/gpu:0", "xent/Log:0", 1234, 10, 0, 10)
     history = numerics_alert.NumericsAlertHistory()
     history.add(sole_alert)

     # With only one alert, the first and last timestamps coincide.
     self.assertEqual(1234, history.first_timestamp())
     self.assertEqual(1234, history.last_timestamp())

     expected_counts = (
         (constants.NAN_KEY, 1),
         (constants.NEG_INF_KEY, 0),
         (constants.POS_INF_KEY, 1),
     )
     for key, expected in expected_counts:
         self.assertEqual(expected, history.event_count(key))
예제 #5
0
 def testMultipleEventsFromSameDeviceAndSameTensor(self):
     """Two alerts for the same device and tensor collapse into one row.

     The alerts carry the values (0, 10, 10) and (5, 5, 5). The single
     expected row has timestamp 1234 and counts 1, 2, 2, i.e. the counts
     are not the sums of the alert values.
     """
     device = "/job:worker/replica:0/task:0/gpu:0"
     tensor = "xent/Log:0"

     pending_alerts = [
         numerics_alert.NumericsAlert(device, tensor, 1234, 0, 10, 10),
         numerics_alert.NumericsAlert(device, tensor, 1634, 5, 5, 5),
     ]
     registry = numerics_alert.NumericsAlertRegistry()
     for pending in pending_alerts:
         registry.register(pending)

     merged_row = numerics_alert.NumericsAlertReportRow(
         device, tensor, 1234, 1, 2, 2)
     self.assertEqual([merged_row], registry.report())
예제 #6
0
 def testFilterReport(self):
     """report() narrows its rows by device-name or tensor-name pattern.

     Three alerts are registered across two devices and two tensor names.
     A device filter matching task:1 and a tensor filter matching "Mean"
     are each expected to leave exactly one row.
     """
     task0_device = "/job:worker/replica:0/task:0/gpu:0"
     task1_device = "/job:worker/replica:0/task:1/gpu:0"

     pending_alerts = [
         numerics_alert.NumericsAlert(
             task1_device, "xent/Log:0", 1434, 0, 1, 1),
         numerics_alert.NumericsAlert(
             task0_device, "xent/Log:0", 1234, 0, 1, 1),
         numerics_alert.NumericsAlert(
             task0_device, "xent/Mean:0", 1634, 1, 1, 1),
     ]
     registry = numerics_alert.NumericsAlertRegistry()
     for pending in pending_alerts:
         registry.register(pending)

     # Filtering by device name keeps only the task:1 alert.
     task1_rows = [
         numerics_alert.NumericsAlertReportRow(
             task1_device, "xent/Log:0", 1434, 0, 1, 1),
     ]
     self.assertEqual(
         task1_rows, registry.report(device_name_filter=r".*\/task:1\/.*"))

     # Filtering by tensor name keeps only the "xent/Mean:0" alert.
     mean_rows = [
         numerics_alert.NumericsAlertReportRow(
             task0_device, "xent/Mean:0", 1634, 1, 1, 1),
     ]
     self.assertEqual(
         mean_rows, registry.report(tensor_name_filter=r".*Mean.*"))
예제 #7
0
    def testCreateJsonableRegistry(self):
        """create_jsonable_registry() yields one entry per registered alert key.

        A single alert with values (0, 1, 1) at timestamp 1434 is
        registered. The resulting entry is expected to expose the device,
        the tensor and a `jsonable_history` mapping whose "nan" value is
        [0, -1, -1] and whose "neg_inf" and "pos_inf" values are
        [1, 1434, 1434].
        """
        device = "/job:worker/replica:0/task:1/gpu:0"
        tensor = "xent/Log:0"

        registry = None
        sole_alert = numerics_alert.NumericsAlert(
            device, tensor, 1434, 0, 1, 1)
        registry = numerics_alert.NumericsAlertRegistry()
        registry.register(sole_alert)

        entries = registry.create_jsonable_registry()
        self.assertEqual(1, len(entries))

        entry = entries[0]
        self.assertEqual(device, entry.device)
        self.assertEqual(tensor, entry.tensor)

        expected_history = (
            ("nan", [0, -1, -1]),
            ("neg_inf", [1, 1434, 1434]),
            ("pos_inf", [1, 1434, 1434]),
        )
        for key, expected in expected_history:
            self.assertListEqual(expected, list(entry.jsonable_history[key]))
예제 #8
0
 def testSingleAlert(self):
     """A registry holding one alert reports exactly one row for it.

     The alert carries the values (0, 10, 10); the expected row carries
     the counts 0, 1, 1 together with the alert's timestamp 1234.
     """
     device = "/job:worker/replica:0/task:0/gpu:0"
     tensor = "xent/Log:0"

     sole_alert = numerics_alert.NumericsAlert(
         device, tensor, 1234, 0, 10, 10)
     registry = numerics_alert.NumericsAlertRegistry()
     registry.register(sole_alert)

     expected_row = numerics_alert.NumericsAlertReportRow(
         device, tensor, 1234, 0, 1, 1)
     self.assertEqual([expected_row], registry.report())
예제 #9
0
    def testRegisterBeyondCapacityObeysCapacity(self):
        """A registry built with capacity=2 does not report a third device.

        Alerts for two devices fill the registry. An alert for a third
        device (task:2) is then expected to leave the report unchanged,
        while a further alert for the already-reported task:0 device is
        expected to update that device's row.
        """
        tensor = "xent/Log:0"
        task0_device = "/job:worker/replica:0/task:0/gpu:0"
        task1_device = "/job:worker/replica:0/task:1/gpu:0"
        task2_device = "/job:worker/replica:0/task:2/gpu:0"

        task1_alert = numerics_alert.NumericsAlert(
            task1_device, tensor, 1434, 0, 1, 1)
        task0_alert = numerics_alert.NumericsAlert(
            task0_device, tensor, 1234, 0, 1, 1)
        task2_alert = numerics_alert.NumericsAlert(
            task2_device, tensor, 1634, 0, 1, 1)
        task0_followup = numerics_alert.NumericsAlert(
            task0_device, tensor, 1834, 1, 1, 1)

        # Rows the report is expected to contain at the various stages.
        task0_row_initial = numerics_alert.NumericsAlertReportRow(
            task0_device, tensor, 1234, 0, 1, 1)
        task0_row_updated = numerics_alert.NumericsAlertReportRow(
            task0_device, tensor, 1234, 1, 2, 2)
        task1_row = numerics_alert.NumericsAlertReportRow(
            task1_device, tensor, 1434, 0, 1, 1)

        registry = numerics_alert.NumericsAlertRegistry(capacity=2)
        registry.register(task1_alert)
        registry.register(task0_alert)
        self.assertEqual([task0_row_initial, task1_row], registry.report())

        # The registry is full: the task:2 alert must not show up.
        registry.register(task2_alert)
        self.assertEqual([task0_row_initial, task1_row], registry.report())

        # An alert for the already-reported task:0 device still updates
        # that device's row.
        registry.register(task0_followup)
        self.assertEqual([task0_row_updated, task1_row], registry.report())