def testNumericsAlertReportResponse(self):
    """Checks the numerics alert report route against two mocked rows.

    The mocked data server is set up to return two report rows. The route
    must answer with HTTP 200, and the deserialized payload must hold one
    dict per row, keyed by the report field names.
    """
    cpu0_row = numerics_alert.NumericsAlertReportRow(
        'cpu0', 'MatMul', 123, 2, 3, 4)
    cpu1_row = numerics_alert.NumericsAlertReportRow(
        'cpu1', 'Add', 124, 5, 6, 7)
    data_server = self.mock_debugger_data_server
    data_server.numerics_alert_report.return_value = [cpu0_row, cpu1_row]

    response = self.server.get('/data/plugin/debugger/numerics_alert_report')
    self.assertEqual(200, response.status_code)

    payload = self._DeserializeResponse(response.get_data())
    self.assertEqual(2, len(payload))

    # Each positional row value is expected back under its field name.
    expected_cpu0 = dict(
        device_name='cpu0',
        tensor_name='MatMul',
        first_timestamp=123,
        nan_event_count=2,
        neg_inf_event_count=3,
        pos_inf_event_count=4,
    )
    self.assertDictEqual(expected_cpu0, payload[0])

    expected_cpu1 = dict(
        device_name='cpu1',
        tensor_name='Add',
        first_timestamp=124,
        nan_event_count=5,
        neg_inf_event_count=6,
        pos_inf_event_count=7,
    )
    self.assertDictEqual(expected_cpu1, payload[1])
Пример #2
0
 def testMultipleEventsFromDifferentDevicesAndSameTensorName(self):
     """Registers alerts for one tensor name coming from two devices.

     Two of the three alerts share the task:0 device. The expected report
     has one row per device: the task:0 row first, with first timestamp
     1234 and event counts 1/2/2, then the task:1 row.
     """
     task0_gpu = "/job:worker/replica:0/task:0/gpu:0"
     task1_gpu = "/job:worker/replica:0/task:1/gpu:0"
     tensor = "xent/Log:0"

     alerts = [
         numerics_alert.NumericsAlert(task1_gpu, tensor, 1434, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_gpu, tensor, 1234, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_gpu, tensor, 1634, 1, 1, 1),
     ]
     registry = numerics_alert.NumericsAlertRegistry()
     for alert in alerts:
         registry.register(alert)

     expected_report = [
         numerics_alert.NumericsAlertReportRow(
             task0_gpu, tensor, 1234, 1, 2, 2),
         numerics_alert.NumericsAlertReportRow(
             task1_gpu, tensor, 1434, 0, 1, 1),
     ]
     self.assertEqual(expected_report, registry.report())
Пример #3
0
 def testLoadFromJson(self):
     """Builds a registry from an initialization list and checks its report.

     Each initialization entry is [device_name, tensor_name, histories].
     The expected report rows take their timestamp and nan count from the
     "nan" history of the matching entry and report zero inf events.
     """
     device = "/job:localhost/replica:0/task:0/cpu:0"

     def make_entry(tensor_name, nan_history):
         # Fresh lists per entry so that no list object is shared.
         # NOTE(review): the triples look like
         # [event_count, first_timestamp, last_timestamp] -- confirm
         # against numerics_alert before relying on this.
         return [
             device,
             tensor_name,
             {
                 "pos_inf": [0, -1, -1],
                 "nan": nan_history,
                 "neg_inf": [0, -1, -1],
             },
         ]

     entries = [
         make_entry("MatMul:0", [1624, 1496818651573005, 1496818690371163]),
         make_entry(
             "weight/Adagrad:0",
             [1621, 1496818651607234, 1496818690370891],
         ),
     ]
     registry = numerics_alert.NumericsAlertRegistry(
         initialization_list=entries)

     expected_report = [
         numerics_alert.NumericsAlertReportRow(
             device, "MatMul:0", 1496818651573005, 1624, 0, 0),
         numerics_alert.NumericsAlertReportRow(
             device, "weight/Adagrad:0", 1496818651607234, 1621, 0, 0),
     ]
     self.assertEqual(expected_report, registry.report())
Пример #4
0
 def testFilterReport(self):
     """Checks report() filtering by device name and by tensor name.

     Three alerts are registered. A device-name regex matching task:1 is
     expected to leave only the task:1 row, and a tensor-name regex
     matching "Mean" is expected to leave only the xent/Mean:0 row.
     """
     task0_gpu = "/job:worker/replica:0/task:0/gpu:0"
     task1_gpu = "/job:worker/replica:0/task:1/gpu:0"
     log_tensor = "xent/Log:0"
     mean_tensor = "xent/Mean:0"

     alerts = (
         numerics_alert.NumericsAlert(task1_gpu, log_tensor, 1434, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_gpu, log_tensor, 1234, 0, 1, 1),
         numerics_alert.NumericsAlert(task0_gpu, mean_tensor, 1634, 1, 1, 1),
     )
     registry = numerics_alert.NumericsAlertRegistry()
     for alert in alerts:
         registry.register(alert)

     # Filter on the device name only.
     expected_by_device = [
         numerics_alert.NumericsAlertReportRow(
             task1_gpu, log_tensor, 1434, 0, 1, 1),
     ]
     self.assertEqual(
         expected_by_device,
         registry.report(device_name_filter=r".*\/task:1\/.*"),
     )

     # Filter on the tensor name only.
     expected_by_tensor = [
         numerics_alert.NumericsAlertReportRow(
             task0_gpu, mean_tensor, 1634, 1, 1, 1),
     ]
     self.assertEqual(
         expected_by_tensor,
         registry.report(tensor_name_filter=r".*Mean.*"),
     )
Пример #5
0
    def testNumericsAlertReportResponse(self):
        """Verifies that the alert report route returns the mocked rows.

        The mocked data server yields two report rows. The response must
        have status 200 and deserialize to two dicts whose keys are the
        report field names and whose values match the rows.
        """
        route = "/data/plugin/debugger/numerics_alert_report"
        mocked_report = self.mock_debugger_data_server.numerics_alert_report
        mocked_report.return_value = [
            numerics_alert.NumericsAlertReportRow(
                "cpu0", "MatMul", 123, 2, 3, 4),
            numerics_alert.NumericsAlertReportRow(
                "cpu1", "Add", 124, 5, 6, 7),
        ]

        response = self.server.get(route)
        self.assertEqual(200, response.status_code)

        retrieved_alerts = self._DeserializeResponse(response.get_data())
        self.assertEqual(2, len(retrieved_alerts))

        # Expected payload entries, in the order the rows were mocked.
        expected_entries = [
            {
                "device_name": "cpu0",
                "tensor_name": "MatMul",
                "first_timestamp": 123,
                "nan_event_count": 2,
                "neg_inf_event_count": 3,
                "pos_inf_event_count": 4,
            },
            {
                "device_name": "cpu1",
                "tensor_name": "Add",
                "first_timestamp": 124,
                "nan_event_count": 5,
                "neg_inf_event_count": 6,
                "pos_inf_event_count": 7,
            },
        ]
        for position, expected_entry in enumerate(expected_entries):
            self.assertDictEqual(expected_entry, retrieved_alerts[position])
Пример #6
0
 def testSingleAlert(self):
     """Registers one alert and checks the single resulting report row."""
     device = "/job:worker/replica:0/task:0/gpu:0"
     tensor = "xent/Log:0"

     registry = numerics_alert.NumericsAlertRegistry()
     registry.register(
         numerics_alert.NumericsAlert(device, tensor, 1234, 0, 10, 10))

     # The alert carries 10/10 for the inf values but the row expects 1/1.
     # NOTE(review): presumably the report counts alert events rather than
     # summing the per-alert values -- confirm against numerics_alert.
     expected_report = [
         numerics_alert.NumericsAlertReportRow(device, tensor, 1234, 0, 1, 1),
     ]
     self.assertEqual(expected_report, registry.report())
Пример #7
0
 def testMultipleEventsFromSameDeviceAndSameTensor(self):
     """Registers two alerts for the same device and tensor.

     The expected report has a single row that keeps the earlier
     timestamp (1234) and has event counts 1/2/2.
     """
     device = "/job:worker/replica:0/task:0/gpu:0"
     tensor = "xent/Log:0"

     earlier_alert = numerics_alert.NumericsAlert(
         device, tensor, 1234, 0, 10, 10)
     later_alert = numerics_alert.NumericsAlert(
         device, tensor, 1634, 5, 5, 5)

     registry = numerics_alert.NumericsAlertRegistry()
     for alert in (earlier_alert, later_alert):
         registry.register(alert)

     expected_report = [
         numerics_alert.NumericsAlertReportRow(device, tensor, 1234, 1, 2, 2),
     ]
     self.assertEqual(expected_report, registry.report())
Пример #8
0
    def testRegisterBeyondCapacityObeysCapacity(self):
        """Checks the report of a capacity-2 registry as alerts are added.

        After two alerts from distinct devices, an alert from a third
        device (task:2) is expected to leave the report unchanged. A
        further alert for the already-reported task:0 device is expected
        to raise that row's event counts to 1/2/2.
        """
        tensor = "xent/Log:0"
        task0_gpu = "/job:worker/replica:0/task:0/gpu:0"
        task1_gpu = "/job:worker/replica:0/task:1/gpu:0"
        task2_gpu = "/job:worker/replica:0/task:2/gpu:0"

        def expected_report(task0_counts):
            # Builds fresh rows on every call; only the task:0 counts vary
            # between the assertions below.
            nan_events, neg_inf_events, pos_inf_events = task0_counts
            return [
                numerics_alert.NumericsAlertReportRow(
                    task0_gpu,
                    tensor,
                    1234,
                    nan_events,
                    neg_inf_events,
                    pos_inf_events,
                ),
                numerics_alert.NumericsAlertReportRow(
                    task1_gpu, tensor, 1434, 0, 1, 1),
            ]

        first_task1 = numerics_alert.NumericsAlert(
            task1_gpu, tensor, 1434, 0, 1, 1)
        first_task0 = numerics_alert.NumericsAlert(
            task0_gpu, tensor, 1234, 0, 1, 1)
        new_device = numerics_alert.NumericsAlert(
            task2_gpu, tensor, 1634, 0, 1, 1)
        second_task0 = numerics_alert.NumericsAlert(
            task0_gpu, tensor, 1834, 1, 1, 1)

        registry = numerics_alert.NumericsAlertRegistry(capacity=2)
        registry.register(first_task1)
        registry.register(first_task0)
        self.assertEqual(expected_report((0, 1, 1)), registry.report())

        # A third device: the report is expected to stay as it was.
        registry.register(new_device)
        self.assertEqual(expected_report((0, 1, 1)), registry.report())

        # A device that already has a row: its counts are expected to grow.
        registry.register(second_task0)
        self.assertEqual(expected_report((1, 2, 2)), registry.report())