示例#1
0
def raise_child_failure_error_fn(name, child_error_file=""):
    """Raise a ``ChildFailedError`` carrying one failure for local rank 0.

    When ``child_error_file`` is non-empty, ``SentinelError("foobar")`` is
    first written to that path through ``_write_error``. The same value
    (path or empty string) is then recorded as the ``error_file`` of the
    ``ProcessFailure`` attached to the raised error.

    Args:
        name: Passed through as the first argument of ``ChildFailedError``.
        child_error_file: Path of the error file to write; an empty string
            skips the write.

    Raises:
        ChildFailedError: Always.
    """
    if child_error_file:
        # The error file is written before the failure record is built.
        _write_error(SentinelError("foobar"), child_error_file)

    failures = {
        0: ProcessFailure(
            local_rank=0,
            pid=997,
            exitcode=1,
            error_file=child_error_file,
        ),
    }
    raise ChildFailedError(name, failures)
示例#2
0
    def test_dump_error_file_overwrite_existing(self):
        """Check that ``dump_error_file`` overwrites an existing destination.

        Two different error files are written under ``self.test_dir``. The
        destination path is exposed through ``TORCHELASTIC_ERROR_FILE`` and
        ``ErrorHandler.dump_error_file`` is called with the source path.
        Afterwards both files must have identical contents.
        """
        dst_error_file = os.path.join(self.test_dir, "dst_error.json")
        src_error_file = os.path.join(self.test_dir, "src_error.json")
        _write_error(RuntimeError("foo"), dst_error_file)
        _write_error(RuntimeError("bar"), src_error_file)

        with patch.dict(os.environ, {"TORCHELASTIC_ERROR_FILE": dst_error_file}):
            eh = ErrorHandler()
            eh.dump_error_file(src_error_file)
            # shallow=False forces a byte-by-byte comparison. The default
            # shallow mode treats files whose type, size and mtime match as
            # equal without reading them. The two fixtures are written
            # back-to-back with same-length messages, so a destination that
            # was never overwritten could otherwise be reported as equal.
            self.assertTrue(
                filecmp.cmp(src_error_file, dst_error_file, shallow=False)
            )
示例#3
0
    def test_copy_error_file(self):
        """Exercise ``ErrorHandler.copy_error_file`` under two environments.

        First, with ``TORCHELASTIC_ERROR_FILE`` pointing at
        ``self.test_error_file``, the copy must produce a file that compares
        equal to the source. Second, the copy is invoked again under an
        empty ``patch.dict`` overlay; that call has no assertion and only
        needs to complete without raising.
        """
        source_path = os.path.join(self.test_dir, "src_error.json")
        _write_error(RuntimeError("foobar"), source_path)

        env_overlay = {"TORCHELASTIC_ERROR_FILE": self.test_error_file}
        with patch.dict(os.environ, env_overlay):
            handler = ErrorHandler()
            handler.copy_error_file(source_path)
            files_match = filecmp.cmp(source_path, self.test_error_file)
            self.assertTrue(files_match)

        # NOTE(review): patch.dict with an empty mapping adds nothing and does
        # not clear os.environ, so this runs with whatever environment the
        # test process inherited. Presumably the intent is "no error file
        # configured" -- confirm whether clear=True is wanted here.
        with patch.dict(os.environ, {}):
            handler = ErrorHandler()
            handler.copy_error_file(source_path)
示例#4
0
    def test_get_failures(self, log_mock):
        """Check the timestamps recorded on three ``ProcessFailure`` objects.

        ``time.time`` is patched to return 3, 2 and 1 on successive calls.
        Two error files are written, then three failures are built in
        local-rank order (the third points at a non-existent error file).
        Their ``timestamp`` attributes must equal 3, 2 and 1 respectively.

        Args:
            log_mock: Mock supplied by the caller/decorator; not used in the
                visible body.
        """
        # Call order matters: the patched clock hands out its values in
        # sequence, so the writes and constructions below keep a fixed order.
        with mock.patch("time.time", side_effect=[3, 2, 1]):
            first_error_file = os.path.join(self.test_dir, "error0.json")
            second_error_file = os.path.join(self.test_dir, "error1.json")
            _write_error(RuntimeError("error 0"), first_error_file)
            _write_error(RuntimeError("error 1"), second_error_file)

            # One (pid, exitcode, error_file) triple per local rank.
            failure_specs = (
                (997, 1, first_error_file),
                (998, 3, second_error_file),
                (999, 15, "no_exist.json"),
            )
            failures = []
            for rank, (pid, exitcode, error_file) in enumerate(failure_specs):
                failures.append(
                    ProcessFailure(
                        local_rank=rank,
                        pid=pid,
                        exitcode=exitcode,
                        error_file=error_file,
                    )
                )

            for expected_timestamp, failure in zip((3, 2, 1), failures):
                self.assertEqual(expected_timestamp, failure.timestamp)
示例#5
0
 def failure_with_error_file(self, exception):
     """Write ``exception`` to the test error file and return a failure for it.

     The exception is written to ``self.test_error_file`` through
     ``_write_error``. A ``ProcessFailure`` for local rank 0 (pid 997,
     exit code 1) that references the same file is then returned.

     Args:
         exception: The exception object handed to ``_write_error``.

     Returns:
         The newly constructed ``ProcessFailure``.
     """
     _write_error(exception, self.test_error_file)
     failure_fields = {
         "local_rank": 0,
         "pid": 997,
         "exitcode": 1,
         "error_file": self.test_error_file,
     }
     return ProcessFailure(**failure_fields)