def test_function_large_ret_val(self):
    """Check that very large worker return values reach the parent intact.

    Four copies of ``echo_large`` are launched per start method, and the
    length of every returned value is compared against the requested size.
    """
    # Python's multiprocessing queues are backed by pipes. When a single
    # object is bigger than the pipe capacity, the writing process blocks
    # until the reading process starts draining the pipe. The worker
    # function is therefore asked for a huge output (the original note
    # estimates it at around ~10 MB).
    size = 200000
    worker_ranks = range(4)

    for start_method in start_methods():
        with self.subTest(start_method=start_method):
            pc = start_processes(
                name="echo",
                entrypoint=echo_large,
                args={rank: (size,) for rank in worker_ranks},
                envs={rank: {} for rank in worker_ranks},
                log_dir=self.log_dir(),
                start_method=start_method,
            )

            outcome = pc.wait(period=0.1)
            for rank in range(pc.nprocs):
                returned = outcome.return_values[rank]
                self.assertEqual(size, len(returned))
def test_function_signal(self):
    """
    run 2x copies of echo3, induce a segfault on first

    For every (start method, redirect mode) combination the test checks
    that exactly one failure is recorded, that no return values are
    reported, and that the failure carries the SIGSEGV exit code, signal
    name, the pid of local rank 0 and the per-rank ``error.json`` path.
    """
    SEGFAULT = True
    for start_method, redirs in product(start_methods(), redirects()):
        # Include both loop variables in the sub-test label so a failure
        # report identifies the exact combination that broke.
        with self.subTest(start_method=start_method, redirs=redirs):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo3,
                # Only local rank 0 receives the segfault flag.
                args={0: ("hello", SEGFAULT), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=log_dir,
                start_method=start_method,
                redirects=redirs,
            )

            results = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            self.assertEqual(1, len(results.failures))
            self.assertFalse(results.return_values)

            failure = results.failures[0]
            error_file = failure.error_file

            # The exit code is compared against the negated signal number.
            self.assertEqual(-signal.SIGSEGV, failure.exitcode)
            self.assertEqual("SIGSEGV", failure.signal_name())
            self.assertEqual(pc.pids()[0], failure.pid)
            self.assertEqual(os.path.join(log_dir, "0", "error.json"), error_file)
def test_function(self):
    """Run two copies of ``echo1`` for every start method / redirect combo.

    Verifies the per-rank return values, and that each std stream is
    either captured in a file containing the expected line (when it is
    redirected) or reported as falsy (when it is not).
    """
    for start_method, redirs in product(start_methods(), redirects()):
        with self.subTest(start_method=start_method, redirs=redirs):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo1,
                args={0: ("hello",), 1: ("hello",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=log_dir,
                start_method=start_method,
                redirects=redirs,
            )

            results = pc.wait(period=0.1)
            nprocs = pc.nprocs

            self.assert_pids_noexist(pc.pids())

            expected_returns = {rank: f"hello_{rank}" for rank in range(nprocs)}
            self.assertEqual(expected_returns, results.return_values)

            # A stream counts as redirected when its flag is set in redirs.
            out_redirected = (redirs & Std.OUT) == Std.OUT
            err_redirected = (redirs & Std.ERR) == Std.ERR

            for rank in range(nprocs):
                # Non-redirected streams are checked first, as before.
                if not out_redirected:
                    self.assertFalse(results.stdouts[rank])
                if not err_redirected:
                    self.assertFalse(results.stderrs[rank])
                if out_redirected:
                    self.assert_in_file(
                        [f"hello stdout from {rank}"], results.stdouts[rank]
                    )
                if err_redirected:
                    self.assert_in_file(
                        [f"hello stderr from {rank}"], results.stderrs[rank]
                    )
def test_function_exit(self):
    """
    run 2x copies of echo1, fail (exit) the first

    functions that exit from python do not generate an error file
    (even if they are decorated with @record)
    """
    FAIL = 138
    for start_method in start_methods():
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo1,
                # Only local rank 0 is handed the failing exit code.
                args={0: ("hello", FAIL), 1: ("hello",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=log_dir,
                start_method=start_method,
                redirects={0: Std.ERR},
            )

            results = pc.wait(period=0.1)

            # Overall outcome: one failure and no return values.
            self.assert_pids_noexist(pc.pids())
            self.assertTrue(results.is_failed())
            self.assertEqual(1, len(results.failures))
            self.assertFalse(results.return_values)

            # Details recorded for the failed process.
            failure = results.failures[0]
            error_file = failure.error_file
            self.assertEqual(FAIL, failure.exitcode)
            self.assertEqual("<N/A>", failure.signal_name())
            self.assertEqual(pc.pids()[0], failure.pid)
            self.assertEqual("<N/A>", error_file)
            self.assertEqual(
                f"Process failed with exitcode {FAIL}", failure.message
            )
            self.assertLessEqual(failure.timestamp, int(time.time()))

            # Only stderr of local rank 0 was redirected to a file.
            self.assert_in_file([f"exit {FAIL} from 0"], results.stderrs[0])
            self.assertFalse(results.stdouts[0])
            self.assertFalse(results.stderrs[1])
            self.assertFalse(results.stdouts[1])

            # Both tail helpers must report that they have stopped.
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
def test_void_function(self):
    """Run two copies of ``echo0`` and expect ``None`` as each return value."""
    expected_returns = {0: None, 1: None}

    for start_method in start_methods():
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo0,
                args={0: ("hello",), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=log_dir,
                start_method=start_method,
            )

            outcome = pc.wait(period=0.1)
            self.assertEqual(expected_returns, outcome.return_values)
def test_function_raise(self):
    """
    run 2x copies of echo2, raise an exception on the first

    Checks that a single failure with exit code 1 is recorded, that the
    per-rank ``error.json`` path is reported, and that the timestamp in
    the error file data matches the one on the failure object.
    """
    RAISE = True
    for start_method in start_methods():
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo2,
                # Only local rank 0 is told to raise.
                args={0: ("hello", RAISE), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=log_dir,
                start_method=start_method,
            )

            results = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            self.assertEqual(1, len(results.failures))
            self.assertFalse(results.return_values)

            failure = results.failures[0]
            error_file = failure.error_file
            error_file_data = failure.error_file_data

            self.assertEqual(1, failure.exitcode)
            self.assertEqual("<N/A>", failure.signal_name())
            self.assertEqual(pc.pids()[0], failure.pid)

            expected_error_file = os.path.join(log_dir, "0", "error.json")
            self.assertEqual(expected_error_file, error_file)

            # Timestamp stored in the error file vs. the failure object.
            recorded_ts = error_file_data["message"]["extraInfo"]["timestamp"]
            self.assertEqual(int(recorded_ts), int(failure.timestamp))

            # Both tail helpers must report that they have stopped.
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
def test_function_redirect_and_tee(self):
    """Run two copies of ``echo1`` with a mix of redirect and tee settings.

    Local rank 0 redirects stderr and tees stdout; local rank 1 redirects
    nothing and tees stderr. The test then checks which log files contain
    the expected lines and that stdout of rank 1 is reported as falsy.
    """
    for start_method in start_methods():
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="trainer",
                entrypoint=echo1,
                args={0: ("hello",), 1: ("world",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=log_dir,
                # Use the start method under test. The previous hard-coded
                # "fork" ignored the loop variable, so every sub-test ran
                # the same scenario under a misleading label.
                start_method=start_method,
                redirects={0: Std.ERR, 1: Std.NONE},
                tee={0: Std.OUT, 1: Std.ERR},
            )

            result = pc.wait()

            self.assertFalse(result.is_failed())
            self.assert_in_file(["hello stdout from 0"], pc.stdouts[0])
            self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
            self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
            # Rank 1 neither redirects nor tees stdout.
            self.assertFalse(pc.stdouts[1])

            # Both tail helpers must report that they have stopped.
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())