def test_function_raise(self):
    """Run two copies of ``echo2``; rank 0 raises, rank 1 succeeds."""
    raise_on_rank0 = True
    for start_method in self._start_methods:
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo2,
                args={0: ("hello", raise_on_rank0), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=log_dir,
                start_method=start_method,
            )
            results = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            # Exactly one failure (rank 0) and no return values at all.
            self.assertEqual(1, len(results.failures))
            self.assertFalse(results.return_values)

            failure = results.failures[0]
            # A python-level exception exits with code 1 and no signal.
            self.assertEqual(1, failure.exitcode)
            self.assertEqual("<N/A>", failure.signal_name())
            self.assertEqual(pc.pids()[0], failure.pid)
            self.assertEqual(
                os.path.join(log_dir, "0", "error.json"), failure.error_file
            )
            self.assertEqual(
                int(failure.error_file_data["message"]["extraInfo"]["timestamp"]),
                int(failure.timestamp),
            )
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
def test_binary_raises(self):
    """A binary entrypoint that raises surfaces a generic exit-1 failure."""
    proc_ctx = start_processes(
        name="echo",
        entrypoint=bin("echo2.py"),
        args={0: ("--raises", "true", "foo"), 1: ("bar",)},
        envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
        log_dir=self.log_dir(),
    )
    results = proc_ctx.wait(period=0.1)

    self.assert_pids_noexist(proc_ctx.pids())
    self.assertTrue(results.is_failed())
    self.assertEqual(1, len(results.failures))

    failure = results.failures[0]
    # Binaries cannot write a structured error file, hence the placeholders.
    self.assertEqual(1, failure.exitcode)
    self.assertEqual("<NONE>", failure.error_file_data["message"])
    self.assertEqual("<N/A>", failure.signal_name())
def test_function(self):
    """Run ``echo1`` under every (start_method, redirect) combination."""
    for start_method, redirs in product(self._start_methods, redirects_all()):
        with self.subTest(start_method=start_method, redirs=redirs):
            pc = start_processes(
                name="echo",
                entrypoint=echo1,
                args={0: ("hello",), 1: ("hello",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=self.log_dir(),
                start_method=start_method,
                redirects=redirs,
            )
            results = pc.wait(period=0.1)
            nprocs = pc.nprocs

            self.assert_pids_noexist(pc.pids())
            self.assertEqual(
                {i: f"hello_{i}" for i in range(nprocs)}, results.return_values
            )

            # A stream is captured iff its Std flag is part of the redirect.
            for rank in range(nprocs):
                if redirs & Std.OUT == Std.OUT:
                    self.assert_in_file(
                        [f"hello stdout from {rank}"], results.stdouts[rank]
                    )
                else:
                    self.assertFalse(results.stdouts[rank])
                if redirs & Std.ERR == Std.ERR:
                    self.assert_in_file(
                        [f"hello stderr from {rank}"], results.stderrs[rank]
                    )
                else:
                    self.assertFalse(results.stderrs[rank])
def test_void_function(self):
    """A function returning nothing yields an explicit ``None`` per rank."""
    for start_method in self._start_methods:
        with self.subTest(start_method=start_method):
            proc_ctx = start_processes(
                name="echo",
                entrypoint=echo0,
                args={0: ("hello",), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=self.log_dir(),
                start_method=start_method,
            )
            results = proc_ctx.wait(period=0.1)
            self.assertEqual({0: None, 1: None}, results.return_values)
def test_binary(self):
    """Launch ``echo1.py`` as a binary under each OSS redirect setting."""
    for redirs in redirects_oss_test():
        with self.subTest(redirs=redirs):
            pc = start_processes(
                name="echo",
                entrypoint=bin("echo1.py"),
                args={0: ("hello",), 1: ("hello",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=self.log_dir(),
                redirects=redirs,
            )
            results = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            # currently binaries return {rank: None}
            self.assertEqual(2, len(results.return_values))
            self.assertFalse(results.is_failed())

            # A stream is captured iff its Std flag is part of the redirect.
            for rank in range(pc.nprocs):
                if redirs & Std.OUT == Std.OUT:
                    self.assert_in_file(
                        [f"hello stdout from {rank}"], results.stdouts[rank]
                    )
                else:
                    self.assertFalse(results.stdouts[rank])
                if redirs & Std.ERR == Std.ERR:
                    self.assert_in_file(
                        [f"hello stderr from {rank}"], results.stderrs[rank]
                    )
                else:
                    self.assertFalse(results.stderrs[rank])
def test_binary_redirect_and_tee(self):
    """Per-rank redirect/tee settings apply independently for binaries."""
    proc_ctx = start_processes(
        name="trainer",
        entrypoint=bin("echo1.py"),
        args={0: ("hello",), 1: ("world",)},
        envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
        log_dir=self.log_dir(),
        start_method="fork",
        redirects={0: Std.ERR, 1: Std.NONE},
        tee={0: Std.OUT, 1: Std.ERR},
    )
    run_result = proc_ctx.wait()
    self.assertFalse(run_result.is_failed())

    # rank 0: stdout teed, stderr redirected; rank 1: only stderr teed.
    self.assert_in_file(["hello stdout from 0"], proc_ctx.stdouts[0])
    self.assert_in_file(["hello stderr from 0"], proc_ctx.stderrs[0])
    self.assert_in_file(["world stderr from 1"], proc_ctx.stderrs[1])
    self.assertFalse(proc_ctx.stdouts[1])
    self.assertTrue(proc_ctx._stderr_tail.stopped())
    self.assertTrue(proc_ctx._stdout_tail.stopped())
def test_function_redirect_and_tee(self):
    """Per-rank redirect/tee settings apply independently for function entrypoints.

    Fix: the loop's ``start_method`` was previously ignored in favor of a
    hard-coded ``"spawn"``, so the ``subTest`` over ``self._start_methods``
    exercised only one start method. The loop variable is now actually used.
    """
    for start_method in self._start_methods:
        with self.subTest(start_method=start_method):
            log_dir = self.log_dir()
            pc = start_processes(
                name="trainer",
                entrypoint=echo1,
                args={0: ("hello",), 1: ("world",)},
                envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
                log_dir=log_dir,
                start_method=start_method,  # was hard-coded to "spawn"
                redirects={0: Std.ERR, 1: Std.NONE},
                tee={0: Std.OUT, 1: Std.ERR},
            )
            result = pc.wait()
            self.assertFalse(result.is_failed())

            # rank 0: stdout teed, stderr redirected; rank 1: only stderr teed.
            self.assert_in_file(["hello stdout from 0"], pc.stdouts[0])
            self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
            self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
            self.assertFalse(pc.stdouts[1])
            self.assertTrue(pc._stderr_tail.stopped())
            self.assertTrue(pc._stdout_tail.stopped())
def test_function_large_ret_val(self):
    # python multiprocessing.queue is pipe-backed (PipedQueues): a single
    # object larger than the pipe buffer blocks the writer until the reader
    # starts draining. Return a huge (~10 MB) payload to exercise that path.
    size = 200000
    nranks = 4
    for start_method in self._start_methods:
        with self.subTest(start_method=start_method):
            pc = start_processes(
                name="echo",
                entrypoint=echo_large,
                args={rank: (size,) for rank in range(nranks)},
                envs={rank: {} for rank in range(nranks)},
                log_dir=self.log_dir(),
                start_method=start_method,
            )
            results = pc.wait(period=0.1)
            for rank in range(pc.nprocs):
                self.assertEqual(size, len(results.return_values[rank]))
def test_function_signal(self):
    """Run two copies of ``echo3``; induce a segfault on rank 0.

    Fix: ``redirs`` is now recorded in ``subTest`` alongside ``start_method``
    so a failing (start_method, redirs) combination is identifiable in the
    test report; previously only ``start_method`` was recorded even though
    the loop iterates the full product.
    """
    SEGFAULT = True
    for start_method, redirs in product(self._start_methods, redirects()):
        with self.subTest(start_method=start_method, redirs=redirs):
            log_dir = self.log_dir()
            pc = start_processes(
                name="echo",
                entrypoint=echo3,
                args={0: ("hello", SEGFAULT), 1: ("world",)},
                envs={0: {}, 1: {}},
                log_dir=log_dir,
                start_method=start_method,
                redirects=redirs,
            )
            results = pc.wait(period=0.1)

            self.assert_pids_noexist(pc.pids())
            self.assertEqual(1, len(results.failures))
            self.assertFalse(results.return_values)

            failure = results.failures[0]
            error_file = failure.error_file
            # A signal death is reported as the negated signal number.
            self.assertEqual(-signal.SIGSEGV, failure.exitcode)
            self.assertEqual("SIGSEGV", failure.signal_name())
            self.assertEqual(pc.pids()[0], failure.pid)
            self.assertEqual(os.path.join(log_dir, "0", "error.json"), error_file)
def test_binary_signal(self):
    """A binary killed by SIGSEGV reports the signal (unless ASAN rewrites it)."""
    proc_ctx = start_processes(
        name="echo",
        entrypoint=bin("echo3.py"),
        args={0: ("--segfault", "true", "foo"), 1: ("bar",)},
        envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
        log_dir=self.log_dir(),
    )
    results = proc_ctx.wait(period=0.1)

    self.assert_pids_noexist(proc_ctx.pids())
    self.assertTrue(results.is_failed())
    self.assertEqual(1, len(results.failures))

    failure = results.failures[0]
    # NOTE(review): the exitcode is compared against the *positive* signal
    # number; this holds both under ASAN (exit 1) and without it (signal
    # deaths are reported negative) -- presumably intentional, confirm.
    self.assertNotEqual(signal.SIGSEGV, failure.exitcode)
    if TEST_WITH_ASAN:
        # ASAN exit code is 1.
        self.assertEqual("<N/A>", failure.signal_name())
    else:
        self.assertEqual("SIGSEGV", failure.signal_name())
    self.assertEqual("<NONE>", failure.error_file_data["message"])
def test_binary_exit(self):
    """A nonzero binary exit code is reported as a failure with that code."""
    FAIL = 138
    pc = start_processes(
        name="echo",
        entrypoint=bin("echo1.py"),
        args={0: ("--exitcode", FAIL, "foo"), 1: ("--exitcode", 0, "bar")},
        envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
        log_dir=self.log_dir(),
        redirects={0: Std.ALL},
    )
    results = pc.wait(period=0.1)

    self.assertTrue(results.is_failed())
    self.assertEqual(1, len(results.failures))

    failure = results.failures[0]
    self.assertEqual(FAIL, failure.exitcode)  # 138
    self.assertEqual("<N/A>", failure.signal_name())
    self.assertEqual("<NONE>", failure.error_file_data["message"])

    # Only rank 0 is redirected; rank 1's streams stay uncaptured.
    self.assert_in_file([f"exit {FAIL} from 0"], results.stderrs[0])
    self.assert_in_file([], results.stdouts[0])
    self.assertFalse(results.stderrs[1])
    self.assertFalse(results.stdouts[1])
    self.assertTrue(pc._stderr_tail.stopped())
    self.assertTrue(pc._stdout_tail.stopped())
def launch(self, args):
    """Resolve the core/instance layout from ``args`` and launch the instances.

    Mutates ``args`` in place (``ninstances``, ``ncores_per_instance``,
    ``throughput_mode``) while resolving the layout, configures the
    threading/allocator environment, builds one command line per instance
    (optionally prefixed with ``numactl`` core/memory binding) and hands
    them all to ``start_processes``, blocking until they finish.

    Raises:
        RuntimeError: on inconsistent core/instance settings.
    """
    cores = []
    set_kmp_affinity = True
    if args.core_list:  # user specify what cores will be used by params
        cores = [int(x) for x in args.core_list.split(",")]
        if args.ncores_per_instance == -1:
            raise RuntimeError(
                "please specify the \"--ncores_per_instance\" if you have pass the --core_list params"
            )
        elif args.ninstances > 1 and args.ncores_per_instance * args.ninstances < len(
                cores):
            logger.warning(
                f"only first {args.ncores_per_instance * args.ninstances} cores will be used, \
but you specify {len(cores)} cores in core_list")
        else:
            args.ninstances = len(cores) // args.ncores_per_instance
    else:
        # No explicit core list: discover cores from the machine topology.
        if args.use_logical_core:
            if args.node_id != -1:
                cores = self.cpuinfo.get_node_logical_cores(args.node_id)
            else:
                cores = self.cpuinfo.get_all_logical_cores()
                # When using all cores on all nodes, including logical cores,
                # setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set.
                set_kmp_affinity = False
        else:
            if args.node_id != -1:
                cores = self.cpuinfo.get_node_physical_cores(args.node_id)
            else:
                cores = self.cpuinfo.get_all_physical_cores()
        # Resolve ninstances / ncores_per_instance; -1 means "not set by user".
        if not args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
            # Default: a single instance using every discovered core.
            args.ninstances = 1
            args.ncores_per_instance = len(cores)
        elif args.multi_instance and args.ninstances == -1 and args.ncores_per_instance == -1:
            args.throughput_mode = True
        elif args.ncores_per_instance == -1 and args.ninstances != -1:
            if args.ninstances > len(cores):
                raise RuntimeError(
                    f"there are {len(cores)} total cores but you specify {args.ninstances} ninstances; \
please make sure ninstances <= total_cores)")
            else:
                args.ncores_per_instance = len(cores) // args.ninstances
        elif args.ncores_per_instance != -1 and args.ninstances == -1:
            if not args.skip_cross_node_cores:
                args.ninstances = len(cores) // args.ncores_per_instance
            else:
                ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
                num_leftover_cores = ncore_per_node % args.ncores_per_instance
                if args.ncores_per_instance > ncore_per_node:
                    # too many ncores_per_instance to skip cross-node cores
                    logger.warning(
                        "there are {} core(s) per socket, but you specify {} ncores_per_instance and \
skip_cross_node_cores. Please make sure --ncores_per_instance < core(s) per \
socket".format(ncore_per_node, args.ncores_per_instance))
                    exit(-1)
                elif num_leftover_cores == 0:
                    # aren't any cross-node cores
                    logger.info(
                        '--skip_cross_node_cores is set, but there are no cross-node cores.'
                    )
                    args.ninstances = len(
                        cores) // args.ncores_per_instance
                else:
                    # skip cross-node cores
                    # NOTE(review): args.ninstances == -1 here per the
                    # enclosing elif, so this warning looks unreachable --
                    # confirm intent.
                    if args.ninstances != -1:
                        logger.warning(
                            '--skip_cross_node_cores is exclusive to --ninstances. --ninstances \
won\'t take effect even if it is set explicitly.')
                    # Drop each node's trailing leftover cores so every
                    # instance's cores stay within a single node.
                    i = 1
                    leftover_cores = set()
                    while ncore_per_node * i <= len(cores):
                        leftover_cores.update(
                            cores[ncore_per_node * i -
                                  num_leftover_cores:ncore_per_node * i])
                        i += 1
                    cores = list(set(cores) - leftover_cores)
                    assert len(cores) % args.ncores_per_instance == 0
                    args.ninstances = len(
                        cores) // args.ncores_per_instance
        else:
            # Both values were given explicitly: just sanity-check them.
            if args.ninstances * args.ncores_per_instance > len(cores):
                raise RuntimeError(
                    "Please make sure ninstances * ncores_per_instance <= total_cores"
                )
        if args.latency_mode:
            # Latency mode overrides the layout: 4 physical cores/instance.
            logger.warning(
                "--latency_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
--use_logical_core. They won't take effect even they are set explicitly.")
            args.ncores_per_instance = 4
            cores = self.cpuinfo.get_all_physical_cores()
            args.ninstances = len(cores) // args.ncores_per_instance
        if args.throughput_mode:
            # Throughput mode overrides the layout: one instance per node.
            logger.warning(
                "--throughput_mode is exclusive to --ninstances, --ncores_per_instance, --node_id and \
--use_logical_core. They won't take effect even they are set explicitly.")
            args.ninstances = self.cpuinfo.node_nums
            cores = self.cpuinfo.get_all_physical_cores()
            args.ncores_per_instance = len(cores) // args.ninstances
    if args.ninstances > 1 and args.rank != -1:
        logger.info(
            f"assigning {args.ncores_per_instance} cores for instance {args.rank}"
        )
    # Configure OMP/KMP and allocator (tcmalloc/jemalloc) environment vars.
    self.set_multi_thread_and_allocator(args.ncores_per_instance,
                                        args.disable_iomp,
                                        set_kmp_affinity,
                                        args.enable_tcmalloc,
                                        args.enable_jemalloc,
                                        args.use_default_allocator)
    entrypoint = ""
    launch_args = {}
    launch_envs: Dict[int, Dict] = {}
    launch_tee = {}
    # Build one command line per instance.
    for i in range(args.ninstances):
        cmd = []
        cur_process_cores = ""
        if not args.disable_numactl:
            cmd = ["numactl"]
            cores = sorted(cores)
            if args.rank == -1:  # sequentially assign ncores_per_instance to ninstances
                core_list = cores[i * args.ncores_per_instance:(i + 1) *
                                  args.ncores_per_instance]
            else:  # assign ncores_per_instance from rank
                core_list = cores[args.rank * args.ncores_per_instance:(args.rank + 1) *
                                  args.ncores_per_instance]
            # Collapse consecutive core ids into "start-end" ranges for -C.
            core_ranges: List[Dict] = []
            for core in core_list:
                if len(core_ranges) == 0:
                    range_elem = {"start": core, "end": core}
                    core_ranges.append(range_elem)
                else:
                    if core - core_ranges[-1]["end"] == 1:
                        core_ranges[-1]["end"] = core
                    else:
                        range_elem = {"start": core, "end": core}
                        core_ranges.append(range_elem)
            for r in core_ranges:
                cur_process_cores = f"{cur_process_cores}{r['start']}-{r['end']},"
            cur_process_cores = cur_process_cores[:-1]  # drop trailing comma
            numa_params = f"-C {cur_process_cores} "
            # Bind memory allocation to the NUMA node(s) owning these cores.
            numa_ids = ",".join([
                str(numa_id)
                for numa_id in self.cpuinfo.numa_aware_check(core_list)
            ])
            numa_params += f"-m {numa_ids}"
            cmd.extend(numa_params.split())
        with_python = not args.no_python
        if with_python:
            cmd.append(sys.executable)
            cmd.append("-u")  # unbuffered stdout/stderr
        if args.module:
            cmd.append("-m")
        cmd.append(args.program)
        cmd.extend(args.program_args)
        cmd_s = " ".join(cmd)
        logger.info(cmd_s)
        # start_processes takes the executable separately from its args.
        if entrypoint == "":
            entrypoint = cmd[0]
        del cmd[0]
        launch_args[i] = tuple(cmd)
        launch_envs[i] = {}
        launch_tee[i] = Std.ALL
        if args.rank != -1:  # launches single instance, rank, only
            break
    ctx = start_processes(name=args.log_file_prefix,
                          entrypoint=entrypoint,
                          args=launch_args,
                          envs=launch_envs,
                          log_dir=args.log_path,
                          tee=launch_tee)
    ctx.wait()
def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
    """Start one process per worker in ``worker_group``.

    Builds the torchelastic environment (ranks, world sizes, master
    addr/port, restart counters) for each local rank, substitutes the
    local-rank macro into the entrypoint args, (re)creates the per-attempt
    log dir and launches everything through ``start_processes``.

    Returns:
        Mapping of local rank to the pid of the started process.
    """
    spec = worker_group.spec
    store = worker_group.store
    assert store is not None
    master_addr, master_port = super()._get_master_addr_port(store)
    # attempt number == restarts consumed so far
    restart_count = spec.max_restarts - self._remaining_restarts
    use_agent_store = spec.rdzv_handler.get_backend() == "static"
    args: Dict[int, Tuple] = {}
    envs: Dict[int, Dict[str, str]] = {}
    for worker in worker_group.workers:
        local_rank = worker.local_rank
        worker_env = {
            "LOCAL_RANK": str(local_rank),
            "RANK": str(worker.global_rank),
            "GROUP_RANK": str(worker_group.group_rank),
            "ROLE_RANK": str(worker.role_rank),
            "ROLE_NAME": spec.role,
            "LOCAL_WORLD_SIZE": str(spec.local_world_size),
            "WORLD_SIZE": str(worker.world_size),
            "GROUP_WORLD_SIZE": str(worker_group.group_world_size),
            "ROLE_WORLD_SIZE": str(worker.role_world_size),
            "MASTER_ADDR": master_addr,
            "MASTER_PORT": str(master_port),
            "TORCHELASTIC_RESTART_COUNT": str(restart_count),
            "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts),
            "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(),
            "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store),
            # on by default unless the caller already set it in the env
            "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING",
                                                   str(1)),
        }
        if "OMP_NUM_THREADS" in os.environ:
            worker_env["OMP_NUM_THREADS"] = os.environ["OMP_NUM_THREADS"]
        envs[local_rank] = worker_env
        worker_args = list(spec.args)
        # replace rank macros in the entrypoint args with this local rank
        worker_args = macros.substitute(worker_args, str(local_rank))
        args[local_rank] = tuple(worker_args)
    # scaling events do not count towards restarts (gets same attempt #)
    # remove existing log dir if this restart is due to a scaling event
    attempt_log_dir = os.path.join(self._log_dir, f"attempt_{restart_count}")
    shutil.rmtree(attempt_log_dir, ignore_errors=True)
    os.makedirs(attempt_log_dir)
    assert spec.entrypoint is not None
    self._pcontext = start_processes(
        name=spec.role,
        entrypoint=spec.entrypoint,
        args=args,
        envs=envs,
        log_dir=attempt_log_dir,
        start_method=self._start_method,
        redirects=spec.redirects,
        tee=spec.tee,
    )
    return self._pcontext.pids()