예제 #1
0
파일: launch.py 프로젝트: jenot15/elastic
def main(args=None):
    """Parse the launcher arguments and run the entrypoint via ``elastic_launch``.

    When ``args.standalone`` is set, an ``EtcdServer`` is started and the
    ``rdzv_backend``, ``rdzv_endpoint`` and ``rdzv_id`` attributes of the
    parsed arguments are overwritten to point at it. The server is stopped
    before this function returns or propagates an exception.

    Args:
        args: Forwarded unchanged to ``parse_args``; defaults to ``None``.
    """
    args = parse_args(args)

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Everything that happens after the server has been started runs under
    # try/finally, so a failure while building the config (or while reading
    # the endpoint) can no longer leave the etcd server running.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(f"\n**************************************\n"
                     f"Rendezvous info:\n"
                     f"--rdzv_backend={args.rdzv_backend} "
                     f"--rdzv_endpoint={args.rdzv_endpoint} "
                     f"--rdzv_id={args.rdzv_id}\n"
                     f"**************************************\n")

        config, cmd = config_from_args(args)

        # cmd[0] is the entrypoint; the remaining items are its arguments.
        elastic_launch(
            config=config,
            entrypoint=cmd[0],
        )(*cmd[1:])
    finally:
        # Guard on the server object rather than the flag so cleanup only
        # runs for a server that was actually started.
        if etcd_server is not None:
            etcd_server.stop()
예제 #2
0
 def elastic_launch_wrapper():
     """Run the test script through ``elastic_launch``.

     ``elastic_launch`` is a class, so it is wrapped in a plain function in
     order to make multiprocess return the correct exit code.
     """
     launch_config = self.get_test_launch_config(
         min_nodes, max_nodes, nproc_per_node, run_id
     )
     launcher = elastic_launch(launch_config, sys.executable)
     launcher(
         "-u",
         path("bin/test_script.py"),
         f"--touch_file_dir={self.test_dir}",
     )
예제 #3
0
    def test_launch_elastic(self):
        """Launch the Python test script and check that the workers ran."""
        nproc_per_node = 4
        launch_config = self.get_test_launch_config(1, 2, nproc_per_node)

        launcher = elastic_launch(launch_config, sys.executable)
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )

        # The expected world size here is just the per-node process count.
        self.check_works_ran(nproc_per_node)
예제 #4
0
    def test_launch_script_bash(self):
        """Launch the bash test script and check that the workers ran."""
        nnodes, nproc_per_node = 1, 4
        launch_config = self.get_test_launch_config(
            nnodes, nnodes, nproc_per_node
        )

        # The script itself is the entrypoint; the test directory is its
        # only argument.
        launcher = elastic_launch(launch_config, path("bin/test_script.sh"))
        launcher(f"{self.test_dir}")

        self.check_works_ran(nnodes * nproc_per_node)
예제 #5
0
    def test_launch_script_python(self):
        """Launch the Python test script and check that every worker ran."""
        nnodes, nproc_per_node = 1, 4
        launch_config = self.get_test_launch_config(
            nnodes, nnodes, nproc_per_node
        )

        launcher = elastic_launch(launch_config, sys.executable)
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )

        # Each worker touches a file named after its global rank, so the
        # check below verifies that all of them ran.
        self.check_works_ran(nnodes * nproc_per_node)
예제 #6
0
 def test_launch_elastic_agent_raise_exception(self, record_mock,
                                               mock_agent_run):
     """
     Check that an exception raised by the agent is re-raised by the
     launcher, and that ``record_mock`` is called exactly once.
     """
     mock_agent_run.side_effect = MockException

     with self.assertRaises(MockException):
         launch_config = self.get_test_launch_config(1, 2, 4)
         launcher = elastic_launch(launch_config, sys.executable)
         launcher(
             "-u",
             path("bin/test_script.py"),
             f"--touch_file_dir={self.test_dir}",
         )

     record_mock.assert_called_once()
예제 #7
0
    def test_launch_elastic_worker_raise_exception(self, record_mock):
        """
        Check that when the worker program fails, the launcher raises
        ``ChildFailedError`` to indicate that a worker process failed.
        """
        nproc_per_node = 4

        with self.assertRaises(ChildFailedError):
            launch_config = self.get_test_launch_config(1, 2, nproc_per_node)
            launcher = elastic_launch(launch_config, sys.executable)
            # The test script is invoked with its "--fail" flag.
            launcher("-u", path("bin/test_script.py"), "--fail")

        record_mock.assert_called_once()
예제 #8
0
    def test_launch_shutdown(self, agent_mock_cls):
        """Check that the rendezvous handler is shut down once after a launch."""
        rdzv_handler_mock = Mock()

        # Stand-in agent whose run() reports a successful result.
        agent_mock = Mock()
        agent_mock.run.return_value = RunResult(WorkerState.SUCCEEDED)
        agent_mock_cls.return_value = agent_mock

        handler_factory = (
            "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler"
        )
        with patch(handler_factory) as param_mock:
            # The patched factory hands back the mock handler.
            param_mock.return_value = rdzv_handler_mock

            launch_config = self.get_test_launch_config(1, 1, 4)
            launcher = elastic_launch(launch_config, sys.executable)
            launcher(
                "-u",
                path("bin/test_script.py"),
                f"--touch_file_dir={self.test_dir}",
            )

            rdzv_handler_mock.shutdown.assert_called_once()
예제 #9
0
    def test_launch_function(self):
        """Launch a Python function as the entrypoint and check its results."""
        nnodes, nproc_per_node = 1, 4
        launch_config = self.get_test_launch_config(
            nnodes, nnodes, nproc_per_node
        )

        launcher = elastic_launch(launch_config, simple_rank_scale)
        res = launcher()

        # Only the values of the result are compared, in sorted order.
        actual_res = sorted(res.values())
        self.assertEqual([10, 11, 12, 13], actual_res)