Example #1
def _create_c10d_store(hostname, port, rank, world_size, timeout) -> Store:
    """
    Smartly creates a c10d Store object on ``rank`` based on whether
    we need to reuse the agent store. The TCPStore server is assumed to be hosted
    on ``hostname:port``.

    If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that
    the agent leader (node rank 0) hosts the TCPStore server (for which the
    endpoint is specified by the given ``hostname:port``). Hence
    ALL ranks will create and return a TCPStore client (i.e. ``start_daemon=False``).

    If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host
    the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname
    and port are correctly passed via ``hostname`` and ``port``. All
    non-zero ranks will create and return a TCPStore client.
    """
    # check if port is uint16_t
    if not 0 <= port < 2**16:
        raise ValueError(
            f"port must have value from 0 to 65535 but was {port}.")

    if _torchelastic_use_agent_store():
        attempt = os.environ["TORCHELASTIC_RESTART_COUNT"]
        tcp_store = TCPStore(hostname, port, world_size, False, timeout)
        return PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
    else:
        start_daemon = rank == 0
        return TCPStore(hostname,
                        port,
                        world_size,
                        start_daemon,
                        timeout,
                        multi_tenant=True)
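For context, a minimal usage sketch follows. The endpoint values are hypothetical, and it assumes the module's own imports (``os``, ``TCPStore``, ``PrefixStore``) are in scope:

from datetime import timedelta

# With TORCHELASTIC_USE_AGENT_STORE unset, rank 0 hosts the store and the
# remaining ranks connect as clients. A world size of 1 is used here so the
# constructor does not block waiting for peers.
store = _create_c10d_store(
    hostname="127.0.0.1",       # rank 0's address
    port=29500,                 # must fit in uint16_t, as validated above
    rank=0,                     # this process hosts the store
    world_size=1,
    timeout=timedelta(seconds=60),
)
store.set("status", "ready")    # any rank can now read this key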
Example #2
def setup_tcpstore(rank, world_size, rdzv_version, rdzv_impl):
    if rank == 0:
        import socket
        from contextlib import closing

        # FIXME: ideally, TCPStore should have an API that
        # accepts a pre-constructed socket.
        with closing(_get_socket_with_port()) as sock:
            host = socket.gethostname()
            port = sock.getsockname()[1]

            rdzv_impl.store_extra_data(rdzv_version,
                                       key="tcpstore_server",
                                       value="{}:{}".format(host, port))

            log.info(f"Setting up TCPStore server on {host}:{port}")
            start_daemon = True
            sock.close()  # FIXME: get rid of race condition by improving TCPStore API
            store = TCPStore(host, port, world_size, start_daemon)
            log.info(f"TCPStore server initialized on {host}:{port}")
    else:
        hostport = rdzv_impl.load_extra_data(rdzv_version,
                                             key="tcpstore_server")
        log.info(f"Rank {rank} will conenct to TCPStore server at {hostport}")

        import re

        host, port = re.match(r"(.+):(\d+)$", hostport).groups()
        start_daemon = False
        store = TCPStore(host, int(port), world_size, start_daemon)

    return store
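The ``_get_socket_with_port()`` helper is not shown above. A plausible sketch is below (an assumption about its behavior, not the verbatim implementation): binding to port 0 lets the OS assign a free ephemeral port, and closing the socket before TCPStore rebinds that port is exactly the race condition the FIXME comments complain about.

import socket

def _get_socket_with_port() -> socket.socket:
    # Bind to port 0 so the OS picks a free ephemeral port; the caller reads
    # the chosen port via sock.getsockname()[1].
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind(("", 0))
    sock.listen(1)
    return sock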
Example #3
def _env_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
    def _error(msg):
        return _rendezvous_error("env:// rendezvous: " + msg)

    def _env_error(var):
        return _error("environment variable %s expected, but not set" % var)

    def _get_env_or_raise(env_var: str) -> str:
        env_val = os.environ.get(env_var, None)
        if not env_val:
            raise _env_error(env_var)
        else:
            return env_val

    result = urlparse(url)
    query: Dict[str, Union[int, str]]
    # mypy doesn't allow dict() to accept List of values (#257)
    query = dict(pair.split("=") for pair in filter(None, result.query.split("&")))  # type: ignore[misc, arg-type]

    rank: Optional[Union[str, int]]
    world_size: Optional[Union[str, int]]
    master_port: Optional[Union[str, int]]

    if "rank" in query:
        rank = int(query["rank"])
    else:
        rank = int(_get_env_or_raise("RANK"))

    if "world_size" in query:
        world_size = int(query["world_size"])
    else:
        world_size = int(_get_env_or_raise("WORLD_SIZE"))

    master_addr = _get_env_or_raise("MASTER_ADDR")
    master_port = int(_get_env_or_raise("MASTER_PORT"))

    use_torchelastic_store = os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None)

    if use_torchelastic_store == str(True):
        worker_process_prefix = "/worker"
        # When TORCHELASTIC_USE_AGENT_STORE is set, the worker process is assumed
        # to be invoked by the torchelastic agent. The agent creates a TCPStore daemon
        # thread on GROUP_RANK=0, so all user worker processes should create the store with daemon=False.
        tcp_store = TCPStore(master_addr, master_port, world_size, False, timeout)
        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
        yield (PrefixStore(worker_process_prefix, tcp_store), rank, world_size)
    else:
        # Start the TCP store daemon on the rank 0
        start_daemon = rank == 0
        store = TCPStore(  # type: ignore[call-arg]
            master_addr, master_port, world_size, start_daemon, timeout, multi_tenant=True
        )
        # Each if-else condition returns due to: https://github.com/python/mypy/issues/1191
        yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform rerendezvous using env:// method")
Example #4
def _tcp_rendezvous_handler(url: str, timeout: timedelta = default_pg_timeout, **kwargs):
    def _error(msg):
        return _rendezvous_error("tcp:// rendezvous: " + msg)

    result = urlparse(url)
    if not result.port:
        raise _error("port number missing")
    query: Dict[str, Union[int, str]]
    # mypy doesn't allow dict() to accept List of values (#257)
    query = dict(pair.split("=") for pair in filter(None, result.query.split("&")))  # type: ignore[misc, arg-type]
    if "rank" not in query:
        raise _error("rank parameter missing")
    if "world_size" not in query:
        raise _error("world size parameter missing")

    rank = int(query["rank"])
    world_size = int(query["world_size"])
    start_daemon = rank == 0
    assert result.hostname is not None
    store = TCPStore(  # type: ignore[call-arg]
        result.hostname, result.port, world_size, start_daemon, timeout, multi_tenant=True
    )
    yield (store, rank, world_size)

    # If this configuration is invalidated, there is nothing we can do about it
    raise RuntimeError("Unable to perform rerendezvous using tcp:// method")
Example #5
    def test_create_backend_returns_backend_if_is_host_is_false(self) -> None:
        store = TCPStore(  # type: ignore[call-arg] # noqa: F841
            self._expected_endpoint_host, self._expected_endpoint_port, is_master=True
        )

        self._params.config["is_host"] = "false"

        self.test_create_backend_returns_backend()
Example #6
    def test_store_methods_forward_calls_to_inner(self):
        inner = TCPStore("127.0.0.1", 0, is_master=True)

        store = _ClosableStore(inner)

        store.set("dummy", "dummy")

        store.close()
Example #7
    def test_create_backend_returns_backend_if_is_host_is_not_specified_and_store_already_exists(
        self,
    ) -> None:
        store = TCPStore(  # type: ignore[call-arg] # noqa: F841
            self._expected_endpoint_host, self._expected_endpoint_port, is_master=True
        )

        del self._params.config["is_host"]

        self.test_create_backend_returns_backend()
Example #8
    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU.

        Returns:
            True if the distributed library is initialized successfully.
        """
        if self._args.distributed_impl:
            logger.info(
                'Distributed training is enabled - model: {}, distributed implementation: {}.'
                .format(self._name, self._args.distributed_impl))
            if self._args.distributed_impl == DistributedImpl.HOROVOD:
                import horovod.torch as hvd
                hvd.init()
                self._world_size = int(hvd.size())
                self._local_rank = int(hvd.local_rank())
                self._global_rank = int(hvd.rank())
            elif self._args.distributed_impl == DistributedImpl.DDP:
                if os.environ.get('WORLD_SIZE') is None or os.environ.get(
                        'LOCAL_RANK') is None:
                    logger.error(
                        'Can not find WORLD_SIZE or LOCAL_RANK in env variables - model: {},'
                        ' distributed implementation: {}.'.format(
                            self._name, self._args.distributed_impl))
                    return False
                # For torch >= 1.9.0a0, torch.distributed.elastic is used by default
                port = int(os.environ['MASTER_PORT']) + 1
                addr = os.environ['MASTER_ADDR']
                self._global_rank = int(os.environ['RANK'])
                self._local_rank = int(os.environ['LOCAL_RANK'])
                self._world_size = int(os.environ['WORLD_SIZE'])
                logger.debug('ip:{},port:{},rank:{},world:{}'.format(
                    addr, port, self._global_rank, self._world_size))
                store = PrefixStore(
                    self._name,
                    TCPStore(addr, port, self._world_size,
                             self._global_rank == 0, timedelta(seconds=300)))
                torch.distributed.init_process_group(
                    backend=self._args.distributed_backend.value,
                    timeout=timedelta(seconds=300),
                    rank=self._global_rank,
                    world_size=self._world_size,
                    store=store)

            else:
                logger.error(
                    'Unsupported distributed implementation - model: {}, distributed implementation: {}.'
                    .format(self._name, self._args.distributed_impl))
                return False

            if self._gpu_available:
                torch.cuda.set_device(self._local_rank)

        return True
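Note the ``MASTER_PORT + 1`` offset: the benchmark binds its own store one port above the launcher's, presumably to avoid colliding with an agent store already on MASTER_PORT. A hedged sketch of the environment such a launcher would provide (values are illustrative):

import os

os.environ.update({
    "MASTER_ADDR": "127.0.0.1",
    "MASTER_PORT": "29500",   # the store above then binds to 29501
    "RANK": "0",
    "LOCAL_RANK": "0",
    "WORLD_SIZE": "1",
})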
Example #9
    def test_store_methods_raise_error_if_store_is_closed(self):
        inner = TCPStore("127.0.0.1", 0, is_master=True)

        store = _ClosableStore(inner)

        store.set("dummy", "dummy")

        store.close()

        with self.assertRaisesRegex(RuntimeError,
                                    r"^The store is already closed.$"):
            store.set("dummy", "dummy")
Example #10
    def next_rendezvous(self) -> Tuple[Store, int, int]:
        log.info("Creating TCPStore as the c10d::Store implementation")
        if not self._store:
            is_master = self.rank == 0
            self._store = TCPStore(
                self.master_addr,
                self.master_port,
                self.world_size,
                is_master,
                self.timeout,
            )
        store = PrefixStore(self.run_id, self._store)
        return store, self.rank, self.world_size
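Because the returned store is a PrefixStore keyed by ``run_id``, keys from different runs sharing one TCPStore do not collide. A hedged sketch (``handler`` is a hypothetical instance of the class above):

store, rank, world_size = handler.next_rendezvous()
store.set("ready", "1")            # stored under "<run_id>/ready" in the TCPStore
assert store.get("ready") == b"1"  # values come back as bytes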
Example #11
def _create_tcp_store(params: RendezvousParameters) -> TCPStore:
    host, port = _parse_rendezvous_endpoint(params.endpoint,
                                            default_port=29500)

    cfg_is_host = params.get_as_bool("is_host")
    # If the user has explicitly specified whether our process should host the
    # the store, respect it.
    if cfg_is_host is not None:
        is_host = cfg_is_host
    # Otherwise try to determine whether we are the host based on our hostname
    # and IP address.
    else:
        is_host = _matches_machine_hostname(host)

    # The timeout for reads from the store, in seconds.
    read_timeout = cast(int, params.get_as_int("read_timeout", 60))
    if read_timeout <= 0:
        raise ValueError("The read timeout must be a positive integer.")

    # In specific cases we attempt to instantiate the store twice. For details
    # see the explanation in the except clause below.
    for _ in range(2):
        try:
            store = TCPStore(  # type: ignore[call-arg]
                host,
                port,
                is_master=is_host,
                timeout=timedelta(seconds=read_timeout))

            if is_host:
                log.info(
                    f"Process {os.getpid()} hosts the TCP store for the C10d rendezvous backend."
                )

            break
        except (ValueError, RuntimeError) as exc:
            # If we heuristically inferred the value of is_host as True and our
            # first attempt to instantiate the TCP store has failed, try it one
            # more time with is_host set to False. As an edge case there can be
            # more than one process that is part of the same rendezvous on this
            # machine and only one of them will eventually host the store.

            if not is_host or cfg_is_host is not None:
                raise RendezvousConnectionError(
                    "The connection to the C10d store has failed. See inner exception for details."
                ) from exc

            is_host = False

    return store
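The two-attempt loop matters when several processes on one machine all infer ``is_host=True``: only the first bind succeeds, and the rest fall back to client mode. A sketch of driving the function (the RendezvousParameters arguments shown are assumptions for illustration):

from torch.distributed.elastic.rendezvous import RendezvousParameters

# 'is_host' is deliberately omitted so the host role is inferred from the
# endpoint's hostname, exercising the fallback path above.
params = RendezvousParameters(
    backend="c10d",
    endpoint="localhost:29400",
    run_id="demo",
    min_nodes=1,
    max_nodes=1,
)
store = _create_tcp_store(params)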
Example #12
    def setUpClass(cls) -> None:
        cls._store = TCPStore("localhost", 0, is_master=True)  # type: ignore[call-arg]

    def setUpClass(cls) -> None:
        cls._store = TCPStore("127.0.0.1", 0, is_master=True)  # type: ignore[call-arg]