Пример #1
0
    num_min_workers = int(params["min_workers"])
    num_max_workers = int(params.get("max_workers", num_min_workers))
    assert num_min_workers >= 1, "Min number of workers should be at least 1"
    assert (
        num_max_workers >= num_min_workers
    ), "Max number of workers cannot be less than min number of workers"

    timeout = int(params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT))
    last_call_timeout = int(
        params.get("last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT)
    )

    kwargs = _parse_etcd_client_params(params)

    # Etcd rendezvous implementation
    etcd_rdzv = CustomRendezvous(
        endpoints=etcd_endpoints,
        prefix=etcd_prefix,
        run_id=run_id,
        num_min_workers=num_min_workers,
        num_max_workers=num_max_workers,
        timeout=timeout,
        last_call_timeout=last_call_timeout,
        **kwargs,
    )
    return CustomRendezvousHandler(rdzv_impl=etcd_rdzv)


# Register `_custom_rendezvous_handler` as the handler for "custom".
# It provides a torchelastic.rendezvous.RendezvousHandler using etcd (API v2).
register_rendezvous_handler("custom", _custom_rendezvous_handler)
Пример #2
0
    etcd_prefix = params.get("etcd_prefix", "/torchelastic/p2p")
    num_min_workers = int(params["min_workers"])
    num_max_workers = int(params.get("max_workers", num_min_workers))
    assert num_min_workers >= 1, "Min number of workers should be at least 1"
    assert (
        num_max_workers >= num_min_workers
    ), "Max number of workers cannot be less than min number of workers"

    timeout = int(params.get("timeout", CONST_DEFAULT_OVERALL_TIMEOUT))
    last_call_timeout = int(
        params.get("last_call_timeout", CONST_DEFAULT_LAST_CALL_TIMEOUT))

    kwargs = _parse_etcd_client_params(params)

    # Etcd rendezvous implementation
    etcd_rdzv = EtcdRendezvous(
        endpoints=etcd_endpoints,
        prefix=etcd_prefix,
        run_id=run_id,
        num_min_workers=num_min_workers,
        num_max_workers=num_max_workers,
        timeout=timeout,
        last_call_timeout=last_call_timeout,
        kwargs=kwargs,
    )
    return EtcdRendezvousHandler(rdzv_impl=etcd_rdzv)


# Register `_etcd_rendezvous_handler` as the handler for "etcd".
# It provides a torchelastic.rendezvous.RendezvousHandler using etcd (API v2).
register_rendezvous_handler("etcd", _etcd_rendezvous_handler)
Пример #3
0
import torchelastic.rendezvous.parameters as parameters

# pyre-fixme[21]: Could not find name `register_rendezvous_handler` in
#  `torch.distributed`.
from torch.distributed import register_rendezvous_handler


class MockedRdzv:
    """Empty stand-in object used as a mocked rendezvous in these tests."""


def get_mock_rdzv(url):
    """Build a new ``MockedRdzv`` instance; the ``url`` argument is not used."""
    mocked = MockedRdzv()
    return mocked


# Register `get_mock_rdzv` as the handler for "mocked-rdzv" using
# torch.distributed's `register_rendezvous_handler`.
register_rendezvous_handler("mocked-rdzv", get_mock_rdzv)


class RendezvousHandlerFactoryTest(unittest.TestCase):
    def test_construct_rdzv_url(self):
        params = parameters.RendezvousParameters(
            "etcd",
            "localhost:8081",
            "1234",
            1,
            4,
            "timeout=60,protocol=https,key=/etc/kubernetes/certs/client.key",
        )
        actual_url = parameters._construct_rendezvous_url(params)

        expected_url = ("etcd://localhost:8081/1234"