示例#1
0
    def _create_nccl_pg(self, name_prefix):
        """Construct a TorchScript-class NCCL process group for this rank.

        A fresh TCP store (requested with ``jit_class=True``) backs the group,
        and the group name is derived from ``name_prefix`` through
        ``unique_process_group_name``.

        Args:
            name_prefix: Prefix handed to ``unique_process_group_name``.

        Returns:
            The ``torch.classes.dist_c10d.ProcessGroupNCCL`` instance built
            from ``self.rank`` and ``self.world_size``.
        """
        store = create_tcp_store(jit_class=True)
        # NOTE(review): the positional options (0, True) are passed through
        # unchanged; their meaning is not visible from this method.
        nccl_options = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True)
        pg_name = unique_process_group_name(name_prefix)

        return torch.classes.dist_c10d.ProcessGroupNCCL(
            store,
            self.rank,
            self.world_size,
            nccl_options,
            pg_name,
        )
示例#2
0
            def __init__(self):
                """Create the module and attach a process group as ``self.pg``.

                The group is obtained from the ``dist_c10d`` frontend using a
                new TCP store and a unique name based on
                ``"module_member_process_group"``.
                """
                super(TestModule, self).__init__()
                store = create_tcp_store(jit_class=True)
                pg_name = unique_process_group_name("module_member_process_group")

                frontend = torch.classes.dist_c10d.frontend()
                # NOTE(review): leading 1, 0 presumably mean world size 1 and
                # rank 0 (cf. sibling callers passing world_size, rank) - confirm.
                self.pg = frontend.new_process_group_helper(
                    1, 0, [], "nccl", store, pg_name, 0)
示例#3
0
 def _multi_worker_helper(self, world_size):
     """Start a server store, then create one client per worker index.

     The server store is created with the ``world_size`` exactly as given
     and ``wait_for_workers=False``. A ``world_size`` of ``-1`` is then
     replaced by a random count between 5 and 10 (inclusive), and that
     resolved value is what each client receives.

     Args:
         world_size: Number of clients to create, or ``-1`` for a random
             number.
     """
     host = DEFAULT_HOSTNAME
     server = create_tcp_store(host, world_size, wait_for_workers=False)
     server.set("key", "value")
     server_port = server.port

     # Resolve the "-1" placeholder only after the server store exists.
     if world_size == -1:
         world_size = random.randint(5, 10)

     for client_idx in range(world_size):
         self._create_client(client_idx, host, server_port, world_size)
示例#4
0
    def _multi_worker_helper(self, world_size):
        """Start a server store and connect clients to it one after another.

        Args:
            world_size: Passed unchanged to the server store and to every
                client. When it is falsy (e.g. ``0`` or ``None``) a single
                client is still created.
        """
        host = DEFAULT_HOSTNAME
        server = create_tcp_store(host, world_size, wait_for_workers=False)
        server.set("key", "value")
        server_port = server.port

        # Always create at least one client, even for a falsy world_size.
        client_count = world_size or 1
        for client_idx in range(client_count):
            self._create_client(client_idx, host, server_port, world_size)
示例#5
0
    def test_frontend_singleton(self):
        """Check that two ``frontend()`` handles see the same process group.

        A group is registered through the first handle; the second handle
        must be able to look it up by name and report that same name back.
        """
        first_frontend = torch.classes.dist_c10d.frontend()
        second_frontend = torch.classes.dist_c10d.frontend()

        store = create_tcp_store(jit_class=True)
        group_name = unique_process_group_name("singleton_test_process_group")

        # Keep a reference to the created group for the duration of the test.
        created_group = first_frontend.new_process_group_helper(
            self.world_size, self.rank, [], "nccl", store, group_name, 0)

        # Look the group up through the *other* handle.
        found_group = second_frontend.get_process_group_by_name(group_name)
        reported_name = second_frontend.get_name_of_process_group(found_group)
        self.assertEqual(reported_name, group_name)
示例#6
0
 def _multi_worker_helper(self, world_size):
     """Run each client in its own process and fail if any exits non-zero.

     A server store is created first. Every child process runs
     ``self._create_client`` and receives a shared queue as its last
     argument; whatever the children put on that queue is collected
     after all of them have been joined.

     Args:
         world_size: Number of client processes, or ``-1`` to pick a random
             number between 3 and 5 (inclusive). The original value (not
             the resolved count) is what each client receives.

     Raises:
         RuntimeError: If any child process has a non-zero exit code. The
             message is the queued messages, each followed by a newline.
     """
     host = DEFAULT_HOSTNAME
     server = create_tcp_store(host, world_size, wait_for_workers=False)
     server.set("key", "value")
     server_port = server.port
     message_queue = mp.Queue()

     if world_size == -1:
         num_processes = random.randint(3, 5)
     else:
         num_processes = world_size

     workers = []
     for client_idx in range(num_processes):
         worker = mp.Process(
             target=self._create_client,
             args=(client_idx, host, server_port, world_size, message_queue),
         )
         workers.append(worker)
         worker.start()

     for worker in workers:
         worker.join()

     # Drain everything the children reported, one message per line.
     collected = []
     while not message_queue.empty():
         collected.append(message_queue.get() + "\n")

     if any(worker.exitcode != 0 for worker in workers):
         raise RuntimeError("".join(collected))
示例#7
0
    def _create_nccl_pg_as_base_process_group(self, name):
        """Create an "nccl" process group through the ``dist_c10d`` frontend.

        Args:
            name: Name under which the frontend creates the group.

        Returns:
            Whatever ``new_process_group_helper`` returns for this rank and
            world size, backed by a newly created TCP store.
        """
        store = create_tcp_store(jit_class=True)
        frontend = torch.classes.dist_c10d.frontend()

        return frontend.new_process_group_helper(
            self.world_size, self.rank, [], "nccl", store, name, 0)
示例#8
0
 def setUp(self):
     """Prepare the fixture: a TCP store, a key prefix, and a 5-minute timeout."""
     super(PrefixTCPStoreTest, self).setUp()
     store = create_tcp_store()
     self.tcpstore = store
     self.prefix = "test_prefix"
     # 5 minutes == 300 seconds.
     store.set_timeout(timedelta(minutes=5))
示例#9
0
 def _create_store(self):
     """Return a new TCP store whose timeout has been set to five minutes."""
     tcp_store = create_tcp_store()
     five_minutes = timedelta(minutes=5)  # same duration as seconds=300
     tcp_store.set_timeout(five_minutes)
     return tcp_store