    def test_rl_rpc(self):
        if self.rank == 0:
            # Rank 0 is the agent.
            rpc.init_rpc(
                name=worker_name(self.rank),
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )
            agent = Agent(self.world_size)
            run_agent(agent,
                      n_steps=int(TOTAL_EPISODE_STEP / (self.world_size - 1)))

            # Ensure training was run. We don't really care about whether the task was learned,
            # since the purpose of the test is to check the API calls.
            self.assertGreater(agent.running_reward, 0.0)
        else:
            # Other ranks are observers that passively wait for instructions from the agent.
            rpc.init_rpc(
                name=worker_name(self.rank),
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )
        rpc.shutdown()
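The test above relies on a `worker_name` helper from the surrounding test utilities; a minimal sketch, consistent with the "worker{rank}" naming used throughout these examples:

def worker_name(rank):
    # Matches the "worker{rank}" naming convention used in the other examples.
    return f"worker{rank}"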
Example #2
def run(rank, world_size):
    print(f'rank = {rank} world_size = {world_size}')
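    # `key_list` is assumed to be defined at module level (the env var names to echo).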
    env_dict = {key: os.environ[key] for key in key_list}

    print(env_dict)

    if rank == 0:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

        coordinator = Coordinator(world_size=world_size, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(1000, coord_rref)

        # Use context managers so the file handles are closed deterministically.
        with open(f'plots/policy_nworkers{world_size - 1}.pt', 'wb') as f:
            torch.save(coordinator.policy, f)
        with open(f'plots/stats_nworkers{world_size - 1}.json', 'w') as f:
            json.dump(coordinator.stats, f)

    else:
        rpc.init_rpc(f"rank{rank}",
                     rank=rank,
                     world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)

    rpc.shutdown()
Example #3
def run_worker(rank, world_size) -> None:
    r"""
    This is the entry point for all processes. Rank 0 is the agent; all
    other ranks are observers.
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    if rank == 0:  # rank 0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size)
        for i_episode in count(1):
            n_steps = int(TOTAL_EPISODE_STEP / (world_size - 1))
            agent.run_episode(n_steps=n_steps)
            last_reward = agent.finish_episode()

            if i_episode % config.log_interval == 0:
                print(
                    f"Episode {i_episode}\tLast reward: {last_reward:.2f}\tAverage reward: {agent.running_reward:.2f}"
                )

            if agent.running_reward > agent.reward_threshold:
                print(f"Solved! Running reward is now {agent.running_reward}!")
                break
    else:  # other ranks are observers
        rpc.init_rpc(
            OBSERVER_NAME.format(rank), rank=rank, world_size=world_size
        )  # observers passively wait for instructions from the agent

    rpc.shutdown()
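In these agent/observer examples the observers only call `init_rpc`; it is the agent that instantiates them remotely. A minimal sketch of that step, assuming a hypothetical `Observer` class and the `OBSERVER_NAME` template from the example above:

import torch.distributed.rpc as rpc

class Agent:
    def __init__(self, world_size):
        self.ob_rrefs = []
        for ob_rank in range(1, world_size):
            ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
            # Create one Observer instance on each observer process.
            self.ob_rrefs.append(rpc.remote(ob_info, Observer))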
Example #4
    def new_test_method(self, *arg, **kwargs):
        # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
        # in tests.
        import torch.distributed.rpc.api as api
        api._ignore_rref_leak = False

        self.worker_id = self.rank

        if faulty_messages:
            _build_faulty_backend_options(faulty_messages)

        if setup_rpc:
            rpc.init_rpc(
                name="worker%d" % self.rank,
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )

        return_value = old_test_method(self, *arg, **kwargs)

        if setup_rpc:
            rpc.shutdown(graceful=clean_shutdown)

        return return_value
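`new_test_method` here is the inner function of a decorator factory (PyTorch's `dist_init`-style test wrapper); `old_test_method`, `setup_rpc`, `clean_shutdown`, and `faulty_messages` are closed over by the enclosing scope. A minimal sketch of that enclosing shape, with the body elided:

import functools

def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True,
              faulty_messages=None):
    # Support both @dist_init and @dist_init(setup_rpc=False, ...) usage.
    if old_test_method is None:
        return functools.partial(dist_init, setup_rpc=setup_rpc,
                                 clean_shutdown=clean_shutdown,
                                 faulty_messages=faulty_messages)

    @functools.wraps(old_test_method)
    def new_test_method(self, *arg, **kwargs):
        ...  # body as in the example above
    return new_test_method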
Example #5
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[guess_rank]

    torch.distributed.init_process_group(backend="mpi")
    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method),
    )

    initialize_model_parallel(1, world_size)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
Example #6
def run_worker(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256,
                                              rpc_timeout=600)

    import psutil
    p = psutil.Process()

    if rank == 0:
        p.cpu_affinity([0])
        print(
            f"Child #{rank}: Set my affinity to {rank}, affinity now {p.cpu_affinity()}",
            flush=True)

        rpc.init_rpc("master",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)
        run_master()
    else:
        p.cpu_affinity([rank - 1])
        print(
            f"Child #{rank}: Set my affinity to {rank - 1}, affinity now {p.cpu_affinity()}",
            flush=True)

        rpc.init_rpc(f"worker{rank}",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)

    # block until all rpcs finish
    rpc.shutdown()
Example #7
def basic_rpc():
    init_rpc()
    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    model = [nn.Linear(10, 10), nn.ReLU()]
    pipe = PipeRPCWrapper(model, [1, 1],
                          input_device=torch.cuda.current_device(),
                          worker_map=get_worker_map())

    pipe.foreach_worker(register_optimizer, include_self=True)

    inputs = torch.rand(10).cuda()
    output = pipe(inputs)
    loss = output.mean()
    loss.backward()

    pipe.foreach_worker(step_optimizer, include_self=True)

    pipe.eval()

    rpc.shutdown()
    torch.distributed.barrier()
Example #8
def run_benchmark(rank, model, data, config):

    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method
    if rank == world_size - 1:
        # master rank = trainer_count + parameter_server_count (the last rank)
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(get_name(rank, config),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        if (USE_CUDA_RPC in trainer_config and trainer_config[USE_CUDA_RPC]
                and USE_CUDA_RPC in ps_config and ps_config[USE_CUDA_RPC]
                and config.ps_count > 0):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(ps_rank, config)
            rpc_backend_options.set_device_map(ps_name, {rank: ps_rank})
        trainer_name = get_name(rank, config)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    rpc.shutdown()
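`get_name` and `get_parameter_server_rank` come from the benchmark's utilities; a plausible reconstruction, assuming trainers occupy ranks [0, trainer_count), parameter servers the next ps_count ranks, and the master the last rank:

def get_name(rank, config):
    # Hypothetical reconstruction: name each rank by its role.
    if rank < config.trainer_count:
        return f"trainer{rank}"
    elif rank < config.trainer_count + config.ps_count:
        return f"ps{rank - config.trainer_count}"
    return "master"

def get_parameter_server_rank(rank, config):
    # Hypothetical: assign trainers to parameter servers round-robin.
    return config.trainer_count + rank % config.ps_count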
Example #9
def run_worker(rank, world_size, n_episode, batch, print_log=True):
    r"""
    This is the entry point for all processes. Rank 0 is the agent; all
    other ranks are observers.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size, batch)
        for i_episode in range(n_episode):
            last_reward, running_reward = agent.run_episode(n_steps=NUM_STEPS)

            if print_log:
                print(
                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                    format(i_episode, last_reward, running_reward))
    else:
        # other ranks are observers
        rpc.init_rpc(OBSERVER_NAME.format(rank),
                     rank=rank,
                     world_size=world_size)
        # observers passively wait for instructions from the agent
    rpc.shutdown()
Example #10
def rpc_worker(rank, world_size, init_file, func, *args):
    if torch_version() == (1, 8, 0):
        if torch.cuda.is_available():
            # Workaround for https://github.com/pytorch/pytorch/issues/53844
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file, _transports=["ibv", "uv"])
        else:
            # Workaround for https://github.com/pytorch/pytorch/issues/54266
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file,
                _channels=[
                    "mpt_uv", "basic", "cuda_ipc", "cuda_gdr", "cuda_xth",
                    "cuda_basic"
                ],
            )
    else:
        options = rpc.TensorPipeRpcBackendOptions(init_method="file://" +
                                                  init_file)
    rpc.init_rpc(
        "worker" + str(rank),
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=options,
    )
    if rank == 0:
        func(*args)
    rpc.shutdown()
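`torch_version()` is a fairscale-style helper that turns `torch.__version__` into a comparable tuple; a minimal sketch, assuming a plain `major.minor.patch` version string (possibly with a `+cu...` local suffix):

import torch

def torch_version():
    # "1.8.0+cu111" -> (1, 8, 0): drop the local build suffix, then split.
    return tuple(int(n) for n in torch.__version__.split("+")[0].split(".")[:3])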
Example #11
    def new_test_method(self, *arg, **kwargs):
        # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
        # in tests.
        import torch.distributed.rpc.api as api
        api._ignore_rref_leak = False

        self.worker_id = self.rank

        if setup_rpc:
            global _ALL_NODE_NAMES
            _ALL_NODE_NAMES = {
                "worker{}".format(rank)
                for rank in range(self.world_size)
            }

            rpc.init_rpc(
                name="worker%d" % self.rank,
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )

        return_value = old_test_method(self, *arg, **kwargs)

        if setup_rpc:
            rpc.shutdown(graceful=clean_shutdown)

        return return_value
Example #12
def run_rpc(rank):
    argv = {
        'world_size': 2,
        'rank': 0,
        'host': "worker0",
        'worker': "worker1",
        'epochs': 5,
        'lr': 1e-3,
        'root': 'data',
        'batch_size': 32
    }

    # Both ranks rendezvous over the same master address/port.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29502'

    if rank == 0:
        print(argv)
        rpc.init_rpc(argv['host'], rank=rank, world_size=argv['world_size'])
        print('Start Run:', rank)
        run(argv)
        rpc.shutdown()
    else:
        rpc.init_rpc(argv['worker'], rank=rank, world_size=argv['world_size'])
        print('Start Run:', rank)
        rpc.shutdown()
Example #13
def start(args):
    # num_worker_threads must be passed via the backend options;
    # rpc.init_rpc() does not accept it as a keyword argument.
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
    rpc.init_rpc(args.name,
                 rank=args.rank,
                 world_size=args.world_size,
                 rpc_backend_options=options)

    rpc.shutdown()
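A hypothetical launcher for `start`, showing the argparse namespace it expects:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--name', required=True)
parser.add_argument('--rank', type=int, required=True)
parser.add_argument('--world_size', type=int, required=True)
start(parser.parse_args())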
Example #14
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = '172.31.13.136'
    # os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    # options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=32)

    if rank == 0:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            # rpc_backend_options=options
        )
        run_master(num_split)

    else:
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            # rpc_backend_options=options
        )
        print("slave init")
        pass

    # block until all rpcs finish
    rpc.shutdown()
Example #15
    def test_reinit(self):
        rpc.init_rpc(
            name="worker{}".format(self.rank),
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

        # This is for the below `dist.barrier`.
        # For `RpcAgent` other than `ProcessGroupAgent`,
        # no `_default_pg` is initialized.
        if not dist.is_initialized():
            dist.init_process_group(
                backend="gloo",
                init_method=self.init_method,
                rank=self.rank,
                world_size=self.world_size,
            )
        # Wait for all init to complete.
        dist.barrier()

        with self.assertRaisesRegex(RuntimeError, "is already initialized"):
            rpc.init_rpc(
                name="worker{}".format(self.rank),
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )
        rpc.shutdown()
Example #16
def run_driver(rank, world_size, gpu_list, dataset, batch_size,
               lr, mom, lambd, max_epoch, client_epoch, model, seed, q,
               early_stop_round, early_stop_metric):
    exp_id = str(int(time.time()))
    print(f"Driver initializing RPC, rank {rank}, world size {world_size}")
    rpc.init_rpc(name="driver", rank=rank, world_size=world_size)
    print("Initialized driver")
    param_server_rref = rpc.remote("parameter_server", get_parameter_server,
                                   args=(gpu_list[0], world_size - 1, dataset, batch_size,
                                         lr, mom, lambd, model, max_epoch, client_epoch,
                                         seed, exp_id, early_stop_round, early_stop_metric))
    for _rank in range(1, world_size - 1):
        print(f"Driver registering worker node {_rank}")
        worker_server_rref = rpc.remote(f"trainer_{_rank}", get_worker,
                                        args=(gpu_list[_rank], _rank, world_size - 1, dataset,
                                              model, batch_size, lr, seed, exp_id))
        print(f"Driver binding worker {_rank} with param server")
        remote_method(ParameterServer.embedding_workers, param_server_rref, worker_server_rref)
        remote_method(TrainerNet.embedding_param_server, worker_server_rref, param_server_rref)

    fut = remote_method_async(ParameterServer.instruct_training, param_server_rref)
    fut.wait()
    final_accuracy = remote_method(ParameterServer.get_final_accuract, param_server_rref)
    q.put(final_accuracy)
    rpc.shutdown()
    print("RPC shutdown on Driver")
Example #17
    def test_shutdown(self):
        # Initialize RPC.
        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

        n = self.rank + 1
        dst_rank = n % self.world_size
        ret = rpc.rpc_sync(
            "worker{}".format(dst_rank),
            torch.add,
            args=(torch.ones(n, n), torch.ones(n, n)),
        )
        self.assertEqual(ret, torch.ones(n, n) * 2)
        rpc.shutdown()

        with self.assertRaisesRegex(RuntimeError,
                                    "^RPC has not been initialized"):
            rpc.rpc_sync(
                "worker{}".format(dst_rank),
                torch.add,
                args=(torch.ones(n, n), torch.ones(n, n)),
            )

        # it's safe to call shutdown() multiple times
        rpc.shutdown()
Example #18
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256, rpc_timeout=600)

    import psutil
    p = psutil.Process()
    
    if rank == 0:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_master(num_split)
    else:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )

    # block until all rpcs finish
    rpc.shutdown()
Example #19
def rpc_optimizer():

    init_rpc()
    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    def model_with_reuse():
        reused_1 = nn.Linear(10, 10)
        return [reused_1, nn.ReLU(), reused_1, nn.ReLU(), reused_1, nn.ReLU()]

    check_pipe_against_reference(
        [2, 2, 2],
        lambda: [
            nn.Linear(10, 10),
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU()
        ],
    )
    check_pipe_against_reference([2, 1, 1], model_with_reuse)

    rpc.shutdown()
    torch.distributed.barrier()
Example #20
    def test_local_shutdown_with_rpc(self):
        # test that we can start RPC, send RPCs, and then run local shutdown.
        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=rpc.backend_registry.BackendType[
                dist_utils.TEST_CONFIG.rpc_backend_name
            ],
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )
        n = self.rank + 1
        dst_rank = n % self.world_size
        rpc.rpc_sync(
            "worker{}".format(dst_rank),
            torch.add,
            args=(torch.ones(n, n), torch.ones(n, n)),
        )
        # A barrier is needed to ensure that all RPCs are processed.
        # Otherwise, some RPCs can timeout since the receiving end
        # has terminated.
        if not dist.is_initialized():
            dist.init_process_group(
                backend="gloo",
                init_method=self.init_method,
                rank=self.rank,
                world_size=self.world_size,
            )
        dist.barrier()
        # pass in graceful=False to ensure that we don't wait for other workers.
        rpc.shutdown(graceful=False)
Example #21
    def test_init_pg_and_rpc_with_same_socket(self):
        addr = DEFAULT_HOSTNAME
        port = common.find_free_port()

        os.environ["MASTER_ADDR"] = addr
        os.environ["MASTER_PORT"] = str(port)

        # We internally use a multi-tenant TCP store. Both PG and RPC should successfully
        # initialize even when using the same socket address.

        dist.init_process_group(
            backend="gloo",
            init_method="env://",
            rank=0,
            world_size=1,
        )

        backend_opts = rpc.TensorPipeRpcBackendOptions(
            init_method=f"tcp://{addr}:{port}")
        rpc.init_rpc(
            name="worker0",
            rank=0,
            world_size=1,
            rpc_backend_options=backend_opts,
        )

        rpc.shutdown()
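`common.find_free_port` is a PyTorch test utility; a minimal equivalent, assuming binding to port 0 so the OS picks an unused port:

import socket

def find_free_port():
    # Bind to port 0 and let the OS assign a free port, then report it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("localhost", 0))
        return s.getsockname()[1]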
Example #22
    def new_test_method(self, *arg, **kwargs):
        # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
        # in tests.
        import torch.distributed.rpc.api as api
        api._ignore_rref_leak = False

        self.worker_id = self.rank

        if (rpc.backend_registry.backend_registered("FAULTY_PROCESS_GROUP")
                and self.rpc_backend
                == rpc.backend_registry.BackendType.FAULTY_PROCESS_GROUP):
            _build_faulty_backend_options(self, faulty_messages,
                                          messages_to_delay)

        if (rpc.backend_registry.backend_registered("TENSORPIPE")
                and self.rpc_backend
                == rpc.backend_registry.BackendType.TENSORPIPE):
            TEST_CONFIG.rpc_backend_name = "TENSORPIPE"
            _build_tensorpipe_backend_options()

        if setup_rpc:
            rpc.init_rpc(
                name="worker%d" % self.rank,
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=self.rpc_backend_options,
            )

        return_value = old_test_method(self, *arg, **kwargs)

        if setup_rpc:
            rpc.shutdown(graceful=clean_shutdown)

        return return_value
Example #23
def benchmark_multiprocess(rank, world_size, args):

    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    # TODO(anj-s): Add regression benchmarks for nccl as well.
    torch.distributed.init_process_group(
        backend="gloo", rank=rank, world_size=world_size, init_method=init_method_pgroup
    )

    torch.cuda.set_device(rank % torch.cuda.device_count())
    # TODO(anj-s): Move to TensorPipeRpcBackendOptions.
    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method="tcp://localhost:{}".format(RPC_PORT)
        ),
    )
    initialize_model_parallel(1, world_size)
    init_random_seed(0)
    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
Example #24
def run_worker(rank, world_size, num_split):
    # os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_ADDR'] = '172.10.0.2'
    # os.environ['MASTER_PORT'] = '29500'
    os.environ['MASTER_PORT'] = '12345'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256)
    if rank == 0:
        print("Init master")
        rpc.init_rpc("master",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)

        print(rank)
        run_master(num_split)
    else:

        print("init worker rank ", rank)
        rpc.init_rpc(f"worker{rank}",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=options)
    """
    # source: https://pytorch.org/tutorials/intermediate/dist_tuto.html
    dist.init_process_group(
    init_method='tcp://10.1.1.20:23456',
    rank=args.rank,
    world_size=4)
    """

    # block until all rpcs finish
    rpc.shutdown()
Example #25
def run(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=16,
        rpc_timeout=0  # infinite timeout
    )
    if rank != 0:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_ps([f"trainer{r}" for r in range(1, world_size)])

    # block until all rpcs finish
    rpc.shutdown()
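`run_ps` is assumed to follow the PyTorch batch-RPC parameter-server example: the PS process kicks off one training loop per trainer and waits for all of them. A minimal sketch, where `BatchUpdateParameterServer` and `run_trainer` are hypothetical names from that example:

import torch
import torch.distributed.rpc as rpc

def run_ps(trainers):
    # Hypothetical: hold the parameter server in an RRef and fan out
    # one async training loop per trainer, then wait for all of them.
    ps_rref = rpc.RRef(BatchUpdateParameterServer())
    futs = [rpc.rpc_async(trainer, run_trainer, args=(ps_rref,))
            for trainer in trainers]
    torch.futures.wait_all(futs)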
Example #26
    def new_test_method(self, *arg, **kwargs):
        # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
        # in tests.
        import torch.distributed.rpc.api as api

        api._ignore_rref_leak = False
        self.worker_id = self.rank
        self.setup_fault_injection(faulty_messages, messages_to_delay)

        rpc_backend_options = self.rpc_backend_options
        if setup_rpc:
            if TEST_WITH_TSAN:
                # TSAN runs much slower.
                rpc_backend_options.rpc_timeout = rpc.constants.DEFAULT_RPC_TIMEOUT_SEC * 5
                rpc.constants.DEFAULT_SHUTDOWN_TIMEOUT = 60

            rpc.init_rpc(
                name="worker%d" % self.rank,
                backend=self.rpc_backend,
                rank=self.rank,
                world_size=self.world_size,
                rpc_backend_options=rpc_backend_options,
            )

        return_value = old_test_method(self, *arg, **kwargs)

        if setup_rpc:
            rpc.shutdown(graceful=clean_shutdown)

        return return_value
Example #27
def run_worker(rank, world_size):
    r"""
    This is the entry point for all processes. Rank 0 is the agent; all
    other ranks are observers.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size)
        for i_episode in count(1):
            n_steps = int(TOTAL_EPISODE_STEP / (world_size - 1))
            agent.run_episode(n_steps=n_steps)
            last_reward = agent.finish_episode()

            if i_episode % args.log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, last_reward, agent.running_reward))

            if agent.running_reward > agent.reward_threshold:
                print("Solved! Running reward is now {}!".format(agent.running_reward))
                break
    else:
        # other ranks are observers
        rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)
        # observers passively wait for instructions from the agent
    rpc.shutdown()
Example #28
    def _test_rref_leak(self, ignore_leak=False):
        rpc.init_rpc(
            name="worker{}".format(self.rank),
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

        # This is for the below `dist.barrier`.
        # For `RpcAgent` other than `ProcessGroupAgent`,
        # no `_default_pg` is initialized.
        if not dist.is_initialized():
            dist.init_process_group(
                backend="gloo",
                init_method=self.init_method,
                rank=self.rank,
                world_size=self.world_size,
            )
        # Wait for all init to complete.
        dist.barrier()

        rref = rpc.remote("worker{}".format((self.rank + 1) % self.world_size),
                          torch.add,
                          args=(torch.ones(2, 2), 1))

        if ignore_leak:
            import torch.distributed.rpc.api as api
            api._ignore_rref_leak = True

        rpc.shutdown()
Example #29
    def destroy_comms(self, destroy_rpc=True):
        # Wait for all ranks to reach here before starting shutdown.
        dist.barrier()

        if destroy_rpc:
            rpc.shutdown()
        dist.destroy_process_group()
Example #30
def run(i):
    global rank
    rank = i
    if i == world_size - 1:
        time.sleep(4)
        print("Process {} delayed start.".format(i))
    rpc.init_rpc("Rank" + str(i),
                 rank=i,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=2),
                     num_send_recv_threads=4))
    if i == 0:
        time.sleep(2)
        print("Process 0 exit.")
        exit(-1)
    t = time.time()
    reqs = []
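    # `messages` and `world_size` are assumed to be module-level globals set elsewhere.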
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
        for req, idx in zip(reqs, range(world_size)):
            try:
                print("{} Received from {} : {}".format(rank, idx, req.wait()))
            except RuntimeError:
                print("An error ocurred while {} receiving results from {}".
                      format(rank, idx))
        reqs.clear()
    print(time.time() - t)
    rpc.shutdown(graceful=False)