def run_benchmark(rank, model, data, config):
    r"""Initialize RPC for this process and run its benchmark role.

    Role is derived from ``rank``: the highest rank is the master,
    ranks in [trainer_count, trainer_count + ps_count) are parameter
    servers, and lower ranks are trainers.

    Args:
        rank (int): this process's position in the RPC world
        model: model being benchmarked (forwarded to the master)
        data: training samples (forwarded to the master)
        config: benchmark configuration holding process counts,
            master address/port, and RPC settings
    """
    world_size = config.trainer_count + config.ps_count + 1
    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port

    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method

    if rank == world_size - 1:
        # master = [trainer_count + parameter_server_count, trainer_count + parameter_server_count]
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(
            get_name(rank, config),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        # CUDA RPC needs a device map only when both sides opt in and a
        # parameter server actually exists.
        cuda_rpc_enabled = (
            USE_CUDA_RPC in trainer_config
            and trainer_config[USE_CUDA_RPC]
            and USE_CUDA_RPC in ps_config
            and ps_config[USE_CUDA_RPC]
            and config.ps_count > 0
        )
        if cuda_rpc_enabled:
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(ps_rank, config)
            rpc_backend_options.set_device_map(ps_name, {rank: ps_rank})
        trainer_name = get_name(rank, config)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    # Block until every process in the RPC group has finished.
    rpc.shutdown()
def run_benchmark(rank, model, data, args, config):
    r"""Seed RNGs, initialize RPC for this process, and run its role.

    The last rank is the master; ranks in
    [ntrainer + ncudatrainer, world_size - 1) are parameter servers;
    ranks below ntrainer + ncudatrainer are trainers.

    Args:
        rank (int): this process's position in the RPC world
        model: model being benchmarked (forwarded to the master)
        data: training samples (forwarded to the master)
        args: parsed command-line arguments (process counts, seeds,
            master address/port, RPC timeout)
        config: extra benchmark configuration (forwarded to the master)
    """
    # Seed every RNG source up front so runs are reproducible.
    torch.manual_seed(args.torch_seed)
    torch.cuda.manual_seed_all(args.cuda_seed)
    # NOTE(review): cudnn.benchmark=True (autotuning) and
    # cudnn.deterministic=True are usually mutually exclusive goals —
    # confirm which one this benchmark actually wants.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    world_size = (
        args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    )
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions(
        rpc_timeout=args.rpc_timeout
    )

    if rank == world_size - 1:
        # master = [ntrainer + ncudatrainer + nserver + ncudaserver, ntrainer + ncudatrainer + nserver + ncudaserver]
        run_master(rank, model, data, args, config, rpc_backend_options)
    elif rank >= args.ntrainer + args.ncudatrainer:
        # parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
        rpc.init_rpc(
            get_name(rank, args),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    else:
        # trainers = [0, ntrainer + ncudatrainer)
        if rank >= args.ntrainer:
            # CUDA trainers talk to CUDA servers; map this trainer's
            # device onto its assigned server for CUDA-aware RPC.
            server_rank = get_cuda_server_rank(args, rank)
            server_name = get_name(server_rank, args)
            rpc_backend_options.set_device_map(server_name, {rank: server_rank})
        trainer_name = get_name(rank, args)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    # Block until every process in the RPC group has finished.
    rpc.shutdown()
def run_benchmark(rank, args, data):
    r"""
    A function that runs the benchmark.
    Args:
        rank (int): process number in the world
        args (parser): configuration args
        data (list): training samples
    """
    config = load_extra_configs(args)

    # Seed every RNG source up front so runs are reproducible.
    torch.manual_seed(args.torch_seed)
    torch.cuda.manual_seed_all(args.cuda_seed)
    # NOTE(review): cudnn.benchmark=True (autotuning) and
    # cudnn.deterministic=True are usually mutually exclusive goals —
    # confirm which one this benchmark actually wants.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    world_size = (
        args.ntrainer + args.ncudatrainer + args.nserver + args.ncudaserver + 1
    )
    os.environ['MASTER_ADDR'] = args.master_addr
    os.environ['MASTER_PORT'] = args.master_port
    rpc_backend_options = TensorPipeRpcBackendOptions(rpc_timeout=args.rpc_timeout)

    if rank == world_size - 1:
        # master = [ntrainer + ncudatrainer + nserver + ncudaserver, ntrainer + ncudatrainer + nserver + ncudaserver]
        run_master(rank, data, args, config, rpc_backend_options)
    elif rank >= args.ntrainer + args.ncudatrainer:
        # parameter_servers = [ntrainer + ncudatrainer, ntrainer + ncudatrainer + nserver + ncudaserver)
        rpc.init_rpc(
            get_name(rank, args),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    else:
        # trainers = [0, ntrainer + ncudatrainer)
        if rank >= args.ntrainer:
            # CUDA trainers talk to CUDA servers; map this trainer's
            # device onto its assigned server for CUDA-aware RPC.
            server_rank = get_cuda_server_rank(args, rank)
            server_name = get_name(server_rank, args)
            rpc_backend_options.set_device_map(server_name, {rank: server_rank})
        trainer_name = get_name(rank, args)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
    # Block until every process in the RPC group has finished.
    rpc.shutdown()
def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and
    shuts down RPC.

    Rank layout: 0 and 1 are trainers, 2 is the master, any other rank
    is the parameter server.

    Args:
        rank (int): this process's position in the RPC world
        world_size (int): total number of RPC processes
    """
    # We need to use different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29501"

    # Rank 2 is master, 3 is ps and 0 and 1 are trainers.
    if rank == 2:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Build the embedding table on the ps.
        emb_rref = rpc.remote(
            "ps",
            torch.nn.EmbeddingBag,
            args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
            kwargs={"mode": "sum"},
        )

        # Run the training loop on trainers.
        futs = []
        for trainer_rank in [0, 1]:
            trainer_name = "trainer{}".format(trainer_rank)
            # BUGFIX: pass the trainer's own rank, not the master's rank
            # (the old code sent `rank` == 2 to every trainer).
            fut = rpc.rpc_async(
                trainer_name, _run_trainer, args=(emb_rref, trainer_rank)
            )
            futs.append(fut)

        # Wait for all training to finish.
        for fut in futs:
            fut.wait()
    elif rank <= 1:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(
            backend="gloo",
            rank=rank,
            world_size=2,
            init_method="tcp://localhost:29500",
        )

        # Initialize RPC.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
        # Trainer just waits for RPCs from master.
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )
        # parameter server do nothing
        pass

    # block until all rpcs finish
    rpc.shutdown()
def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and
    shuts down RPC.

    Rank layout: [0, NUM_TRAINERS) are trainers,
    [NUM_TRAINERS, NUM_TRAINERS + NUM_PS) are parameter servers, and
    NUM_TRAINERS + NUM_PS is the master.

    Args:
        rank (int): this process's position in the RPC world
        world_size (int): total number of RPC processes
    """
    # Using different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29500"

    # Rank 16. Master
    if rank == (NUM_TRAINERS + NUM_PS):
        # NOTE(review): rpc_backend_options is not passed here, so the
        # master ignores the TCP init_method configured above — confirm
        # whether that is intentional.
        rpc.init_rpc(
            "master",
            rank=rank,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            world_size=world_size)

        # Build the Embedding tables on the Parameter Servers.
        emb_rref_list = []
        index = 0
        while index < NUM_PS:
            ps_name = "ps{}".format(index)
            emb_rref = rpc.remote(
                ps_name,
                torch.nn.EmbeddingBag,
                args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
                kwargs={"mode": "sum"},
            )
            emb_rref_list.append(emb_rref)
            index += 1

        # Run training loop on the trainers.
        futs = []
        for trainer_rank in range(NUM_TRAINERS):
            trainer_name = "trainer{}".format(trainer_rank)
            fut = rpc.rpc_async(
                trainer_name, _run_trainer, args=(emb_rref_list, trainer_rank)
            )
            futs.append(fut)

        _print_header()

        measurements_all_trainers = []
        batch_size_all_trainers = 0
        # Wait for all training to finish.
        for fut in futs:
            # Renamed from `rank` to avoid shadowing this function's
            # `rank` parameter.
            worker_rank, measurements, batch_size = fut.wait()
            _print_benchmark("Trainer{}".format(worker_rank), batch_size, measurements)
            batch_size_all_trainers += batch_size
            measurements_all_trainers.append(measurements)

        _print_benchmark("All", batch_size_all_trainers, measurements_all_trainers)

    # Rank 0-7. Trainers
    # BUGFIX: the trainer branch previously tested `rank < NUM_PS`, which
    # only matched the intended [0, NUM_TRAINERS) range by coincidence
    # when NUM_TRAINERS == NUM_PS. Use NUM_TRAINERS, consistent with the
    # parameter-server branch's lower bound below.
    elif rank >= 0 and rank < NUM_TRAINERS:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(
            backend=dist.Backend.GLOO,
            rank=rank,
            world_size=NUM_TRAINERS,
            init_method="tcp://localhost:29501",
        )

        # Initialize RPC. Trainer just waits for RPCs from master.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

    # Rank 8-15. Parameter Servers
    elif rank >= NUM_TRAINERS and rank < NUM_TRAINERS + NUM_PS:
        ps_name = "ps{}".format(rank - NUM_TRAINERS)
        rpc.init_rpc(
            ps_name,
            rank=rank,
            world_size=world_size,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            rpc_backend_options=rpc_backend_options,
        )
        # parameter server do nothing
        pass

    # block until all rpcs finish
    rpc.shutdown()