def rpc_worker(rank, world_size, init_file, func, *args):
    if torch_version() == (1, 8, 0):
        if torch.cuda.is_available():
            # Workaround for https://github.com/pytorch/pytorch/issues/53844
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file, _transports=["ibv", "uv"])
        else:
            # Workaround for https://github.com/pytorch/pytorch/issues/54266
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file,
                _channels=["mpt_uv", "basic", "cuda_ipc", "cuda_gdr",
                           "cuda_xth", "cuda_basic"],
            )
    else:
        options = rpc.TensorPipeRpcBackendOptions(init_method="file://" + init_file)
    rpc.init_rpc(
        "worker" + str(rank),
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=options,
    )
    if rank == 0:
        func(*args)
    rpc.shutdown()
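# Note: several snippets in this file call a torch_version() helper that is
# not defined here. The sketch below is a hypothetical, minimal implementation
# (assuming release-style version strings such as "1.8.0" or "1.8.0+cu111"),
# not the source's actual helper.
import torch

def torch_version():
    # Keep only the numeric release part ("1.8.0+cu111" -> "1.8.0") and
    # return it as a tuple of ints, e.g. (1, 8, 0), for easy comparison.
    release = torch.__version__.split("+")[0]
    return tuple(int(part) for part in release.split(".")[:3])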
def run_worker(rank, world_size, num_split):
    # os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_ADDR'] = '172.10.0.2'
    # os.environ['MASTER_PORT'] = '29500'
    os.environ['MASTER_PORT'] = '12345'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256)

    if rank == 0:
        print("Init master")
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        print(rank)
        run_master(num_split)
    else:
        print("init worker rank ", rank)
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    """
    # source: https://pytorch.org/tutorials/intermediate/dist_tuto.html
    dist.init_process_group(
        init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4)
    """

    # block until all rpcs finish
    rpc.shutdown()
def import_pipe():
    # Declare the globals up front: a `global` statement must precede the
    # first binding of the name in this scope, including `from ... import`.
    global TORCH_PIPE
    global RPC_INIT
    global Pipe
    global partition_model
    try:
        from torch.distributed.pipeline.sync import Pipe  # noqa
        from torch.distributed.pipeline.sync.utils import partition_model
        from torch.distributed import rpc
        import tempfile

        TORCH_PIPE = True
        # Initialize a single-process RPC agent: TORCH_PIPE requires RRef,
        # and RRef depends on RPC being initialized, so we initialize RPC
        # with a single node.
        tmpfile = tempfile.NamedTemporaryFile()
        if not RPC_INIT:
            rpc.init_rpc(
                name="worker",
                rank=0,
                world_size=1,
                rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                    init_method="file://{}".format(tmpfile.name),
                ),
            )
            RPC_INIT = True
        logger.info('Using torch pipe')
    except ImportError:
        try:
            from fairscale.nn import Pipe  # noqa
            logger.info('Using fairscale pipe')
        except ImportError:
            raise ImportError("Please install fairscale with: pip install fairscale")
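# Once import_pipe() has run, RPC is initialized with a single worker and
# Pipe can wrap an nn.Sequential whose stages already live on their target
# devices. A hypothetical usage sketch (layer sizes and devices are
# illustrative, not from the source):
import torch
import torch.nn as nn

def pipe_usage_sketch():
    fc1 = nn.Linear(16, 32).to('cuda:0')  # first pipeline stage
    fc2 = nn.Linear(32, 8).to('cuda:1')   # second pipeline stage
    model = Pipe(nn.Sequential(fc1, fc2), chunks=4)  # 4 micro-batches

    x = torch.randn(64, 16, device='cuda:0')
    # torch's Pipe returns an RRef; since RPC was initialized with a single
    # worker, the value is local. (fairscale's Pipe returns a plain tensor.)
    out = model(x).local_value()
    return out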
def run_worker(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256,
                                              rpc_timeout=600)

    import psutil
    p = psutil.Process()
    if rank == 0:
        p.cpu_affinity([0])
        print(f"Child #{rank}: Set my affinity to {rank}, affinity now {p.cpu_affinity()}",
              flush=True)
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        run_master()
    else:
        p.cpu_affinity([rank - 1])
        print(f"Child #{rank}: Set my affinity to {rank}, affinity now {p.cpu_affinity()}",
              flush=True)
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    # block until all rpcs finish
    rpc.shutdown()
def test_init_pg_and_rpc_with_same_socket(self):
    addr = DEFAULT_HOSTNAME
    port = common.find_free_port()

    os.environ["MASTER_ADDR"] = addr
    os.environ["MASTER_PORT"] = str(port)

    # We internally use a multi-tenant TCP store. Both PG and RPC should
    # successfully initialize even when using the same socket address.
    dist.init_process_group(
        backend="gloo",
        init_method="env://",
        rank=0,
        world_size=1,
    )

    backend_opts = rpc.TensorPipeRpcBackendOptions(
        init_method=f"tcp://{addr}:{port}")
    rpc.init_rpc(
        name="worker0",
        rank=0,
        world_size=1,
        rpc_backend_options=backend_opts,
    )

    rpc.shutdown()
def run(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=30,
        rpc_timeout=0  # infinite timeout
    )
    if rank != 0:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_ps([f"trainer{r}" for r in range(1, world_size)])

    # block until all rpcs finish
    rpc.shutdown()
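# run_ps is not shown above. A hypothetical sketch of what it might do,
# assuming a run_one_iteration function defined on the trainers: kick off
# an iteration on every trainer and block until all of them report back.
def run_ps(trainers):
    futs = []
    for trainer in trainers:
        futs.append(rpc.rpc_async(trainer, run_one_iteration, args=()))
    torch.futures.wait_all(futs)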
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256,
                                              rpc_timeout=600)

    import psutil
    p = psutil.Process()
    if rank == 0:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_master(num_split)
    else:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )

    # block until all rpcs finish
    rpc.shutdown()
def start(args):
    rpc.init_rpc(args.name,
                 rank=args.rank,
                 world_size=args.world_size,
                 rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                     num_worker_threads=args.workers))
    rpc.shutdown()
def dist_init(rank: int, world_size: int, filename: str, filename_rpc: str = "") -> bool:
    """
    Initialize torch distributed, based on a temporary file shared
    across ranks, which makes it possible for unrelated tests to be run
    concurrently.

    Returns False if there are not enough GPUs present in the system.

    .. warning: This limits the usecase to all ranks being on the same node
    """
    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    url = "file://" + filename

    if torch_version() >= (1, 6, 0):
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning(
                "Requested world size cannot be reached on this machine, not enough GPUs")
            return False

        torch.distributed.init_process_group(backend=backend,
                                             rank=rank,
                                             world_size=world_size,
                                             init_method=url)

        url_rpc = "file://" + filename_rpc
        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(init_method=url_rpc),
        )
    else:
        if world_size > 1:
            rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl",
                                                 rank=rank,
                                                 world_size=world_size,
                                                 init_method=url)
        else:
            return False

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True
def setup_rpc(scope="session"):
    file = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker0",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(file.name),
        ),
    )
    yield
    rpc.shutdown()
def init_rpc():
    os.environ["MASTER_PORT"] = "10639"
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
    rpc.init_rpc(
        f"Test{torch.distributed.get_rank()}",
        rank=torch.distributed.get_rank(),
        world_size=torch.distributed.get_world_size(),
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(init_method=init_method),
    )
def init_rpc(self):
    rpc_backend_options = rpc.TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = f"file://{self.file_name}"
    for rank in range(self.world_size):
        rpc_backend_options.set_device_map(f'worker{rank}',
                                           {rank: self.rank, self.rank: rank})
    rpc.init_rpc(
        name="worker%d" % self.rank,
        rank=self.rank,
        world_size=self.world_size,
        rpc_backend_options=rpc_backend_options,
    )
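# With the device maps above in place, TensorPipe can move CUDA tensors
# directly between the mapped devices, so callers never copy through the
# CPU by hand. An illustrative example (not from the source) for a caller
# on rank 0: given set_device_map('worker1', {0: 1, 1: 0}), a tensor on
# the caller's cuda:0 arrives on worker1's cuda:1, and the result returns
# to the caller's cuda:0.
def device_map_example():
    x = torch.ones(2, 2, device='cuda:0')
    y = rpc.rpc_sync('worker1', torch.add, args=(x, 1))
    assert y.device == torch.device('cuda:0')
    return y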
def run(rank, num_workers, data_dir, model, batch_size, test_batch_size, lr,
        num_epochs, job_name, target_loss):
    logging.basicConfig(level=logging.INFO)
    world_size = num_workers + 2
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=16, rpc_timeout=0)

    if rank == 0:
        logging.info(f"PS{rank} initializing")
        rpc.init_rpc(f"PS{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        logging.info(f"PS{rank} initialized")

        workers = [f"worker{r}" for r in range(1, world_size - 1)]
        ps_rref = rpc.RRef(ParameterServer(model, num_workers, lr, job_name))

        futs = []
        futs.append(
            rpc.rpc_async(to="tester", func=get_accuracy,
                          args=(ps_rref, data_dir, test_batch_size, job_name,
                                target_loss)))
        for worker in workers:
            futs.append(
                rpc.rpc_async(to=worker, func=run_worker,
                              args=(ps_rref, data_dir, batch_size, num_epochs,
                                    worker, job_name)))

        torch.futures.wait_all(futs)
        logging.info("Finished training")
    elif rank == world_size - 1:
        logging.info("Tester initializing")
        rpc.init_rpc("tester", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        logging.info("Tester initialized")
    else:
        logging.info(f"Worker{rank} initializing")
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        logging.info(f"Worker{rank} initialized")

    rpc.shutdown()
def rpc_worker(rank, world_size, init_file, func, *args):
    options = rpc.TensorPipeRpcBackendOptions(init_method="file://" + init_file)
    for i in range(world_size):
        options.set_device_map("worker" + str(i), {rank: i})
    rpc.init_rpc(
        "worker" + str(rank),
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=options,
    )
    if rank == 0:
        func(*args)
    rpc.shutdown()
def rpc_worker(rank, world_size, batch_size, microbatch_size):
    options = rpc.TensorPipeRpcBackendOptions(_transports=["ibv", "uv"])
    logging.info(f"Initializing worker{rank}")
    rpc.init_rpc(
        "worker" + str(rank),
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=options,
    )
    if rank == 0:
        run_master(world_size, batch_size, microbatch_size)
    rpc.shutdown()
def init_procs(rank, world_size, tr_args=DEFAULTS):
    # DDP info
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '42069'

    # RPC info
    rpc_backend_options = rpc.TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = 'tcp://localhost:42068'

    # Master (RNN module)
    if rank == world_size - 1:
        # Master gets 16 threads and 4x4 threaded workers.
        # In theory, only 16 threads should run at a time while
        # master sleeps, waiting on worker procs
        torch.set_num_threads(M_THREADS)
        # torch.set_num_threads(16)

        rpc.init_rpc('master', rank=rank, world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

        rrefs = init_workers(world_size - 1, tr_args['h_size'],
                             TR_START, TR_END, DELTA, False)

        model, zs, h0 = train(rrefs, tr_args)
        get_cutoff(model, h0, tr_args)
        test(model, zs, h0, rrefs)

    # Slaves
    else:
        # If there are 4 workers, give them each 4 threads
        # (total 16 is equal to serial model)
        torch.set_num_threads(W_THREADS)

        # Slaves are their own process group. This allows
        # DDP to work between these processes
        dist.init_process_group('gloo', rank=rank, world_size=world_size - 1)
        rpc.init_rpc('worker' + str(rank), rank=rank, world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

    # Block until all procs complete
    rpc.shutdown()
def test_invalid_pg_rpc_ranks(self):
    self.init_pg()

    # Init RPC with different ranks.
    rpc_backend_options = rpc.TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = f"file://{self.file_name}"
    rank = (self.rank + 1) % self.world_size
    rpc.init_rpc(
        name=f'worker{rank}',
        rank=rank,
        world_size=self.world_size,
        rpc_backend_options=rpc_backend_options,
    )

    spec = ChunkShardingSpec(dim=0, placements=["rank:1/cuda:1"])
    with self.assertRaisesRegex(
            ValueError, 'Default ProcessGroup and RPC ranks must be the same'):
        _sharded_tensor.empty(spec, 10, 20)
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
    local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[local_rank]

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10638"
    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name

    torch.distributed.init_process_group(backend="gloo",
                                         rank=guess_rank,
                                         world_size=world_size)

    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"
    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            rpc_timeout=20, init_method=init_method),
    )

    backends = {
        "model_parallel_backend": "nccl",
        "pipeline_backend": "mpi",
        "ddp_backend": "nccl",
    }
    initialize_model_parallel(1, world_size, **backends)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
def run(arg, data):
    name = arg.name
    remote = arg.remote
    rpc.init_rpc(name,
                 rank=arg.rank,
                 world_size=arg.world_size,
                 rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                     num_worker_threads=arg.workers))

    result = rpc.rpc_sync(remote, collect, args=(data, 2))

    async_result = []
    t1 = time.time()
    for i in range(arg.workers):
        async_result.append(rpc.rpc_async(remote, collect, args=(data, 2)))
    for i in range(arg.workers):
        async_result[i].wait()
    t2 = time.time()

    print('RTT of {} tensor is {}'.format(result.shape, t2 - t1))
    rpc.shutdown()
def run_remote(config_path: Path, rank: int, nic=None, host=None, prefix: str = None):
    print(config_path, rank)
    config = Config.FromYamlFile(config_path)
    config.world_size = config.num_clients + 1
    config.replication_id = prefix
    nic, host = retrieve_network_params_from_config(config, nic, host)
    if not nic or not host:
        print('Missing rank, host, world-size, or nic argument when in \'remote\' mode!')
        parser.print_help()
        exit(1)
    retrieve_env_params(nic, host)
    print(f'Starting with host={os.environ["MASTER_ADDR"]} and '
          f'port={os.environ["MASTER_PORT"]} and interface={nic}')
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=16,
        rpc_timeout=0,  # infinite timeout
        # init_method=f'tcp://{os.environ["MASTER_ADDR"]}:{os.environ["MASTER_PORT"]}'
        init_method='env://',
        _transports=["uv"]
    )
    if rank != 0:
        print(f'Starting worker {rank} with world size={config.world_size}')
        rpc.init_rpc(
            f"client{rank}",
            rank=rank,
            world_size=config.world_size,
            rpc_backend_options=options,
        )
        client_node = Client(f'client{rank}', rank, config.world_size, config)
        client_node.remote_registration()
    else:
        print(f'Starting the ps with world size={config.world_size}')
        rpc.init_rpc(
            "federator",
            rank=rank,
            world_size=config.world_size,
            rpc_backend_options=options
        )
        federator_node = Federator('federator', 0, config.world_size, config)
        federator_node.run()
        federator_node.stop_all_clients()
    print('Ending program')
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256)
    if rank == 0:
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        run_master(num_split)
    else:
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    # block until all rpcs finish
    rpc.shutdown()
def run_single(rank, world_size, host=None, args=None, nic=None):
    logging.info(f'Starting with rank={rank} and world size={world_size}')
    if host:
        os.environ['MASTER_ADDR'] = host
    else:
        os.environ['MASTER_ADDR'] = '0.0.0.0'
    os.environ['MASTER_PORT'] = '5000'
    if nic:
        os.environ['GLOO_SOCKET_IFNAME'] = nic
        os.environ['TP_SOCKET_IFNAME'] = nic
    else:
        os.environ['GLOO_SOCKET_IFNAME'] = 'wlo1'
        os.environ['TP_SOCKET_IFNAME'] = 'wlo1'
    logging.info(f'Starting with host={os.environ["MASTER_ADDR"]} and '
                 f'port={os.environ["MASTER_PORT"]}')
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=16,
        rpc_timeout=0,  # infinite timeout
        init_method=f'tcp://{os.environ["MASTER_ADDR"]}:{os.environ["MASTER_PORT"]}'
    )

    if rank != 0:
        logging.info(f'Starting worker {rank}')
        rpc.init_rpc(
            f"client{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options,
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        logging.info('Starting the ps')
        rpc.init_rpc("ps", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        run_ps([(f"client{r}", r, world_size) for r in range(1, world_size)], args)

    # block until all rpcs finish
    rpc.shutdown()
def dist_init(rank: int, world_size: int, hostname: Optional[str] = None) -> None:
    if hostname is None:
        hostname = "localhost"
    print(f"dist init r={rank}, world={world_size}, host={hostname}")
    os.environ["MASTER_ADDR"] = hostname
    os.environ["MASTER_PORT"] = "10638"
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)

    if torch_version() >= (1, 6, 0):
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        torch.distributed.init_process_group(backend=backend,
                                             rank=rank,
                                             world_size=world_size,
                                             init_method=init_method)

        os.environ["MASTER_ADDR"] = hostname
        os.environ["MASTER_PORT"] = "10639"
        init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"
        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(init_method=init_method),
        )
    else:
        if world_size > 1:
            rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)
        else:
            torch.distributed.init_process_group(backend="nccl",
                                                 rank=rank,
                                                 world_size=world_size)

    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())
def run_worker(rank, world_size):
    # DDP info
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '42069'

    # RPC info
    rpc_backend_options = rpc.TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = 'tcp://localhost:42068'

    # Master
    if rank == world_size - 1:
        rpc.init_rpc('master', rank=rank, world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

        X_tr, y_tr = gen_toy_data()
        X_te, y_te = gen_toy_data()

        rrefs = []
        for i in range(world_size - 1):
            rrefs.append(
                rpc.remote('worker' + str(i), init_embedder,
                           args=(X_tr.size(2), 10)))

        train_loop(rrefs, X_tr, y_tr, X_te, y_te)

    # Slaves
    else:
        # Slaves are their own process group. This allows
        # DDP to work between these processes
        dist.init_process_group('gloo', rank=rank, world_size=world_size - 1)
        rpc.init_rpc('worker' + str(rank), rank=rank, world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

    # Block until all procs complete
    rpc.shutdown()
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'

    # Higher timeout is added to accommodate for kernel compilation time in case of ROCm.
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256,
                                              rpc_timeout=300)
    if rank == 0:
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        run_master(num_split)
    else:
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    # block until all rpcs finish
    rpc.shutdown()
def _init_torch_rpc_tp(
    self,
    master_addr,
    master_port,
    worker_idx,
    worker_num,
):
    # https://github.com/pytorch/pytorch/issues/55615
    # [BC-Breaking][RFC] Retire ProcessGroup Backend for RPC #55615
    str_init_method = "tcp://" + str(master_addr) + ":10000"
    logging.info("str_init_method = {}".format(str_init_method))
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=16,
                                              rpc_timeout=1800,
                                              init_method=str_init_method,
                                              _transports=["uv"])
    rpc.init_rpc(
        WORKER.format(worker_idx),
        backend=rpc.BackendType.TENSORPIPE,
        rank=worker_idx,
        world_size=worker_num,
        rpc_backend_options=options,
    )
    logging.info("_init_torch_rpc_tp finished.")
def init_procs(rank, world_size, rnn_constructor, rnn_args, worker_constructor,
               worker_args, times, just_test, fw, static, single_emb,
               run_speed_test, load_fn, manual, tr_args=DEFAULTS):
    # DDP info
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '42069'

    # RPC info
    rpc_backend_options = rpc.TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = 'tcp://localhost:42068'

    # This is a lot easier than actually changing it in all the methods
    # at this point
    global LOAD_FN
    LOAD_FN = load_fn

    # Master (RNN module)
    if rank == world_size - 1:
        torch.set_num_threads(M_THREADS)
        rpc.init_rpc(
            'master', rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )

        # Speed test doesn't need to train the model,
        # just use the random state it starts with and check
        # how long it takes to run, then return that data
        if run_speed_test:
            rrefs = init_empty_workers(
                world_size - 1,
                worker_constructor,
                worker_args
            )
            rnn = rnn_constructor(*rnn_args)
            model = StaticRecurrent(rnn, rrefs) if static \
                else DynamicRecurrent(rnn, rrefs)
            stats = speed_test(model, rrefs, times)
            stats['delta'] = times['delta']

        # Evaluating a pre-trained model, so no need to train
        elif just_test:
            rrefs = init_empty_workers(
                world_size - 1,
                worker_constructor,
                worker_args
            )
            rnn = rnn_constructor(*rnn_args)
            model = StaticRecurrent(rnn, rrefs) if static \
                else DynamicRecurrent(rnn, rrefs)
            states = pickle.load(open('model_save.pkl', 'rb'))
            model.load_states(states['gcn'], states['rnn'])
            h0 = states['h0']
            tpe = 0

        # Building and training a fresh model
        else:
            rrefs = init_workers(
                world_size - 1,
                times['tr_start'], times['tr_end'],
                times['delta'], False,
                worker_constructor, worker_args
            )
            model, h0, tpe = train(rrefs, tr_args, rnn_constructor, rnn_args, static)

        if not run_speed_test:
            h0, zs = get_cutoff(model, h0, times, tr_args, fw)
            if single_emb:
                stats = test_single_embed(zs, rrefs, times, model.cutoff)
            else:
                stats = test(model, h0, times, rrefs, manual=manual)
            stats['TPE'] = tpe

    # Slaves
    else:
        torch.set_num_threads(W_THREADS)

        # Slaves are their own process group. This allows
        # DDP to work between these processes
        dist.init_process_group(
            'gloo', rank=rank,
            world_size=world_size - 1
        )

        rpc.init_rpc(
            'worker' + str(rank),
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options
        )

    # Block until all procs complete
    rpc.shutdown()

    # Write output to a tmp file to get it back to the parent process
    if rank == world_size - 1:
        pickle.dump(stats, open(TMP_FILE, 'wb+'),
                    protocol=pickle.HIGHEST_PROTOCOL)
def __init__(
    self,
    name: str,
    rank: int = -1,
    world_size: int = -1,
    init_dist: bool = True,
    init_rpc: bool = True,
    dist_backend: str = "gloo",
    dist_init_method: str = "tcp://localhost:9100",
    rpc_init_method: str = "tcp://localhost:9101",
    dist_timeout: float = 60,
    rpc_timeout: float = 60,
):
    """
    Args:
        name: A unique name to identify the current process.
        rank: A unique rank of the current process. You do not need to specify it
            if you are using `torch.distributed.launch` or `torchelastic`.
        world_size: Size of the distributed world. You do not need to specify it
            if you are using `torch.distributed.launch` or `torchelastic`.
        dist_timeout: Distributed package timeout in seconds.
        rpc_timeout: Global rpc call timeout in seconds.
    """
    self.world_size = world_size
    self.rank = rank
    self.name = name
    self.groups = {}
    self.group_create_signals = {}

    if init_dist:
        dist.init_process_group(
            backend=dist_backend,
            init_method=dist_init_method,
            timeout=timedelta(seconds=dist_timeout),
            rank=rank,
            world_size=world_size,
        )
    if init_rpc:
        rpc.init_rpc(
            self.name,
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
                init_method=rpc_init_method, rpc_timeout=rpc_timeout
            ),
        )

    # get rank-name mapping
    self.rank_name_map = {}
    for wi in rpc._get_current_rpc_agent().get_worker_infos():
        self.rank_name_map[wi.id] = wi.name

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # map for paired values and registered services
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()

    self.lut_manager = self.rank_name_map[0]
nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 12  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 16  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value

from torch.distributed import rpc
tmpfile = tempfile.NamedTemporaryFile()
rpc.init_rpc(
    name="worker",
    rank=0,
    world_size=1,
    rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
        init_method="file://{}".format(tmpfile.name),
        # Specifying _transports and _channels is a workaround; for PyTorch
        # versions >= 1.8.1 we will no longer have to specify _transports
        # and _channels.
        _transports=["ibv", "uv"],
        _channels=["cuda_ipc", "cuda_basic"],
    )
)

num_gpus = 2
partition_len = ((nlayers - 1) // num_gpus) + 1

# Add encoder in the beginning.
tmp_list = [Encoder(ntokens, emsize, dropout).cuda(0)]
module_list = []

# Add all the necessary transformer blocks.
for i in range(nlayers):
    transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
    if i != 0 and i % (partition_len) == 0:
def run_worker(rank, world_size):

######################################################################
# Load and batch data
# -------------------
#

######################################################################
# The training process uses the Wikitext-2 dataset from ``torchtext``. The
# vocab object is built based on the train dataset and is used to numericalize
# tokens into tensors. Starting from sequential data, the ``batchify()``
# function arranges the dataset into columns, trimming off any tokens remaining
# after the data has been divided into batches of size ``batch_size``.
# For instance, with the alphabet as the sequence (total length of 26)
# and a batch size of 4, we would divide the alphabet into 4 sequences of
# length 6:
#
# .. math::
#   \begin{bmatrix}
#   \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
#   \end{bmatrix}
#   \Rightarrow
#   \begin{bmatrix}
#   \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
#   \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
#   \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
#   \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
#   \end{bmatrix}
#
# These columns are treated as independent by the model, which means that
# the dependence of ``G`` and ``F`` can not be learned, but allows more
# efficient batch processing.
#

# In 'run_worker'
    def print_with_rank(msg):
        print('[RANK {}]: {}'.format(rank, msg))

    from torchtext.datasets import WikiText2
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    train_iter = WikiText2(split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    def data_process(raw_text_iter):
        data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)
                for item in raw_text_iter]
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    train_iter, val_iter, test_iter = WikiText2()
    train_data = data_process(train_iter)
    val_data = data_process(val_iter)
    test_data = data_process(test_iter)

    device = torch.device(2 * rank)

    def batchify(data, bsz, rank, world_size, is_train=False):
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        # Divide the data across the ranks only for training data.
        if is_train:
            data_per_rank = data.size(0) // world_size
            data = data[rank * data_per_rank:(rank + 1) * data_per_rank]
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, rank, world_size, True)
    val_data = batchify(val_data, eval_batch_size, rank, world_size)
    test_data = batchify(test_data, eval_batch_size, rank, world_size)

######################################################################
# Functions to generate input and target sequence
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#

######################################################################
# The ``get_batch()`` function generates the input and target sequence for
# the transformer model. It subdivides the source data into chunks of
# length ``bptt``.
# For the language modeling task, the model needs the
# following words as ``Target``. For example, with a ``bptt`` value of 2,
# we'd get the following two Variables for ``i`` = 0:
#
# .. image:: ../_static/img/transformer_input_target.png
#
# It should be noted that the chunks are along dimension 0, consistent
# with the ``S`` dimension in the Transformer model. The batch dimension
# ``N`` is along dimension 1.
#

# In 'run_worker'
    bptt = 35

    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        # Need batch dimension first for pipeline parallelism.
        return data.t(), target

######################################################################
# Model scale and Pipe initialization
# -----------------------------------
#

######################################################################
# To demonstrate training large Transformer models using pipeline parallelism,
# we scale up the Transformer layers appropriately. We use an embedding
# dimension of 4096, hidden size of 4096, 16 attention heads and 8 total
# transformer layers (``nn.TransformerEncoderLayer``). This creates a model with
# **~1 billion** parameters.
#
# We need to initialize the `RPC Framework <https://pytorch.org/docs/stable/rpc.html>`__
# since Pipe depends on the RPC framework via `RRef <https://pytorch.org/docs/stable/rpc.html#rref>`__,
# which allows for future expansion to cross-host pipelining. We need to
# initialize the RPC framework with only a single worker since we're using a
# single process to drive multiple GPUs.
#
# The pipeline is then initialized with 8 transformer layers on one GPU and 8
# transformer layers on the other GPU. One pipe is set up across GPUs 0 and 1
# and another across GPUs 2 and 3. Both pipes are then replicated using
# DistributedDataParallel.

# In 'run_worker'
    ntokens = len(vocab)  # the size of vocabulary
    emsize = 4096  # embedding dimension
    nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 8  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 16  # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value

    from torch.distributed import rpc
    tmpfile = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(tmpfile.name),
            # Specifying _transports and _channels is a workaround; for
            # PyTorch versions >= 1.8.1 we will no longer have to specify
            # _transports and _channels.
            _transports=["ibv", "uv"],
            _channels=["cuda_ipc", "cuda_basic"],
        )
    )

    # Num gpus for model parallelism.
    num_gpus = 2
    partition_len = ((nlayers - 1) // num_gpus) + 1

    # Add encoder in the beginning.
    tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)]
    module_list = []

    # Add all the necessary transformer blocks.
    for i in range(nlayers):
        transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
        if i != 0 and i % (partition_len) == 0:
            module_list.append(nn.Sequential(*tmp_list))
            tmp_list = []
        device = i // (partition_len)
        tmp_list.append(transformer_block.to(2 * rank + device))

    # Add decoder in the end.
    tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1))
    module_list.append(nn.Sequential(*tmp_list))

    # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
    # doesn't work with DDP.
    from torch.distributed.pipeline.sync import Pipe
    chunks = 8
    model = Pipe(torch.nn.Sequential(*module_list), chunks=chunks,
                 checkpoint="never")

    # Initialize process group and wrap model in DDP.
    from torch.nn.parallel import DistributedDataParallel
    import torch.distributed as dist
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    model = DistributedDataParallel(model)

    def get_total_params(module: torch.nn.Module):
        total_params = 0
        for param in module.parameters():
            total_params += param.numel()
        return total_params

    print_with_rank('Total parameters in model: {:,}'.format(
        get_total_params(model)))

######################################################################
# Run the model
# -------------
#

######################################################################
# `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
# is applied to track the loss and
# `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
# implements the stochastic gradient descent method as the optimizer. The initial
# learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
# applied to adjust the learning rate through epochs. During
# training, we use the
# `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
# function to scale all the gradients together to prevent them from exploding.
#

# In 'run_worker'
    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    import time

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(vocab)

        # Train only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, train_data.size(0) - 1)

        for batch, i in enumerate(range(0, nbatches, bptt)):
            data, targets = get_batch(train_data, i)
            optimizer.zero_grad()
            # Since the Pipe is only within a single host and process, the
            # ``RRef`` returned by the forward method is local to this node
            # and can simply be retrieved via ``RRef.local_value()``.
            output = model(data).local_value()
            # Need to move targets to the device where the output of the
            # pipeline resides.
            loss = criterion(output.view(-1, ntokens),
                             targets.cuda(2 * rank + 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            log_interval = 10
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | '
                                'lr {:02.2f} | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, batch, nbatches // bptt,
                                    scheduler.get_last_lr()[0],
                                    elapsed * 1000 / log_interval,
                                    cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    def evaluate(eval_model, data_source):
        eval_model.eval()  # Turn on the evaluation mode
        total_loss = 0.
        ntokens = len(vocab)
        # Evaluate only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, data_source.size(0) - 1)
        with torch.no_grad():
            for i in range(0, nbatches, bptt):
                data, targets = get_batch(data_source, i)
                output = eval_model(data).local_value()
                output_flat = output.view(-1, ntokens)
                # Need to move targets to the device where the output of the
                # pipeline resides.
                total_loss += len(data) * criterion(
                    output_flat, targets.cuda(2 * rank + 1)).item()
        return total_loss / (len(data_source) - 1)

######################################################################
# Loop over epochs. Save the model if the validation loss is the best
# we've seen so far. Adjust the learning rate after each epoch.

# In 'run_worker'
    best_val_loss = float("inf")
    epochs = 3  # The number of epochs
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(model, val_data)
        print_with_rank('-' * 89)
        print_with_rank('| end of epoch {:3d} | time: {:5.2f}s | '
                        'valid loss {:5.2f} | valid ppl {:8.2f}'.format(
                            epoch, (time.time() - epoch_start_time),
                            val_loss, math.exp(val_loss)))
        print_with_rank('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()

######################################################################
# Evaluate the model with the test dataset
# ----------------------------------------
#
# Apply the best model to check the result with the test dataset.

# In 'run_worker'
    test_loss = evaluate(best_model, test_data)
    print_with_rank('=' * 89)
    print_with_rank('| End of training | test loss {:5.2f} | '
                    'test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)))
    print_with_rank('=' * 89)