lr = opt_args['lr'] #initiating the GAR gar = aggregators.gars.get(gar) assert gar is not None os.environ['MASTER_ADDR'] = master os.environ['MASTER_PORT'] = '29500' torch.manual_seed(1234) #For reproducibility if torch.cuda.is_available(): torch.cuda.manual_seed_all(1234) #For reproducibility if bench: torch.backends.cudnn.benchmark = True #convention: low ranks are reserved for parameter servers if rank < num_ps: rpc.init_rpc('ps:{}'.format(rank), rank=rank, world_size=world_size) #Initialize a parameter server and write the training loop ps = Server(rank, world_size, num_workers, 1, fw, fps, 'worker:', 'ps:', batch, model, dataset, optimizer, **opt_args) scheduler = torch.optim.lr_scheduler.MultiStepLR( ps.optimizer, milestones=[150, 250, 350], gamma=0.1 ) #This line shows sophisticated stuff that can be done out of the Garfield++ library start_time = time() iter_per_epoch = CIFAR_NUM_SAMPLES // ( num_workers * batch) #this value records how many iteration per sample print("One EPOCH consists of {} iterations".format(iter_per_epoch)) sys.stdout.flush() for i in range(num_iter): if i % ( iter_per_epoch * 30 ) == 0 and i != 0: #One hack for better convergence with Cifar10
def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and shuts down
    RPC.

    The role played by the process is selected from ``rank``: rank 2 is the
    master, ranks 0 and 1 are trainers and any other rank registers itself as
    the parameter server ("ps").

    Args:
        rank: Rank of this process in the RPC world.
        world_size: Total number of processes joining the RPC world.
    """
    # We need to use different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = ProcessGroupRpcBackendOptions()
    rpc_backend_options.init_method = 'tcp://localhost:29501'

    # Rank 2 is master, 3 is ps and 0 and 1 are trainers.
    if rank == 2:
        rpc.init_rpc("master",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

        # Build the embedding table on the ps.
        emb_rref = rpc.remote("ps",
                              torch.nn.EmbeddingBag,
                              args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
                              kwargs={"mode": "sum"})

        # Run the training loop on trainers.
        futs = []
        for trainer_rank in [0, 1]:
            trainer_name = "trainer{}".format(trainer_rank)
            # Hand each trainer its *own* rank. Passing `rank` here would give
            # every trainer the master's rank (2) instead of 0 / 1.
            fut = rpc.rpc_async(trainer_name,
                                _run_trainer,
                                args=(emb_rref, trainer_rank))
            futs.append(fut)

        # Wait for all training to finish.
        for fut in futs:
            fut.wait()
    elif rank <= 1:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(backend="gloo",
                                rank=rank,
                                world_size=2,
                                init_method='tcp://localhost:29500')

        # Initialize RPC.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

        # Trainer just waits for RPCs from master.
    else:
        rpc.init_rpc("ps",
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
        # parameter server do nothing
        pass

    # block until all rpcs finish
    rpc.shutdown()
ntokens = len(vocab) # the size of vocabulary emsize = 4096 # embedding dimension nhid = 4096 # the dimension of the feedforward network model in nn.TransformerEncoder nlayers = 12 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder nhead = 16 # the number of heads in the multiheadattention models dropout = 0.2 # the dropout value from torch.distributed import rpc tmpfile = tempfile.NamedTemporaryFile() rpc.init_rpc( name="worker", rank=0, world_size=1, rpc_backend_options=rpc.TensorPipeRpcBackendOptions( init_method="file://{}".format(tmpfile.name), # Specifying _transports and _channels is a workaround and we no longer # will have to specify _transports and _channels for PyTorch # versions >= 1.8.1 _transports=["ibv", "uv"], _channels=["cuda_ipc", "cuda_basic"], )) num_gpus = 2 partition_len = ((nlayers - 1) // num_gpus) + 1 # Add encoder in the beginning. tmp_list = [Encoder(ntokens, emsize, dropout).cuda(0)] module_list = [] # Add all the necessary transformer blocks. for i in range(nlayers):
def run_worker(rank, world_size, args):
    """Run one process of the agent/observer RPC group.

    Rank 0 registers as the agent: it writes the run configuration, drives
    the collect/update/test loop and logs results. Every other rank registers
    as an observer and only serves RPCs until shutdown.

    Args:
        rank: Rank of this process in the RPC world.
        world_size: Total number of processes in the RPC world.
        args: Run configuration; must provide ``_asdict()`` and the attributes
            read below (``env_name``, ``seed``, ``start_steps``,
            ``total_epoch``, ``steps_per_epoch``, ``steps_per_update``).
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        logdir = "./data/gac-parallel/{}/{}-seed{}-{}".format(
            args.env_name, args.env_name, args.seed, time())
        config_name = 'config.json'
        file_name = 'progress.csv'
        model_name = 'model.pt'
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(logdir, exist_ok=True)

        # Round-trip through JSON first so the dumped config only contains
        # plain JSON types before it is pretty-printed.
        config_json = json.loads(json.dumps(args._asdict()))
        output = json.dumps(config_json,
                            separators=(',', ':\t'),
                            indent=4,
                            sort_keys=True)
        with open(os.path.join(logdir, config_name), 'w') as out:
            out.write(output)

        full_name = os.path.join(logdir, file_name)
        full_model_name = os.path.join(logdir, model_name)

        # Keep the progress file inside a context manager so the handle is
        # closed even when training raises.
        with open(full_name, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter='\t')
            writer.writerow(
                ['TotalEnvInteracts', 'AverageTestEpRet', 'AverageTestEpLen'])

            agent = Agent(world_size, args)

            print("Replay buffer warms up.")
            agent.run_episode(args.start_steps, True)
            print("End.")
            print(
                "================================================================="
            )

            # NOTE(review): the source was whitespace-collapsed; the
            # test/log/save statements are reconstructed as part of the inner
            # loop because the step counter depends on t2 - confirm nesting.
            for t1 in range(args.total_epoch):
                for t2 in range(
                        int(args.steps_per_epoch / args.steps_per_update)):
                    agent.run_episode(args.steps_per_update)
                    agent.update()
                    test_ret, test_len = agent.test_episode()
                    t = (t1 * args.steps_per_epoch +
                         (t2 + 1) * args.steps_per_update)
                    print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".
                          format(t, test_ret, test_len))
                    print(
                        "-----------------------------------------------------------"
                    )
                    writer.writerow([t, test_ret, test_len])
                    # Flush so progress survives an abrupt termination.
                    csvfile.flush()
                    torch.save(agent.actor_critic, full_model_name)
    else:
        rpc.init_rpc(OBSERVER_NAME.format(rank),
                     rank=rank,
                     world_size=world_size)
    # Blocks until every process in the RPC world is done.
    rpc.shutdown()
def run_worker():
    """Join the RPC world as a trainer and run the architecture search loop.

    Reads all settings from the module-level ``config``. For every epoch the
    remote LR schedulers are stepped, one train and one validation pass are
    run, the current genotype is logged/plotted and a checkpoint is saved.

    NOTE(review): ``net_crit``, ``train_data``, ``input_channels``,
    ``n_classes``, ``device``, ``noise_add`` and ``shape_gaussian`` are not
    defined in this function (their former definitions are commented out
    below); they must be provided at module scope - confirm.
    """
    rpc.init_rpc(name=f"trainer_{config.rank}",
                 rank=config.rank,
                 world_size=config.world_size)
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    torch.backends.cudnn.benchmark = True

    # get data with meta info
    # input_size, input_channels, n_classes, train_data = utils.get_data(
    #     config.dataset, config.data_path, cutout_length=0, validation=False)

    # net_crit = nn.CrossEntropyLoss().to(device)
    # model = SearchCNNController(input_channels, config.init_channels, n_classes, config.layers,
    #                             net_crit, device_ids=config.gpus)
    # model = model.to(device)
    model = TrainerNet(net_crit)

    # weights optimizer
    # w_optim = torch.optim.SGD(model.weights(), config.w_lr, momentum=config.w_momentum,
    #                           weight_decay=config.w_weight_decay)
    w_optim = DistributedOptimizer(torch.optim.SGD,
                                   model.weights(),
                                   lr=config.w_lr,
                                   momentum=config.w_momentum,
                                   weight_decay=config.w_weight_decay)
    # alphas optimizer
    # alpha_optim = torch.optim.Adam(model.alphas(), config.alpha_lr, betas=(0.5, 0.999),
    #                                weight_decay=config.alpha_weight_decay)
    alpha_optim = DistributedOptimizer(torch.optim.Adam,
                                       model.alphas(),
                                       lr=config.alpha_lr,
                                       betas=(0.5, 0.999),
                                       weight_decay=config.alpha_weight_decay)

    # split data to train/validation: the first half is shared out between
    # ranks for training, the second half for validation.
    n_train = len(train_data)
    split = n_train // 2
    world = config.world_size
    rank = config.rank
    indices = list(range(n_train))
    n_valid = n_train - split
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(
        indices[int(rank * split / world):int((rank + 1) * split / world)])
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(
        indices[split + int(rank * n_valid / world):
                split + int((rank + 1) * n_valid / world)])
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               sampler=train_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=config.batch_size,
                                               sampler=valid_sampler,
                                               num_workers=config.workers,
                                               pin_memory=True)
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     w_optim, config.epochs, eta_min=config.w_lr_min)

    # One LR scheduler per remote optimizer, created on the optimizer's owner.
    lrs_rrefs = [
        rpc.remote(opt_rref.owner(), create_lr_scheduler, args=(opt_rref, ))
        for opt_rref in w_optim.remote_optimizers
    ]

    v_model = SearchCNNController(input_channels,
                                  config.init_channels,
                                  n_classes,
                                  config.layers,
                                  nn.CrossEntropyLoss().to(device),
                                  device_ids=config.gpus).to(device)
    architect = Architect(model, v_model, config.w_momentum,
                          config.w_weight_decay, noise_add)

    if noise_add:
        logger.info("Adding noise")
        for param in model.parameters():
            shape_gaussian[param.data.shape] = gaussian.MultivariateNormal(
                torch.zeros(param.data.shape),
                torch.eye(param.data.shape[-1]))
    else:
        logger.info("Not adding noise")

    # training loop
    best_top1 = 0.
    # Bound up front so the final log line cannot hit an unbound local when
    # no epoch ever improves on best_top1 (or config.epochs is 0).
    best_genotype = None
    for epoch in range(config.epochs):
        # NOTE(review): the source was whitespace-collapsed; the context is
        # reconstructed as covering only the LR-scheduler RPCs - confirm.
        with dist_autograd.context() as cid:
            futs = [
                rpc.rpc_async(lrs_rref.owner(), lrs_step, args=(lrs_rref, ))
                for lrs_rref in lrs_rrefs
            ]
            for fut in futs:
                fut.wait()
            # `lrs_rrefs` is a list, so the owner has to be taken from the
            # same element whose value is queried.
            lr = remote_method(get_lrs_value,
                               lrs_rrefs[0].owner(),
                               args=(lrs_rrefs[0], ))
        # lr_scheduler.step()
        # lr = lr_scheduler.get_lr()[0]

        # model.print_alphas(logger)

        # training
        train(train_loader, valid_loader, model, architect, w_optim,
              alpha_optim, lr, epoch)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1 = validate(valid_loader, model, epoch, cur_step)

        # log
        # genotype
        genotype = model.genotype()
        logger.info("genotype = {}".format(genotype))

        # genotype as a image
        plot_path = os.path.join(config.plot_path,
                                 "EP{:02d}".format(epoch + 1))
        caption = "Epoch {}".format(epoch + 1)
        plot(genotype.normal, plot_path + "-normal", caption)
        plot(genotype.reduce, plot_path + "-reduce", caption)

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_genotype = genotype
            is_best = True
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)
        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
    logger.info("Best Genotype = {}".format(best_genotype))

    rpc.shutdown()
def dist_init(rank: int,
              world_size: int,
              filename: str,
              filename_rpc: str = "") -> bool:
    """
    Set up torch.distributed (and RPC) for one rank using file-based
    rendezvous, so that unrelated tests can run concurrently.

    Args:
        rank: Rank of the calling process.
        world_size: Number of participating processes.
        filename: Path of the file used to initialize the process group.
        filename_rpc: Path of the file used to initialize RPC.

    Returns:
        ``True`` once initialization is done, ``False`` when the machine
        cannot host the requested setup (not enough GPUs).

    .. warning: This limits the usecase to all ranks being on the same node
    """
    # Best effort: drop any RPC agent still around; failures are ignored
    # on purpose.
    try:
        torch.distributed.rpc.shutdown()
    except Exception:
        pass

    print(f"dist init r={rank}, world={world_size}")

    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["RANK"] = str(rank)
    pg_init = "file://" + filename
    rpc_init = "file://" + filename_rpc

    if torch_version() < (1, 6, 0):
        # Older torch: either RPC (multi-process) or a bare NCCL group.
        if world_size > 1:
            # TensorPipe is not available in Torch 1.5
            legacy_options = rpc.ProcessGroupRpcBackendOptions(
                init_method=rpc_init)
            rpc.init_rpc(
                name=f"Test{rank}",
                rank=rank,
                world_size=world_size,
                rpc_backend_options=legacy_options,
            )
        elif torch.cuda.is_available():
            torch.distributed.init_process_group(backend="nccl",
                                                 rank=rank,
                                                 world_size=world_size,
                                                 init_method=pg_init)
        else:
            return False
    else:
        if torch.cuda.is_available():
            backend = "nccl"
        else:
            backend = "gloo"

        if backend == "nccl" and torch.cuda.device_count() < world_size:
            logging.warning(
                "Requested world size cannot be reached on this machine, not enough GPUs"
            )
            return False

        torch.distributed.init_process_group(backend=backend,
                                             rank=rank,
                                             world_size=world_size,
                                             init_method=pg_init)

        rpc_kwargs = {"init_method": rpc_init}
        # Workaround for bug in torch v1.8.0. Should be fixed in v1.8.1
        if torch_version() == (1, 8, 0):
            rpc_kwargs["_transports"] = ["uv"]  # type: ignore

        rpc.init_rpc(
            f"Test{rank}",
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=rpc.TensorPipeRpcBackendOptions(**rpc_kwargs),
        )

    # Spread ranks over the visible GPUs, if there are any.
    if torch.cuda.is_available() and torch.cuda.device_count():
        torch.cuda.set_device(rank % torch.cuda.device_count())

    return True
def _run_threads(rank, world_size, env_spawner, model, optimizer, flags):
    """Entry point of one RPC client process.

    Intended use as target function for
    :py:func:`torch.multiprocessing.spawn()`.

    * Rank 0 registers under ``LEARNER_NAME``, creates a
      :py:class:`~pytorch_seed_rl.agents.Learner` through RPC, starts its
      loop and polls once per second until it reports shutdown.
    * Every other rank registers under ``ACTOR_NAME`` and serves RPCs.

    Parameters
    ----------
    rank: `int`
        The rank of the client within the multiprocessing Processgroup.
    world_size: `int`
        The total number of clients within the multiprocessing Processgroup.
    env_spawner : :py:class:`~pytorch_seed_rl.environments.env_spawner.EnvSpawner`
        Forwarded to the learner as a positional argument.
    model : :py:class:`torch.nn.Module`
        Forwarded to the learner as a positional argument.
    optimizer
        Forwarded to the learner as a positional argument.
    flags
        Settings object; provides the master address/port, the backend
        switch ``tensorpipe`` and every learner option read below.
    """
    os.environ['MASTER_ADDR'] = flags.master_address
    os.environ['MASTER_PORT'] = flags.master_port

    backend = (rpc.BackendType.TENSORPIPE
               if flags.tensorpipe else rpc.BackendType.PROCESS_GROUP)

    if rank != 0:
        rpc.init_rpc(
            ACTOR_NAME.format(rank),
            backend=backend,
            rank=rank,
            world_size=world_size,
        )
    else:
        learner_name = LEARNER_NAME.format(rank)
        rpc.init_rpc(
            learner_name,
            backend=backend,
            rank=rank,
            world_size=world_size,
        )

        # Options copied from `flags` under the same name. 'reward_clipping'
        # is listed to keep its position and is converted to a bool below.
        forwarded_flags = (
            'pg_cost',
            'baseline_cost',
            'entropy_cost',
            'discounting',
            'grad_norm_clipping',
            'reward_clipping',
            'batchsize_training',
            'rollout',
            'total_steps',
            'max_epoch',
            'max_time',
            'threads_prefetch',
            'threads_inference',
            'threads_store',
            'render',
            'max_gif_length',
            'verbose',
            'print_interval',
            'system_log_interval',
            'checkpoint_interval',
            'load_checkpoint',
            'max_queued_batches',
            'max_queued_drops',
        )
        learner_kwargs = {
            'save_path': os.path.join(flags.savedir, flags.name)
        }
        for flag_name in forwarded_flags:
            learner_kwargs[flag_name] = getattr(flags, flag_name)
        learner_kwargs['reward_clipping'] = (
            learner_kwargs['reward_clipping'] == 'abs_one')

        learner_rref = rpc.remote(learner_name,
                                  Learner,
                                  args=(rank, flags.num_actors, env_spawner,
                                        model, optimizer),
                                  kwargs=learner_kwargs)

        # Kick off the learner loop without waiting for it, then poll.
        learner_rref.remote().loop()
        while True:
            if learner_rref.rpc_sync().get_shutdown():
                break
            time.sleep(1)

    # block until all rpcs finish
    try:
        rpc.shutdown()
    except RuntimeError:
        # RPC connection shut down
        return
def run_test_pipe(rank, world_size, filename, filename_rpc, skip_dist_init=False):
    """Check a model-parallel ``nn.Sequential`` and its ``MultiProcessPipe``
    wrapping against a plain ``nn.Linear`` reference.

    Args:
        rank: Rank of the calling process.
        world_size: Number of processes; the test returns immediately for 1.
        filename: Forwarded to ``dist_init``.
        filename_rpc: Forwarded to ``dist_init``.
        skip_dist_init: When true, ``dist_init`` is skipped and only RPC is
            initialized here (MASTER_ADDR/MASTER_PORT are set first).

    Raises:
        AssertionError: when outputs or weights differ by more than 1e-6.
        RuntimeError: when the backward pass through the pipe raised.
    """
    pipe_world_size = 2

    if world_size == 1:
        return

    if not skip_dist_init:
        dist_init(rank, world_size, filename, filename_rpc)
    else:
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "29502"
        rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)

    # NOTE(review): `/` yields a float here - confirm that
    # initialize_model_parallel accepts a non-int model parallel size.
    mpu.initialize_model_parallel(world_size / pipe_world_size, pipe_world_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    if torch.distributed.get_rank() == 0:
        print(
            "> testing Sequential + MultiProcessPipe with model parallel size: {}, pipe: {}"
            .format(model_parallel_size, pipe_world_size))
    chunk_size = 4

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 3
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 7
    output_size = output_size_coeff * model_parallel_size
    batch_size = 3 * chunk_size

    target = torch.rand((batch_size, input_size), requires_grad=True).cuda()
    print(f"target = {target}")

    identity = IdentityLayer2D(batch_size, input_size).cuda()

    pipeline_devices = mpu.get_pipeline_parallel_group()

    # The seed is reset before building each model.
    set_random_seed(seed)
    model = nn.Sequential(
        layers.ColumnParallelLinear(input_size, output_size, keep_master_weight_for_test=True, bias=False).cuda(),
        nn.ReLU(),
        layers.RowParallelLinear(output_size, input_size, keep_master_weight_for_test=True, bias=False).cuda(),
    )
    set_random_seed(seed)

    reference = [
        nn.Linear(input_size, output_size, bias=False).cuda(),
        nn.ReLU(),
        nn.Linear(output_size, input_size, bias=False).cuda(),
    ]

    print(
        f"setup {reference[0].weight.size()}, {model[0].weight.size()}, {(input_size, output_size)}"
    )
    print(f"setup {reference[2].weight.size()}, {(output_size, input_size)}")

    # Start the reference from clones of the parallel layers' master weights.
    reference[0].weight = Parameter(
        model[0].get_master_weight().clone()).cuda()
    reference[2].weight = Parameter(
        model[2].get_master_weight().clone()).cuda()

    reference = nn.Sequential(*reference)

    def grad_graph(depth, grad):
        """Return an indented, recursive dump of an autograd graph node."""
        result = depth * " " + str(grad)
        if grad:
            for x in grad.next_functions:
                result += "\n" + grad_graph(depth + 1, x[0])
        return result

    def check_weights(x, y, key: str, index=None):
        """Assert that layers 2 and 0 of `x` (master weights) match `y`.

        When `index` is given only that layer is checked and its weights are
        always printed.
        """
        for i in [2, 0]:
            if index is not None and i != index:
                continue
            left = x[i].get_master_weight()
            right = y[i].weight.data
            if not torch.allclose(left, right, atol=1.0e-6) or index is not None:
                print(
                    f"check_weights {key}-{i}: left = {left}, \nright = {right}"
                )
            if not torch.equal(left, right):
                print(
                    f"check_weights NOT_EQUAL {key}-{i}: left = {left}, \nright = {right}"
                )
            assert torch.allclose(left, right, atol=1.0e-6)

    def dump_opt_params(opt):
        """Print every parameter of `opt` and its gradient, tagged by rank."""
        for i, group in enumerate(opt.param_groups):
            for j, p in enumerate(group["params"]):
                print(f"{torch.distributed.get_rank()}:param {(i,j)} = {p}")
                print(
                    f"{torch.distributed.get_rank()}:param.grad {(i,j)} = {p.grad}"
                )

    def forward_model(model_, target, step=False):
        """Run `model_` on `identity()` and return the output.

        With `step=True` an MSE loss against `target` is back-propagated, one
        SGD step is taken and both linear weights are asserted to have moved.
        """
        optimizer = torch.optim.SGD(model_.parameters(), lr=0.01, momentum=0.9)
        optimizer.zero_grad()
        model_.zero_grad()
        output = model_(identity())
        loss = nn.MSELoss()
        model_.zero_grad()
        if step:
            loss(output, target).backward()
            saved_weight_0 = model_[0].weight.data.clone()
            saved_weight_2 = model_[2].weight.data.clone()
            dump_opt_params(optimizer)
            optimizer.step()
            assert not torch.allclose(
                saved_weight_0, model_[0].weight.data, atol=1.0e-6)
            assert not torch.allclose(
                saved_weight_2, model_[2].weight.data, atol=1.0e-6)
        return output

    output = forward_model(model, target)
    reference_output = forward_model(reference, target)

    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    # Repeated forward passes without a step must keep matching the reference.
    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    check_weights(model, reference, "before")
    # Step the parallel model once, then restore the saved weights so the
    # pipe below starts from the pre-step state.
    saved_weight_0 = model[0].weight.data.clone()
    saved_weight_2 = model[2].weight.data.clone()
    output = forward_model(model, target, step=True)
    error = reference_output.sub(output).max()
    assert error < 1.0e-6
    model[0].weight.data = saved_weight_0
    model[2].weight.data = saved_weight_2

    # Maps each global rank to the RPC name it was registered under.
    worker_map = {
        i: f"Test{i}"
        for i in range(torch.distributed.get_world_size())
    }

    # NOTE(review): the source was whitespace-collapsed; everything up to the
    # "finished waiting" print is reconstructed as the body of this `if`.
    # pipe_world_size is the constant 2, so the branch is always taken.
    if pipe_world_size == 2:
        print(f"actually doing pipe stuff now")
        assert torch.equal(saved_weight_0, model[0].weight.data)
        assert torch.equal(saved_weight_2, model[2].weight.data)
        pipe_model = MultiProcessPipe(
            model,
            [2, 1],
            group=pipeline_devices,
            worker_map=worker_map,
            input_device=torch.cuda.current_device(),
            chunks=chunk_size,
            pipelined_backward=True,
        ).cuda()
        torch.distributed.barrier()
        pipe_rank = torch.distributed.get_rank(
            group=mpu.get_pipeline_parallel_group())
        print(f"pipe rank is {pipe_rank}")
        # Pipe rank 0 is checked against the first linear layer's saved
        # weights, the other rank against the last layer's.
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
        else:
            if not torch.equal(saved_weight_2, pipe_model[0].weight.data):
                print(
                    f"ne {pipe_rank}: left\n{saved_weight_2}\nright:\n{pipe_model[0].weight.data}"
                )
            assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
        optimizer = torch.optim.SGD(pipe_model.parameters(), lr=0.01, momentum=0.9)
        optimizer.zero_grad()
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")
        else:
            assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            check_weights(model, reference, "pre-pipe", index=2)
        else:
            check_weights(model, reference, "pre-pipe", index=0)

        pipe_output = pipe_model(identity())
        print(f"exited pipe for {rank}")
        # Advance the reference by one optimizer step as well.
        forward_model(reference, target, step=True)

        print(f"pipe_output {rank} = {pipe_output}")
        print(f"reference_output {rank} = {reference_output}")

        torch.distributed.barrier()

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            # Pipe rank 1 compares the pipe output with the reference and
            # runs the loss backward.
            error = reference_output.sub(pipe_output.cuda()).max()
            if error >= 1.0e-6:
                print(f"error bad {error}")
            assert error < 1.0e-6

            loss = nn.MSELoss()
            failed = False
            pipe_output.retain_grad()
            with torch.autograd.profiler.profile() as prof:
                try:
                    loss(pipe_output, target).backward()
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            optimizer.step()

            print(f"calling check_weights on master")
            check_weights(model, reference, "pipe", index=2)
            print(f"waiting for barrier on master, pid={os.getpid()}")
        else:
            # The other pipe rank takes part in backward via back_helper.
            print(f"calling backwards on slave, pid={os.getpid()}")
            failed = False
            with torch.autograd.profiler.profile() as prof:
                try:
                    pipe_model.back_helper(pipe_output)
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            print(f"calling step on slave")
            optimizer.step()
            print(f"calling check_weights on slave")
            check_weights(model, reference, "pipe", index=0)
            print(f"waiting for barrier on slave")

        pipe_model.zero_grad()
        torch.distributed.barrier()

        # After the step, pipe (eval mode) and reference must still agree.
        pipe_model.eval()
        pipe_output = pipe_model(identity())
        updated_ref_output = forward_model(reference, target)
        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            error = updated_ref_output.sub(pipe_output.cuda()).max()
            print(
                f"outputs are ref:\n{updated_ref_output}\npipe:\n{pipe_output}"
            )
            assert error < 1.0e-6
        torch.distributed.barrier()

        print(f"finished waiting for barrier on, pid={os.getpid()}")

    print(f"really exited pipe for {rank}")

    rpc.shutdown()
    torch.distributed.destroy_process_group()
import os
import torch
import torch.distributed.rpc as rpc

# Set before init_rpc is called below.
# NOTE(review): presumably the rendezvous address/port that init_rpc reads
# by default - confirm.
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '7030'


# FIXME: the function definition must exist on both the master and the
# worker, similar to the concept of a stub;
def hello():
    """Print a greeting and return a fixed reply string."""
    print("hello!")
    return "hi, shuang"


# FIXME: while the number of joined ranks has not reached world size, the
# program blocks;
rpc.init_rpc("master", rank=0, world_size=2)
rpc.shutdown()
ps the local server object gar GAR used for aggregation aggr_grad the initial aggregated gradient num_iter the number of iterations to be done; should be log2(t) num_wait_ps the number of servers that should be waited for """ for _ in range(num_iter): ps.latest_aggr_grad = aggr_grad aggr_grads = ps.get_aggr_grads(num_wait_ps) aggr_grad = gar(gradients=aggr_grads, f=f) return aggr_grad #No branching here! All nodes are created equal: no PS and no workers #Basically, each node has one PS object and one worker object rpc.init_rpc('node:{}'.format(rank), rank=rank, world_size=world_size) #rpc._set_rpc_timeout(100000) #initialize a worker here...the worker is created first because the server relies on the worker creation Worker(rank, world_size, n, batch, model, dataset, loss) #Initialize a parameter server ps = Server(rank, world_size, n, n, f, f, 'node:', 'node:', batch, model, dataset, optimizer, **opt_args) sleep(20) #works as a synchronization step scheduler = torch.optim.lr_scheduler.MultiStepLR( ps.optimizer, milestones=[150, 250, 350], gamma=0.1 ) #This line shows sophisticated stuff that can be done out of the Garfield++ library start_time = time() iter_per_epoch = CIFAR_NUM_SAMPLES // ( n * batch) #this value records how many iteration per sample print("One EPOCH consists of {} iterations".format(iter_per_epoch)) sys.stdout.flush()
def run_worker(rank, world_size):
    r"""
    A wrapper function that initializes RPC, calls the function, and shuts down
    RPC.

    Roles are derived from ``rank``: ``[0, NUM_TRAINERS)`` are trainers,
    ``[NUM_TRAINERS, NUM_TRAINERS + NUM_PS)`` are parameter servers and
    ``NUM_TRAINERS + NUM_PS`` is the master, which builds the embedding
    tables, launches training on every trainer and prints the benchmark
    results.

    Args:
        rank: Rank of this process in the RPC world.
        world_size: Total number of processes in the RPC world.
    """
    # Using different port numbers in TCP init_method for init_rpc and
    # init_process_group to avoid port conflicts.
    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = "tcp://localhost:29500"

    # The rank numbers quoted in the comments below (16, 0-7, 8-15) assume
    # NUM_TRAINERS == NUM_PS == 8.
    # Rank 16. Master
    if rank == (NUM_TRAINERS + NUM_PS):
        # Use the same backend options as every other role so that all
        # processes rendezvous through the same init_method.
        rpc.init_rpc(
            "master",
            rank=rank,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

        # Build the Embedding tables on the Parameter Servers.
        emb_rref_list = [
            rpc.remote(
                "ps{}".format(index),
                torch.nn.EmbeddingBag,
                args=(NUM_EMBEDDINGS, EMBEDDING_DIM),
                kwargs={"mode": "sum"},
            ) for index in range(NUM_PS)
        ]

        # Run training loop on the trainers.
        futs = [
            rpc.rpc_async("trainer{}".format(trainer_rank),
                          _run_trainer,
                          args=(emb_rref_list, trainer_rank))
            for trainer_rank in range(NUM_TRAINERS)
        ]

        _print_header()

        measurements_all_trainers = []
        batch_size_all_trainers = 0
        # Wait for all training to finish.
        for fut in futs:
            # Separate name so the `rank` parameter is not overwritten.
            trainer_rank, measurements, batch_size = fut.wait()
            _print_benchmark("Trainer{}".format(trainer_rank), batch_size,
                             measurements)
            batch_size_all_trainers += batch_size
            measurements_all_trainers.append(measurements)

        _print_benchmark("All", batch_size_all_trainers,
                         measurements_all_trainers)

    # Rank 0-7. Trainers
    # Bounded by NUM_TRAINERS (not NUM_PS) so the ranges stay disjoint and
    # complete when the two constants differ.
    elif 0 <= rank < NUM_TRAINERS:
        # Initialize process group for Distributed DataParallel on trainers.
        dist.init_process_group(
            backend=dist.Backend.GLOO,
            rank=rank,
            world_size=NUM_TRAINERS,
            init_method="tcp://localhost:29501",
        )

        # Initialize RPC. Trainer just waits for RPCs from master.
        trainer_name = "trainer{}".format(rank)
        rpc.init_rpc(
            trainer_name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=rpc_backend_options,
        )

    # Rank 8-15. Parameter Servers
    elif NUM_TRAINERS <= rank < NUM_TRAINERS + NUM_PS:
        ps_name = "ps{}".format(rank - NUM_TRAINERS)
        rpc.init_rpc(
            ps_name,
            rank=rank,
            world_size=world_size,
            backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
            rpc_backend_options=rpc_backend_options,
        )
        # parameter server do nothing
        pass

    # block until all rpcs finish
    rpc.shutdown()
def __init__(self,
             world_size: int,
             current_rank: int,
             roles: Dict[str, Tuple[type, int]],
             init_method: str = "tcp://localhost:9100",
             rpc_timeout: int = 60,
             rpc_threads: int = 4,
             rpc_role_dispatcher: Any = None):
    """
    Initialize RPC for this process, then keep spawning role threads.

    Note:
        This constructor does not return: after starting the dispatcher it
        loops forever, starting a thread for every newly assigned role.

    Args:
        world_size: Size of distributed world.
        current_rank: A unique rank of current process.
        roles: Maps a role name to ``(role class, count)``.
        init_method: Backend initialization method.
        rpc_timeout: Global rpc call timeout in seconds.
        rpc_threads: Rpc recv/send thread num.
        rpc_role_dispatcher: Rpc role dispatcher. When ``None`` a
            ``RoleDispatcherElection`` backed by an
            ``ElectionGroupStableRpc`` named "global" is created.
    """
    self.world_size = world_size
    self.current_rank = current_rank
    self.role_dict = roles
    # Maps role Tuple[str, int] to threads
    self.role_threads = {}
    self.ranks = list(range(world_size))
    self.real_names = [f"{member}" for member in range(world_size)]
    self.groups = {}

    if rpc_role_dispatcher is None:
        election_group = ElectionGroupStableRpc(name="global",
                                                member_ranks=self.ranks,
                                                rank=current_rank,
                                                timeout=rpc_timeout)
        names = list(roles.keys())
        counts = [spec[1] for spec in roles.values()]
        rpc_role_dispatcher = RoleDispatcherElection(current_rank,
                                                     world_size, names,
                                                     counts, election_group)
    self.rpc_role_dispatcher = rpc_role_dispatcher

    # "<rank-number>" is used as the unique name.
    backend_options = rpc.ProcessGroupRpcBackendOptions(
        init_method=init_method,
        num_send_recv_threads=rpc_threads,
        rpc_timeout=timedelta(seconds=rpc_timeout))
    rpc.init_rpc(f"{self.current_rank}",
                 rank=current_rank,
                 world_size=world_size,
                 rpc_backend_options=backend_options)

    # Start role dispatching.
    self.rpc_role_dispatcher.start()
    while True:
        # Wait for the dispatcher's role-update condition, then start a
        # thread for every role that has none yet.
        self.rpc_role_dispatcher.get_role_update_cond().wait()
        for role in self.rpc_role_dispatcher.get_roles():
            if role in self.role_threads:
                continue
            role_class = self.role_dict[role[0]][0]
            worker = Thread(target=_exec_role,
                            args=(role_class(role[1]), ))
            worker.start()
            self.role_threads[role] = worker
hid_dim=args.hid_dim, block_type=args.block_type, batch_size=args.batch_size, input_factory=name_to_input[args.block_type], workers=[f'worker{rank}' for rank in range(1, args.world_size)]) avg_throughput, std_throughput = measure_func(ModelParallelRPC) print(f'ModelParallel:\t{avg_throughput:.2f}±{std_throughput:.2f}') if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--hid-dim', type=int, default=1024) parser.add_argument('--batches-for-latency', type=int, default=10) parser.add_argument('--batches-for-throughput', type=int, default=100) parser.add_argument('--batch-size', type=int, default=2048) parser.add_argument('--throughput-runs', type=int, default=10) parser.add_argument('--rank', type=int, required=True) parser.add_argument('--world-size', type=int, required=True) parser.add_argument('--layers-per-gpu', type=int, default=56) parser.add_argument('--block-type', choices=name_to_block.keys(), required=True) args = parser.parse_args() rpc.init_rpc(f"worker{args.rank}", rank=args.rank, world_size=args.world_size) if args.rank == 0: main(args) rpc.shutdown()
def new_test_method(self, *arg, **kwargs):
    """Run ``old_test_method`` with optional RPC setup and teardown.

    When ``setup_rpc`` is set, RPC is initialized before the wrapped test
    and closed with ``rpc.wait_all_workers()`` afterwards. With
    ``clean_shutdown`` the workers first go through a handshake: followers
    report to the master, everybody waits on ``_TERMINATION_SIGNAL`` and the
    master then sends the termination command to every other worker.

    Returns:
        Whatever the wrapped test method returned.
    """
    # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
    # in tests.
    import torch.distributed.rpc.api as api
    api._ignore_rref_leak = False

    self.worker_id = self.rank

    if setup_rpc:
        # Module-level set holding the names of all workers in this run.
        global _ALL_NODE_NAMES
        _ALL_NODE_NAMES = {
            "worker{}".format(rank)
            for rank in range(self.world_size)
        }

        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

    return_value = old_test_method(self, *arg, **kwargs)

    if setup_rpc:
        if clean_shutdown:
            # Follower reports done.
            if self.rank == MASTER_RANK:
                # The master calls the report function directly instead of
                # sending an RPC to itself.
                on_master_follower_report_done(
                    "worker{}".format(MASTER_RANK))
            else:
                rpc.rpc_async(
                    "worker{}".format(MASTER_RANK),
                    on_master_follower_report_done,
                    args=("worker{}".format(self.rank), ),
                )

            # Master waits for followers to report done.
            # Follower waits for master's termination command.
            _TERMINATION_SIGNAL.wait()
            if self.rank == MASTER_RANK:
                # Master sends termination command.
                futs = []
                for dst_rank in range(self.world_size):
                    # torch.distributed.rpc module does not support sending to self.
                    if dst_rank == MASTER_RANK:
                        continue
                    dst_name = "worker{}".format(dst_rank)
                    fut = rpc.rpc_async(dst_name,
                                        set_termination_signal,
                                        args=())
                    futs.append(fut)
                for fut in futs:
                    assert fut.wait(
                    ) is None, "Sending termination signal failed."

        # Close RPC. Need to do this even if we don't have a clean shutdown
        # since we need to shutdown the RPC agent. If we don't shutdown the
        # RPC agent, tests would fail since RPC agent threads, locks and
        # condition variables are not properly terminated.
        rpc.wait_all_workers()

    return return_value
def run_parameter_server(rank, world_size):
    """Join the RPC world as "parameter_server" and stay until shutdown.

    The process does no work of its own; it only serves incoming RPCs
    between ``init_rpc`` and ``shutdown``.

    Args:
        rank: Rank of this process in the RPC world.
        world_size: Total number of processes in the RPC world.
    """
    print(f"PS master initializing RPC, rank {rank}, world size {world_size}")
    server_name = "parameter_server"
    rpc.init_rpc(server_name, rank=rank, world_size=world_size)
    print("Parameter server done initializing RPC")

    # Returns once all workers are done with RPC.
    rpc.shutdown()
    print("RPC shutdown on parameter server")
def init_rpc_connection(self, global_rank: int, world_size: int) -> None:
    """Initialize RPC for this process and record that it is up.

    ``MASTER_PORT`` is overwritten with the value of ``RPC_MASTER_PORT``
    (default ``'15000'``) before RPC is initialized.

    Args:
        global_rank: Rank of this process; also used in the worker name.
        world_size: Total number of processes in the RPC world.
    """
    rpc_port = os.environ.get('RPC_MASTER_PORT', '15000')
    os.environ['MASTER_PORT'] = rpc_port

    worker_name = f"worker{global_rank}"
    rpc.init_rpc(worker_name, rank=global_rank, world_size=world_size)
    self.rpc_initialized = True
def run_worker(rank, world_size):
    """Join the RPC world as ``trainer_<rank>`` and stay until shutdown.

    Args:
        rank: Rank of this process in the RPC world.
        world_size: Total number of processes in the RPC world.
    """
    print(f"Worker initializing RPC, rank {rank}, world size {world_size}")
    worker_name = f"trainer_{rank}"
    rpc.init_rpc(worker_name, rank=rank, world_size=world_size)
    print(f"Worker {rank} done initializing RPC")

    # Returns once all workers are done with RPC.
    rpc.shutdown()
    print(f"RPC shutdown on Worker {rank}.")
def __init__(
    self,
    name: str,
    rank: int = -1,
    world_size: int = -1,
    init_dist: bool = True,
    init_rpc: bool = True,
    dist_backend: str = "gloo",
    dist_init_method: str = "tcp://localhost:9100",
    rpc_init_method: str = "tcp://localhost:9101",
    dist_timeout: float = 60,
    rpc_timeout: float = 60,
):
    """
    Set up the distributed world for the current process.

    Args:
        name: A unique name to identify current process.
        rank: A unique rank of the current process. You do not need to
            specify it if you are using `torch.distributed.launch` or
            `torchelastic`
        world_size: Size of the distributed world. You do not need to
            specify it if you are using `torch.distributed.launch` or
            `torchelastic`
        init_dist: Whether to initialize the process group here.
        init_rpc: Whether to initialize RPC here.
        dist_backend: Backend passed to ``dist.init_process_group``.
        dist_init_method: Init method of the process group.
        rpc_init_method: Init method of RPC.
        dist_timeout: Distributed package timeout in seconds.
        rpc_timeout: Global rpc call timeout in seconds.
    """
    self.name = name
    self.rank = rank
    self.world_size = world_size
    self.groups = {}
    self.group_create_signals = {}

    if init_dist:
        dist.init_process_group(
            backend=dist_backend,
            init_method=dist_init_method,
            timeout=timedelta(seconds=dist_timeout),
            rank=rank,
            world_size=world_size,
        )

    if init_rpc:
        backend_options = rpc.TensorPipeRpcBackendOptions(
            init_method=rpc_init_method, rpc_timeout=rpc_timeout)
        rpc.init_rpc(
            self.name,
            rank=rank,
            world_size=world_size,
            rpc_backend_options=backend_options,
        )

    # get rank-name mapping
    # Read from the current RPC agent, so RPC has to be initialized by now
    # (here or by the caller).
    worker_infos = rpc._get_current_rpc_agent().get_worker_infos()
    self.rank_name_map = {info.id: info.name for info in worker_infos}

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # map for paired values and registered services
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()
    # The worker registered with id 0 manages the look-up tables.
    self.lut_manager = self.rank_name_map[0]
def run_worker(rank, world_size):
    """Train and evaluate a pipelined Transformer language model on one rank.

    The function downloads Wikitext-2, builds the vocabulary, batches the
    data, builds a two-stage ``Pipe`` on GPUs ``2 * rank`` and
    ``2 * rank + 1``, wraps it in ``DistributedDataParallel``, trains for
    three epochs and finally reports the loss on the test split.

    Args:
        rank: Rank of this process in the DDP process group. It also selects
            the GPU pair used by this process.
        world_size: Number of processes in the DDP process group. The training
            data is split evenly across this many ranks.
    """
    ######################################################################
    # Load and batch data
    # -------------------
    #

    ######################################################################
    # The training process uses Wikitext-2 dataset from ``torchtext``. The
    # vocab object is built based on the train dataset and is used to numericalize
    # tokens into tensors. Starting from sequential data, the ``batchify()``
    # function arranges the dataset into columns, trimming off any tokens remaining
    # after the data has been divided into batches of size ``batch_size``.
    # For instance, with the alphabet as the sequence (total length of 26)
    # and a batch size of 4, we would divide the alphabet into 4 sequences of
    # length 6:
    #
    # .. math::
    #   \begin{bmatrix}
    #   \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
    #   \end{bmatrix}
    #   \Rightarrow
    #   \begin{bmatrix}
    #   \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
    #   \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
    #   \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
    #   \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
    #   \end{bmatrix}
    #
    # These columns are treated as independent by the model, which means that
    # the dependence of ``G`` and ``F`` can not be learned, but allows more
    # efficient batch processing.
    #

    # In 'run_worker'
    def print_with_rank(msg):
        print('[RANK {}]: {}'.format(rank, msg))

    import io
    from torchtext.utils import download_from_url, extract_archive
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'
    test_filepath, valid_filepath, train_filepath = extract_archive(
        download_from_url(url, root=".data{}".format(rank)))
    tokenizer = get_tokenizer('basic_english')

    # The vocabulary is built while the file is open so the handle is closed
    # as soon as the lines have been consumed.
    with io.open(train_filepath, encoding="utf8") as train_file:
        vocab = build_vocab_from_iterator(map(tokenizer, train_file))

    def data_process(raw_text_iter):
        data = [
            torch.tensor([vocab[token] for token in tokenizer(item)],
                         dtype=torch.long)
            for item in raw_text_iter
        ]
        # Lines that produced no tokens are dropped before concatenation.
        return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

    def load_split(filepath):
        # ``data_process`` consumes every line eagerly, so the file can be
        # closed as soon as it returns.
        with io.open(filepath, encoding="utf8") as text_file:
            return data_process(text_file)

    train_data = load_split(train_filepath)
    val_data = load_split(valid_filepath)
    test_data = load_split(test_filepath)

    device = torch.device(2 * rank)

    def batchify(data, bsz, rank, world_size, is_train=False):
        # Divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        # Divide the data across the ranks only for training data.
        if is_train:
            data_per_rank = data.size(0) // world_size
            data = data[rank * data_per_rank:(rank + 1) * data_per_rank]
        return data.to(device)

    batch_size = 20
    eval_batch_size = 10
    train_data = batchify(train_data, batch_size, rank, world_size, True)
    val_data = batchify(val_data, eval_batch_size, rank, world_size)
    test_data = batchify(test_data, eval_batch_size, rank, world_size)

    ######################################################################
    # Functions to generate input and target sequence
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    #

    ######################################################################
    # ``get_batch()`` function generates the input and target sequence for
    # the transformer model. It subdivides the source data into chunks of
    # length ``bptt``. For the language modeling task, the model needs the
    # following words as ``Target``. For example, with a ``bptt`` value of 2,
    # we'd get the following two Variables for ``i`` = 0:
    #
    # .. image:: ../_static/img/transformer_input_target.png
    #
    # It should be noted that the chunks are along dimension 0, consistent
    # with the ``S`` dimension in the Transformer model. The batch dimension
    # ``N`` is along dimension 1.
    #

    # In 'run_worker'
    bptt = 35

    def get_batch(source, i):
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        return data, target

    ######################################################################
    # Model scale and Pipe initialization
    # -----------------------------------
    #

    ######################################################################
    # To demonstrate training large Transformer models using pipeline parallelism,
    # we scale up the Transformer layers appropriately. We use an embedding
    # dimension of 4096, hidden size of 4096, 16 attention heads and 8 total
    # transformer layers (``nn.TransformerEncoderLayer``). This creates a model with
    # **~1 billion** parameters.
    #
    # We need to initialize the `RPC Framework <https://pytorch.org/docs/stable/rpc.html>`__
    # since Pipe depends on the RPC framework via `RRef <https://pytorch.org/docs/stable/rpc.html#rref>`__
    # which allows for future expansion to cross host pipelining. We need to
    # initialize the RPC framework with only a single worker since we're using a
    # single process to drive multiple GPUs.
    #
    # The pipeline is then initialized with 4 transformer layers on one GPU and 4
    # transformer layers on the other GPU (``nlayers`` is 8 and is split over
    # ``num_gpus`` = 2). One pipe is setup across GPUs 0 and 1 and
    # another across GPUs 2 and 3. Both pipes are then replicated using DistributedDataParallel.

    # In 'run_worker'
    ntokens = len(vocab.stoi)  # the size of vocabulary
    emsize = 4096  # embedding dimension
    nhid = 4096  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 8  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 16  # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value

    from torch.distributed import rpc
    # The temporary file is the rendezvous point for the single-worker RPC
    # group; the object is kept alive for the rest of this function.
    tmpfile = tempfile.NamedTemporaryFile()
    rpc.init_rpc(
        name="worker",
        rank=0,
        world_size=1,
        rpc_backend_options=rpc.TensorPipeRpcBackendOptions(
            init_method="file://{}".format(tmpfile.name),
            # Specifying _transports and _channels is a workaround and we no longer
            # will have to specify _transports and _channels for PyTorch
            # versions >= 1.8.1
            _transports=["ibv", "uv"],
            _channels=["cuda_ipc", "cuda_basic"],
        ))

    # Num gpus for model parallelism.
    num_gpus = 2
    partition_len = ((nlayers - 1) // num_gpus) + 1

    # Add encoder in the beginning.
    tmp_list = [Encoder(ntokens, emsize, dropout).cuda(2 * rank)]
    module_list = []

    # Add all the necessary transformer blocks.
    for i in range(nlayers):
        transformer_block = TransformerEncoderLayer(emsize, nhead, nhid, dropout)
        if i != 0 and i % (partition_len) == 0:
            module_list.append(nn.Sequential(*tmp_list))
            tmp_list = []
        # Named ``stage`` so it does not rebind the ``device`` that the
        # ``batchify`` closure reads.
        stage = i // (partition_len)
        tmp_list.append(transformer_block.to(2 * rank + stage))

    # Add decoder in the end.
    tmp_list.append(Decoder(ntokens, emsize).cuda(2 * rank + num_gpus - 1))
    module_list.append(nn.Sequential(*tmp_list))

    # Need to use 'checkpoint=never' since as of PyTorch 1.8, Pipe checkpointing
    # doesn't work with DDP.
    from torch.distributed.pipeline.sync import Pipe
    model = Pipe(torch.nn.Sequential(*module_list), chunks=8, checkpoint="never")

    # Initialize process group and wrap model in DDP.
    from torch.nn.parallel import DistributedDataParallel
    import torch.distributed as dist
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    model = DistributedDataParallel(model)

    def get_total_params(module: torch.nn.Module):
        total_params = 0
        for param in module.parameters():
            total_params += param.numel()
        return total_params

    print_with_rank('Total parameters in model: {:,}'.format(
        get_total_params(model)))

    ######################################################################
    # Run the model
    # -------------
    #

    ######################################################################
    # `CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
    # is applied to track the loss and
    # `SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
    # implements stochastic gradient descent method as the optimizer. The initial
    # learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
    # applied to adjust the learn rate through epochs. During the
    # training, we use
    # `nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
    # function to scale all the gradient together to prevent exploding.
    #

    # In 'run_worker'
    criterion = nn.CrossEntropyLoss()
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    import time

    def train():
        model.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(vocab.stoi)
        log_interval = 10

        # Train only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, train_data.size(0) - 1)

        for batch, i in enumerate(range(0, nbatches, bptt)):
            data, targets = get_batch(train_data, i)
            optimizer.zero_grad()
            # Since the Pipe is only within a single host and process the ``RRef``
            # returned by forward method is local to this node and can simply
            # retrieved via ``RRef.local_value()``.
            output = model(data).local_value()
            # Need to move targets to the device where the output of the
            # pipeline resides.
            loss = criterion(output.view(-1, ntokens), targets.cuda(2 * rank + 1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

            total_loss += loss.item()
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                # ``get_last_lr()`` reports the rate currently in use;
                # ``get_lr()`` is only meant to be called from within
                # ``scheduler.step()``.
                print_with_rank('| epoch {:3d} | {:5d}/{:5d} batches | '
                                'lr {:02.2f} | ms/batch {:5.2f} | '
                                'loss {:5.2f} | ppl {:8.2f}'.format(
                                    epoch, batch, nbatches // bptt,
                                    scheduler.get_last_lr()[0],
                                    elapsed * 1000 / log_interval,
                                    cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    def evaluate(eval_model, data_source):
        eval_model.eval()  # Turn on the evaluation mode
        total_loss = 0.
        total_positions = 0
        ntokens = len(vocab.stoi)
        # Evaluate only for 50 batches to keep script execution time low.
        nbatches = min(50 * bptt, data_source.size(0) - 1)
        with torch.no_grad():
            for i in range(0, nbatches, bptt):
                data, targets = get_batch(data_source, i)
                output = eval_model(data).local_value()
                output_flat = output.view(-1, ntokens)
                # Need to move targets to the device where the output of the
                # pipeline resides.
                total_loss += len(data) * criterion(
                    output_flat, targets.cuda(2 * rank + 1)).item()
                total_positions += len(data)
        # Average over the positions that were actually scored. Dividing by
        # the full length of ``data_source`` would understate the loss
        # whenever evaluation stops after 50 batches.
        return total_loss / total_positions

    ######################################################################
    # Loop over epochs. Save the model if the validation loss is the best
    # we've seen so far. Adjust the learning rate after each epoch.

    # In 'run_worker'
    best_val_loss = float("inf")
    epochs = 3  # The number of epochs
    best_model = None

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(model, val_data)
        print_with_rank('-' * 89)
        print_with_rank(
            '| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
        print_with_rank('-' * 89)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # This keeps a reference to the live model, not a snapshot of
            # its weights.
            best_model = model

        scheduler.step()

    ######################################################################
    # Evaluate the model with the test dataset
    # -------------------------------------
    #
    # Apply the best model to check the result with the test dataset.

    # In 'run_worker'
    test_loss = evaluate(best_model, test_data)
    print_with_rank('=' * 89)
    print_with_rank(
        '| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
            test_loss, math.exp(test_loss)))
    print_with_rank('=' * 89)
def worker_loop(a):
    """Join a two-process RPC group as ``worker1`` (rank 1), then shut down.

    Args:
        a: Not used by this function.
    """
    rpc_name = 'worker1'
    rpc_rank = 1
    group_size = 2

    rpc.init_rpc(rpc_name, rank=rpc_rank, world_size=group_size)
    rpc.shutdown()