def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)
    os.environ["MASTER_PORT"] = "10639"
    dist.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size)

    model = get_model()
    data, target = get_data()[0]
    loss_fn = get_loss_fun()

    device = torch.device("cuda", rank) if DEVICE == "cuda" else torch.device("cpu")

    model = MultiProcessPipe(
        model,
        balance=[2, 1],
        style=MultiProcessPipe.MultiProcess,
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # define optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # zero the parameter gradients
    optimizer.zero_grad()

    # outputs and target need to be on the same device
    # forward step
    outputs = model(data.to(device))

    # compute loss
    if rank == 1:
        loss = loss_fn(outputs.to(device), target.to(device))

        # backward + optimize
        loss.backward()
        optimizer.step()
    else:
        model.back_helper(outputs)

    print(f"Finished Training Step on {rank}")

    dist.rpc.shutdown()

    del model
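To exercise this example, each rank needs to execute run in its own process. Below is a minimal launcher sketch, assuming a single host with one process per pipeline stage and torch.multiprocessing; WORLD_SIZE is an illustrative constant, not part of the original example.

import torch.multiprocessing as mp

WORLD_SIZE = 2  # assumption: one process per pipeline stage

if __name__ == "__main__":
    # spawn() invokes run(rank, WORLD_SIZE) once per process, with rank in [0, WORLD_SIZE)
    mp.spawn(run, args=(WORLD_SIZE,), nprocs=WORLD_SIZE, join=True)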
def run(rank, world_size):
    torch_pg.init_mpi()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)  # FIXME (supports gloo)
    os.environ["MASTER_PORT"] = "10639"
    torch.distributed.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size, pipeline_backend="mpi")

    if rank == 1:
        # For RPC, all ranks other than 0 just need to call rpc.shutdown()
        torch.distributed.rpc.shutdown()
        return

    model = getModel()
    data, target = getData()[0]
    loss_fn = getLossFun()

    device = torch.device("cuda", rank)

    model = fairscale.nn.PipeRPCWrapper(
        model,
        balance=[2, 1],
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # We can't directly access the model on each worker, so we need to call
    # foreach_worker with a callback to setup the optimizer
    model.foreach_worker(register_optimizer, {"lr": 0.001}, include_self=True)

    outputs = model(data.to(device))
    loss = loss_fn(outputs.to(device), target.to(device))
    loss.backward()

    # Same as earlier, use foreach_worker to step the optimizer on each rank
    model.foreach_worker(run_optimizer, include_self=True)

    print(f"Finished Training Step on {rank}")

    torch.distributed.rpc.shutdown()

    del model
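The register_optimizer and run_optimizer callbacks referenced above are not shown in this snippet. A plausible sketch follows, assuming (as the foreach_worker usage suggests) that each callback is invoked on every worker with the context passed to foreach_worker and that worker's local model shard; the exact callback signature should be checked against the fairscale version in use.

def register_optimizer(ctx, model):
    # Attach an optimizer to the local model shard so later callbacks can reach it
    model.optimizer = optim.SGD(model.parameters(), **ctx)
    # zero the parameter gradients before the forward/backward pass
    model.optimizer.zero_grad()


def run_optimizer(ctx, model):
    # Apply the accumulated gradients on this worker's shard
    model.optimizer.step()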
def train(rank: int, world_size: int, epochs: int, use_oss: bool):
    # DDP
    dist_init(rank, world_size)
    rank = torch.device("cpu") if DEVICE == "cpu" else rank

    # Problem statement
    model = getModel().to(rank)
    dataloader = getData(n_batches=1)
    loss_fn = getLossFun()

    optimizer: Optional[Union[OSS, torch.optim.SGD]] = None

    if not use_oss:
        optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
    else:
        base_optimizer = torch.optim.SGD
        base_optimizer_arguments = {"lr": 1e-4}  # any optimizer-specific arguments, LR, momentum, etc...
        optimizer = OSS(params=model.parameters(), optim=base_optimizer, **base_optimizer_arguments)

    training_start = time.monotonic()

    # Any relevant training loop, nothing specific to OSS. For example:
    model.train()
    for _ in range(epochs):
        for data, target in dataloader:
            data, target = data.to(rank), target.to(rank)

            # Train
            model.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, target)
            loss.backward()

            # if you want to clip the gradients / get the current max:
            max_norm = 1000.0
            norm_type = 1
            if not use_oss:
                _total_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), max_norm, norm_type=norm_type
                )  # type: ignore
            else:
                optimizer = cast(OSS, optimizer)
                _total_norm = optimizer.clip_grad_norm(max_norm, norm_type=norm_type)

            optimizer.step()
            print(f"Loss: {loss.item()}")

    training_end = time.monotonic()
    print(f"[{dist.get_rank()}] : Training done. {training_end - training_start:.2f} sec")

    if DEVICE == "cuda":
        # max_memory_allocated() returns bytes; convert to MiB before printing
        max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20
        print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")
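As with the pipeline examples, train is meant to run once per rank. A minimal launcher sketch, again assuming torch.multiprocessing on a single host; WORLD_SIZE, EPOCHS, and USE_OSS are illustrative constants, not part of the original example.

import torch.multiprocessing as mp

WORLD_SIZE = 2   # assumption: number of data-parallel ranks
EPOCHS = 1       # assumption
USE_OSS = True   # switch between plain SGD and the sharded OSS optimizer

if __name__ == "__main__":
    # spawn() invokes train(rank, WORLD_SIZE, EPOCHS, USE_OSS) once per process
    mp.spawn(train, args=(WORLD_SIZE, EPOCHS, USE_OSS), nprocs=WORLD_SIZE, join=True)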