def run(rank, world_size):
    torch_pg.init_mpi()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)  # FIXME (supports gloo)
    os.environ["MASTER_PORT"] = "10639"
    torch.distributed.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size, pipeline_backend="mpi")

    if rank == 1:
        # For RPC, all ranks other than 0 just need to call rpc.shutdown()
        torch.distributed.rpc.shutdown()
        return

    model = getModel()
    data, target = getData()[0]
    loss_fn = getLossFun()
    device = torch.device("cuda", rank)

    model = fairscale.nn.PipeRPCWrapper(
        model,
        balance=[2, 1],
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # We can't directly access the model on each worker, so we need to call
    # foreach_worker with a callback to setup the optimizer
    model.foreach_worker(register_optimizer, {"lr": 0.001}, include_self=True)

    outputs = model(data.to(device))
    loss = loss_fn(outputs.to(device), target.to(device))
    loss.backward()

    # Same as earlier, use foreach_worker to step the optimizer on each rank
    model.foreach_worker(run_optimizer, include_self=True)

    print(f"Finished Training Step on {rank}")

    torch.distributed.rpc.shutdown()

    del model
def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "10638"
    dist_init(rank, world_size)
    os.environ["MASTER_PORT"] = "10639"
    dist.rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size)
    initialize_model_parallel(1, world_size)

    model = getModel()
    data, target = getData()[0]
    loss_fn = getLossFun()
    device = torch.device("cuda", rank) if DEVICE == "cuda" else torch.device("cpu")

    model = fairscale.nn.Pipe(
        model,
        balance=[2, 1],
        style=fairscale.nn.Pipe.MultiProcess,
        worker_map={0: "worker0", 1: "worker1"},  # Needed to convert ranks to RPC worker names
        input_device=device,
    ).to(device)

    # define optimizer and loss function
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # zero the parameter gradients
    optimizer.zero_grad()

    # outputs and target need to be on the same device
    # forward step
    outputs = model(data.to(device))

    # compute loss
    if rank == 1:
        loss = loss_fn(outputs.to(device), target.to(device))

        # backward + optimize
        loss.backward()
        optimizer.step()
    else:
        model.back_helper(outputs)

    print(f"Finished Training Step on {rank}")

    del model
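Both run variants above expect to be launched with one Python process per rank. Below is a minimal launcher sketch assuming two ranks; the WORLD_SIZE constant and the use of torch.multiprocessing.spawn are illustrative choices, not part of the snippets above.

import torch.multiprocessing as mp

if __name__ == "__main__":
    WORLD_SIZE = 2  # assumed: two workers, matching the worker_map above
    # spawn() passes the process index as the first argument, so each
    # spawned process ends up calling run(rank, WORLD_SIZE)
    mp.spawn(run, args=(WORLD_SIZE,), nprocs=WORLD_SIZE, join=True)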
from helpers import getData, getLossFun, getModel

import torch
import torch.optim as optim

import fairscale

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
RANK = 0  # example

model = getModel()
data, target = getData()[0]
loss_fn = getLossFun()

model = fairscale.nn.Pipe(model, balance=[2, 1])

# define optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=0.001)

# zero the parameter gradients
optimizer.zero_grad()

device = torch.device("cuda", RANK) if DEVICE == "cuda" else torch.device("cpu")

# outputs and target need to be on the same device
# forward step
outputs = model(data.to(device).requires_grad_())

# compute loss
loss = loss_fn(outputs.to(device), target.to(device))

# backward + optimize
loss.backward()
optimizer.step()
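fairscale.nn.Pipe partitions an nn.Sequential module, and balance gives the number of layers per partition (here 2 layers on the first device and 1 on the second). The sketch below shows a hypothetical getModel that would satisfy balance=[2, 1]; the real helper in helpers.py may look different.

import torch.nn as nn

def getModel():
    # hypothetical stand-in for helpers.getModel: three layers, so that
    # balance=[2, 1] puts Linear+ReLU on the first partition and the
    # final Linear on the second
    return nn.Sequential(
        nn.Linear(10, 10),
        nn.ReLU(),
        nn.Linear(10, 5),
    )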
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

# Transform images
train_data = datasets.ImageFolder(train_dir, transform=train_transforms)
test_data = datasets.ImageFolder(test_dir, transform=test_transforms)

# Load images
trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=32)

# Load model
model = helpers.getModel(args.arch, input_size, output_size, args.hidden_units, train_data.class_to_idx)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.classifier.parameters(), lr=args.learning_rate, momentum=0.9)
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

model = helpers.trainModel(device, args.epochs, model, train_data, trainloader, testloader,
                           optimizer, criterion, scheduler)

torch.save(
    {
        'arch': args.arch,
        'input_size': input_size,
def train(rank: int, world_size: int, epochs: int, use_oss: bool):

    # DDP
    dist_init(rank, world_size)
    rank = torch.device("cpu") if DEVICE == "cpu" else rank

    # Problem statement
    model = getModel().to(rank)
    dataloader = getData(n_batches=1)
    loss_fn = getLossFun()

    optimizer: Optional[Union[OSS, torch.optim.SGD]] = None

    if not use_oss:
        optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
    else:
        base_optimizer = torch.optim.SGD
        base_optimizer_arguments = {"lr": 1e-4}  # any optimizer specific arguments, LR, momentum, etc...
        optimizer = OSS(params=model.parameters(), optim=base_optimizer, **base_optimizer_arguments)

    training_start = time.monotonic()

    # Any relevant training loop, nothing specific to OSS. For example:
    model.train()
    for _ in range(epochs):
        for (data, target) in dataloader:
            data, target = data.to(rank), target.to(rank)

            # Train
            model.zero_grad()
            outputs = model(data)
            loss = loss_fn(outputs, target)
            loss.backward()

            # if you want to clip the gradients / get the current max:
            max_norm = 1000.0
            norm_type = 1
            if not use_oss:
                _total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, norm_type=norm_type)  # type: ignore
            else:
                optimizer = cast(OSS, optimizer)
                _total_norm = optimizer.clip_grad_norm(max_norm, norm_type=norm_type)

            optimizer.step()

            print(f"Loss: {loss.item()}")

    training_end = time.monotonic()
    print(f"[{dist.get_rank()}] : Training done. {training_end - training_start:.2f} sec")

    if DEVICE == "cuda":
        max_memory = torch.cuda.max_memory_allocated(rank) / 2 ** 20  # convert bytes to MiB
        print(f"[{dist.get_rank()}] : Peak memory {max_memory:.1f}MiB")
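With use_oss enabled, each rank only holds its own shard of the optimizer state, so a plain torch.save of the optimizer from one rank would miss most of it. A minimal checkpointing sketch is shown below, assuming fairscale's OSS.consolidate_state_dict is used to gather the shards first; the helper name and checkpoint path are illustrative, not part of the tutorial above.

from typing import Union

import torch
import torch.distributed as dist
from fairscale.optim import OSS

def save_optimizer(optimizer: Union[OSS, torch.optim.SGD], path: str = "optimizer.pt") -> None:
    # Hypothetical helper: gather the sharded optimizer state onto rank 0
    # before saving, so the checkpoint contains the full state.
    if isinstance(optimizer, OSS):
        optimizer.consolidate_state_dict(recipient_rank=0)
    if dist.get_rank() == 0:
        torch.save(optimizer.state_dict(), path)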