def infiniband2():
    if torch.distributed.get_rank() == 0:
        t = torch.Tensor(range(100)).cuda()
        torch.distributed.send(t, 1, group=get_pipeline_parallel_group())
    else:
        t = torch.empty(100).cuda()
        torch.distributed.recv(t, 0, group=get_pipeline_parallel_group())
        assert torch.equal(t, torch.Tensor(range(100)).cuda())
    print(f"t on {torch.distributed.get_rank()} is {t}")
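
# The exchange above uses blocking torch.distributed.send/recv. For reference, a minimal
# sketch of the same two-rank exchange with the non-blocking variants (not part of the
# original code; assumes the process group and pipeline group are already initialized):
def infiniband2_nonblocking():
    group = get_pipeline_parallel_group()
    if torch.distributed.get_rank() == 0:
        t = torch.Tensor(range(100)).cuda()
        work = torch.distributed.isend(t, 1, group=group)  # returns a Work handle immediately
    else:
        t = torch.empty(100).cuda()
        work = torch.distributed.irecv(t, 0, group=group)
    work.wait()  # block until the transfer has completed before touching `t`
    print(f"t on {torch.distributed.get_rank()} is {t}")
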
def run_mp_worker(args, available_workers): new_data = True blob = make_model_and_data(args, None, new_data=new_data) model = blob["model"] balance = generate_balance(get_pipeline_parallel_group().size(), len(model)) p = pipe.AMPnetPipe( module=model, balance=balance, chunks=args.chunks, worker_map=get_worker_map(), input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"), checkpoint=args.checkpoint, ) if torch.cuda.is_available(): p = p.cuda() if new_data: train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args) else: ntokens, train_data, val_data, test_data = blob["data"] benchmark_language_model(train_data, val_data, test_data, p, criterion, optimizer, ntokens, args)
def run_mp_worker(args, available_workers):
    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, config=benchmark_config)
    model = model_config["model"]

    balance = generate_balance_weighted(get_pipeline_parallel_group().size(), len(model), 0.8)
    pipe_model = MultiProcessPipe(
        model,
        balance,
        style=MultiProcessPipe.AsyncSchedule,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
        # TODO(anj-s): Do we need to comment this out?
        loss_fn=benchmark_config["criterion"],
    )
    if torch.cuda.is_available():
        pipe_model = pipe_model.cuda()

    if args.all_at_once and pipe_model.pipeline:
        print("running all at once")
        pipe_model.pipeline.all_at_once = True

    if args.use_synthetic_data:
        train(model_config, pipe_model, benchmark_config, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, args)
def run_mp_worker(args, available_workers): new_data = True blob = make_model_and_data(args, None, new_data=new_data) model = blob["model"] balance = generate_balance_weighted(get_pipeline_parallel_group().size(), len(model), 0.8) p = pipe.Pipe( model, balance, style=Pipe.AsyncSchedule, chunks=args.chunks, worker_map=get_worker_map(), input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"), pipelined_backward=args.pipelined_backward, checkpoint=args.checkpoint, # loss_fn=blob["criterion"], ) if torch.cuda.is_available(): p = p.cuda() if args.all_at_once and p.pipeline: print(f"running all at once") p.pipeline.all_at_once = True if new_data: train(blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args) else: ntokens, train_data, val_data, test_data = blob["data"] benchmark_language_model(train_data, val_data, test_data, p, criterion, optimizer, ntokens, args)
def run_mp_worker(args, available_workers):
    benchmark_config = create_benchmark_config(args.model_name)
    model_specs = get_model_specs(args.model_name)
    model_config = create_model_config(args, benchmark_config=benchmark_config, model_specs=model_specs)
    model = model_config["model"]

    balance = generate_balance(get_pipeline_parallel_group().size(), len(model))
    pipe_model = MultiProcessPipe(
        model,
        balance,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
        checkpoint=args.checkpoint,
        # TODO(anj-s): Do we need to comment this out?
        loss_fn=benchmark_config["criterion"],
    )
    if torch.cuda.is_available():
        pipe_model = pipe_model.cuda()

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, model_specs, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, model_specs, args)
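
# The `balance` argument used by the workers above is a per-stage layer count: a list with
# one entry per pipeline stage whose entries sum to len(model). A minimal, illustrative
# even-split helper (hypothetical; not fairscale's generate_balance / generate_balance_weighted):
def even_balance_sketch(num_stages: int, num_layers: int) -> List[int]:
    base, remainder = divmod(num_layers, num_stages)
    # earlier stages absorb the remainder, one extra layer each
    return [base + (1 if i < remainder else 0) for i in range(num_stages)]

# e.g. even_balance_sketch(4, 10) -> [3, 3, 2, 2]
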
def rpc_megatron_reuse():
    from fairscale.nn.model_parallel import layers
    from fairscale.nn.model_parallel.initialize import destroy_model_parallel, initialize_model_parallel

    def make_model_simple():
        return [
            layers.ColumnParallelLinear(10, 10),
            nn.ReLU(),
            layers.RowParallelLinear(10, 10),
            nn.ReLU(),
            layers.ColumnParallelLinear(10, 10),
            nn.ReLU(),
            layers.RowParallelLinear(10, 10),
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU(),
        ]

    def make_model_with_reuse():
        column = layers.ColumnParallelLinear(10, 10)
        row = layers.RowParallelLinear(10, 10)
        return [
            column,
            nn.ReLU(),
            row,
            nn.ReLU(),
            column,
            nn.ReLU(),
            row,
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU(),
        ]

    destroy_model_parallel()
    torch.distributed.destroy_process_group()
    torch.distributed.init_process_group(
        "gloo", rank=int(os.environ["RANK"]), world_size=int(os.environ["WORLD_SIZE"])
    )
    initialize_model_parallel(2, 3, model_parallel_backend="nccl", pipeline_backend="mpi")

    init_rpc()
    if get_pipeline_parallel_group().rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    check_pipe_against_reference([4, 4, 2], make_model_simple, "always")
    check_pipe_against_reference([4, 2, 2], make_model_with_reuse)

    rpc.shutdown()
    torch.distributed.barrier()
def callback_with_model(callback: Callable[[Any, MultiProcessPipe], None], ctx: Any) -> None:
    try:
        group = get_pipeline_parallel_group()  # FIXME(tom) handle dynamic group
        set_device_based_on_group(group)

        with PipeModel.lock:
            callback(ctx, PipeModel)
    except Exception as e:
        print(f"callback_with_model got {e}")
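
# For illustration, a callback compatible with callback_with_model above: it receives the
# caller's context object and the local MultiProcessPipe stage. Hypothetical example, not
# part of the original code:
def _print_stage_param_count(ctx: Any, model: MultiProcessPipe) -> None:
    num_params = sum(p.numel() for p in model.parameters())
    print(f"[{ctx}] local pipeline stage holds {num_params} parameters")
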
def _register_remote_model(args: List[Any], kwargs: Dict[str, Any]) -> None:
    group = get_pipeline_parallel_group()  # FIXME(tom) handle dynamic group
    set_device_based_on_group(group)
    kwargs["group"] = group
    kwargs["input_device"] = torch.device("cuda", torch.cuda.current_device())
    model = MultiProcessPipe(*args, **kwargs)
    model.cuda()
    global PipeModel
    PipeModel = model
def __init__(self, *args: Any, **kwargs: Any):
    super().__init__()
    self.group = cast(ProcessGroup, kwargs.get("group")) or get_pipeline_parallel_group()
    assert self.group.rank() == 0
    self.lock = Lock()

    if True:
        assert (
            self.group == get_pipeline_parallel_group()
        ), "Can't pickle groups, so group must be `get_pipeline_parallel_group()`"
        kwargs["group"] = None
    else:
        kwargs["group"] = self.group

    kwargs["style"] = MultiProcessPipe.AsyncSchedule
    kwargs["input_device"] = torch.device("cuda", torch.cuda.current_device())

    self.model = MultiProcessPipe(*args, **kwargs)
    self.worker_map = kwargs["worker_map"]
    self._foreach_worker(self._register_remote_model, args=(args, kwargs))
    self.model.cuda()
def _recv_result(model: Pipe, shapes: SizeOrSizes, dtypes: DtypeOrDtypes, message: PipeMessage) -> TensorOrTensors:
    group = get_pipeline_parallel_group()
    set_device_based_on_group(group)

    assert model.pipeline
    transport = model.pipeline.transport

    if isinstance(shapes, torch.Size):
        message.tensor_shapes = [cast(torch.Size, shapes)]
        message.tensor_dtypes = [cast(torch.dtype, dtypes)]
        message = transport.recv_message_tensors(message)
        return message.tensors[0]
    else:
        message.tensor_shapes = cast(List[torch.Size], shapes)
        message.tensor_dtypes = cast(List[torch.dtype], dtypes)
        message = transport.recv_message_tensors(message)
        return message.tensors
def _send_result_and_do_backwards(training: bool, message: PipeMessage, grads_message: PipeMessage) -> None:
    group = get_pipeline_parallel_group()
    set_device_based_on_group(group)
    result = PipeResult
    model = PipeModel

    if isinstance(result, torch.Tensor):
        result = tuple([result])

    message.tensors = tuple(result)
    assert model.pipeline
    transport = model.pipeline.transport
    transport.send_message(message, sync=False, skip_header=True)

    if training:
        grads_message.tensor_shapes = [r.shape for r in result]
        grads_message.tensor_dtypes = [r.dtype for r in result]
        grads_message = transport.recv_message_tensors(grads_message)

        with model.lock:
            torch.autograd.backward(result, grads_message.tensors, retain_graph=True)
def run_test_pipe(rank, world_size, filename, filename_rpc, skip_dist_init=False):
    pipe_world_size = 2

    if world_size == 1:
        return

    if not skip_dist_init:
        dist_init(rank, world_size, filename, filename_rpc)
    else:
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "29502"
        rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)

    mpu.initialize_model_parallel(world_size // pipe_world_size, pipe_world_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    if torch.distributed.get_rank() == 0:
        print(
            "> testing Sequential + MultiProcessPipe with model parallel size: {}, pipe: {}".format(
                model_parallel_size, pipe_world_size
            )
        )
    chunk_size = 4

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 3
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 7
    output_size = output_size_coeff * model_parallel_size
    batch_size = 3 * chunk_size

    target = torch.rand((batch_size, input_size), requires_grad=True).cuda()
    print(f"target = {target}")

    identity = IdentityLayer2D(batch_size, input_size).cuda()

    pipeline_devices = mpu.get_pipeline_parallel_group()

    set_random_seed(seed)
    model = nn.Sequential(
        layers.ColumnParallelLinear(input_size, output_size, keep_master_weight_for_test=True, bias=False).cuda(),
        nn.ReLU(),
        layers.RowParallelLinear(output_size, input_size, keep_master_weight_for_test=True, bias=False).cuda(),
    )
    set_random_seed(seed)

    reference = [
        nn.Linear(input_size, output_size, bias=False).cuda(),
        nn.ReLU(),
        nn.Linear(output_size, input_size, bias=False).cuda(),
    ]

    print(f"setup {reference[0].weight.size()}, {model[0].weight.size()}, {(input_size, output_size)}")
    print(f"setup {reference[2].weight.size()}, {(output_size, input_size)}")

    reference[0].weight = Parameter(model[0].get_master_weight().clone()).cuda()
    reference[2].weight = Parameter(model[2].get_master_weight().clone()).cuda()

    reference = nn.Sequential(*reference)

    def grad_graph(depth, grad):
        result = depth * " " + str(grad)
        if grad:
            for x in grad.next_functions:
                result += "\n" + grad_graph(depth + 1, x[0])
        return result

    def check_weights(x, y, key: str, index=None):
        for i in [2, 0]:
            if index is not None and i != index:
                continue
            left = x[i].get_master_weight()
            right = y[i].weight.data
            if not torch.allclose(left, right, atol=1.0e-6) or index is not None:
                print(f"check_weights {key}-{i}: left = {left}, \nright = {right}")
            if not torch.equal(left, right):
                print(f"check_weights NOT_EQUAL {key}-{i}: left = {left}, \nright = {right}")
            assert torch.allclose(left, right, atol=1.0e-6)

    def dump_opt_params(opt):
        for i, group in enumerate(opt.param_groups):
            for j, p in enumerate(group["params"]):
                print(f"{torch.distributed.get_rank()}:param {(i,j)} = {p}")
                print(f"{torch.distributed.get_rank()}:param.grad {(i,j)} = {p.grad}")

    def forward_model(model_, target, step=False):
        optimizer = torch.optim.SGD(model_.parameters(), lr=0.01, momentum=0.9)
        optimizer.zero_grad()
        model_.zero_grad()
        output = model_(identity())
        loss = nn.MSELoss()
        model_.zero_grad()
        if step:
            loss(output, target).backward()
            saved_weight_0 = model_[0].weight.data.clone()
            saved_weight_2 = model_[2].weight.data.clone()
            dump_opt_params(optimizer)
            optimizer.step()
            assert not torch.allclose(saved_weight_0, model_[0].weight.data, atol=1.0e-6)
            assert not torch.allclose(saved_weight_2, model_[2].weight.data, atol=1.0e-6)
        return output

    output = forward_model(model, target)
    reference_output = forward_model(reference, target)

    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    check_weights(model, reference, "before")
    saved_weight_0 = model[0].weight.data.clone()
    saved_weight_2 = model[2].weight.data.clone()
    output = forward_model(model, target, step=True)
    error = reference_output.sub(output).max()
    assert error < 1.0e-6
    model[0].weight.data = saved_weight_0
    model[2].weight.data = saved_weight_2

    worker_map = {i: f"Test{i}" for i in range(torch.distributed.get_world_size())}

    if pipe_world_size == 2:
        print("actually doing pipe stuff now")
        assert torch.equal(saved_weight_0, model[0].weight.data)
        assert torch.equal(saved_weight_2, model[2].weight.data)
        pipe_model = MultiProcessPipe(
            model,
            [2, 1],
            group=pipeline_devices,
            worker_map=worker_map,
            input_device=torch.cuda.current_device(),
            chunks=chunk_size,
        ).cuda()
        torch.distributed.barrier()
        pipe_rank = torch.distributed.get_rank(group=mpu.get_pipeline_parallel_group())
        print(f"pipe rank is {pipe_rank}")
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
        else:
            if not torch.equal(saved_weight_2, pipe_model[0].weight.data):
                print(f"ne {pipe_rank}: left\n{saved_weight_2}\nright:\n{pipe_model[0].weight.data}")
            assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
        optimizer = torch.optim.SGD(pipe_model.parameters(), lr=0.01, momentum=0.9)
        optimizer.zero_grad()
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")
        else:
            assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            check_weights(model, reference, "pre-pipe", index=2)
        else:
            check_weights(model, reference, "pre-pipe", index=0)

        pipe_output = pipe_model(identity())
        print(f"exited pipe for {rank}")
        forward_model(reference, target, step=True)

        print(f"pipe_output {rank} = {pipe_output}")
        print(f"reference_output {rank} = {reference_output}")

        torch.distributed.barrier()

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            error = reference_output.sub(pipe_output.cuda()).max()
            if error >= 1.0e-6:
                print(f"error bad {error}")
            assert error < 1.0e-6

            loss = nn.MSELoss()
            failed = False
            pipe_output.retain_grad()
            with torch.autograd.profiler.profile() as prof:
                try:
                    loss(pipe_output, target).backward()
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            optimizer.step()

            print("calling check_weights on master")
            check_weights(model, reference, "pipe", index=2)
            print(f"waiting for barrier on master, pid={os.getpid()}")
        else:
            print(f"calling backwards on slave, pid={os.getpid()}")
            failed = False
            with torch.autograd.profiler.profile() as prof:
                try:
                    pipe_model.back_helper(pipe_output)
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            print("calling step on slave")
            optimizer.step()
            print("calling check_weights on slave")
            check_weights(model, reference, "pipe", index=0)
            print("waiting for barrier on slave")

        pipe_model.zero_grad()
        torch.distributed.barrier()

        pipe_model.eval()
        pipe_output = pipe_model(identity())
        updated_ref_output = forward_model(reference, target)
        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            error = updated_ref_output.sub(pipe_output.cuda()).max()
            print(f"outputs are ref:\n{updated_ref_output}\npipe:\n{pipe_output}")
            assert error < 1.0e-6
        torch.distributed.barrier()

        print(f"finished waiting for barrier on, pid={os.getpid()}")

    print(f"really exited pipe for {rank}")

    rpc.shutdown()
    torch.distributed.destroy_process_group()
def run_test_pipe(rank, model_parallel_size):
    pipe_world_size = 2

    dist_init(rank, model_parallel_size)
    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print(
            "> testing Sequential + Pipe with model parallel size: {}, pipe: {}".format(
                model_parallel_size, pipe_world_size
            )
        )
    model_parallel_size = mpu.get_model_parallel_world_size()

    chunk_size = 8

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7 * chunk_size

    identity = IdentityLayer2D(batch_size, input_size).cuda()

    pipeline_devices = mpu.get_pipeline_parallel_group()
    if pipe_world_size == 2 and len(pipeline_devices) == 1:
        pipeline_devices.append(pipeline_devices[0] + model_parallel_size)

    set_random_seed(seed)
    model = nn.Sequential(
        layers.ColumnParallelLinear(input_size, output_size, keep_master_weight_for_test=True, bias=False).cuda(),
        nn.ReLU(),
        layers.RowParallelLinear(output_size, input_size, keep_master_weight_for_test=True, bias=False).cuda(),
    )

    set_random_seed(seed)
    reference = nn.Sequential(
        nn.Linear(input_size, output_size, bias=False).cuda(),
        nn.ReLU(),
        nn.Linear(output_size, input_size, bias=False).cuda(),
    )
    reference[0].weight.data = model[0].master_weight.cuda()
    reference[-1].weight.data = model[-1].master_weight.cuda()

    loss_weight = torch.randn([batch_size, output_size]).cuda()
    output = model(identity())
    reference_output = reference(identity())

    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    if pipe_world_size == 2:
        pipe_model = Pipe(model, [2, 1], devices=pipeline_devices, chunks=chunk_size)
        torch.distributed.barrier()
        pipe_output = pipe_model(identity())
        error = reference_output.sub(pipe_output.cuda()).max()
        torch.distributed.barrier()
        assert error < 1.0e-6