Example #1
def infiniband2():
    # Rank 0 sends a tensor holding 0..99 to rank 1 over the pipeline-parallel
    # group; the receiving rank fills a preallocated buffer of the same shape.
    if torch.distributed.get_rank() == 0:
        t = torch.Tensor(range(100)).cuda()
        torch.distributed.send(t, 1, group=get_pipeline_parallel_group())
    else:
        t = torch.empty(100).cuda()
        torch.distributed.recv(t, 0, group=get_pipeline_parallel_group())

    assert torch.equal(t, torch.Tensor(range(100)).cuda())
    print(f"t on {torch.distributed.get_rank()} is {t}")
Example #2
def run_mp_worker(args, available_workers):
    new_data = True

    blob = make_model_and_data(args, None, new_data=new_data)
    model = blob["model"]

    # Split the model's layers across the pipeline-parallel ranks.
    balance = generate_balance(get_pipeline_parallel_group().size(),
                               len(model))
    p = pipe.AMPnetPipe(
        module=model,
        balance=balance,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda")
        if torch.cuda.is_available() else torch.device("cpu"),
        checkpoint=args.checkpoint,
    )
    if torch.cuda.is_available():
        p = p.cuda()

    if new_data:
        train(blob["data"], p, blob["criterion"], blob["optimizer"],
              blob["vocab_size"], args)
    else:
        ntokens, train_data, val_data, test_data = blob["data"]
        benchmark_language_model(train_data, val_data, test_data, p,
                                 blob["criterion"], blob["optimizer"], ntokens,
                                 args)
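generate_balance and generate_balance_weighted are benchmark helpers that are not shown in these excerpts; they return a list with one layer count per pipeline stage. Purely as an illustration (a hypothetical helper, not the fairscale implementation), an even split could look like this:

from typing import List

def even_balance(num_stages: int, num_layers: int) -> List[int]:
    # Hypothetical helper: spread num_layers over num_stages as evenly as
    # possible, e.g. even_balance(3, 10) -> [4, 3, 3].
    base, extra = divmod(num_layers, num_stages)
    return [base + (1 if i < extra else 0) for i in range(num_stages)]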
Example #3
def run_mp_worker(args, available_workers):

    benchmark_config = create_benchmark_config(args.model_name)
    model_config = create_model_config(args, config=benchmark_config)
    model = model_config["model"]

    balance = generate_balance_weighted(get_pipeline_parallel_group().size(), len(model), 0.8)
    pipe_model = MultiProcessPipe(
        model,
        balance,
        style=MultiProcessPipe.AsyncSchedule,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"),
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
        # TODO(anj-s): Do we need to comment this out? loss_fn=benchmark_config["criterion"],
    )
    if torch.cuda.is_available():
        pipe_model = pipe_model.cuda()
    if args.all_at_once and pipe_model.pipeline:
        print(f"running all at once")
        pipe_model.pipeline.all_at_once = True

    if args.use_synthetic_data:
        train(model_config, pipe_model, benchmark_config, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config, args)
Example #4
def run_mp_worker(args, available_workers):
    new_data = True

    blob = make_model_and_data(args, None, new_data=new_data)
    model = blob["model"]

    balance = generate_balance_weighted(get_pipeline_parallel_group().size(),
                                        len(model), 0.8)
    p = pipe.Pipe(
        model,
        balance,
        style=Pipe.AsyncSchedule,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda")
        if torch.cuda.is_available() else torch.device("cpu"),
        pipelined_backward=args.pipelined_backward,
        checkpoint=args.checkpoint,
        # loss_fn=blob["criterion"],
    )
    if torch.cuda.is_available():
        p = p.cuda()
    if args.all_at_once and p.pipeline:
        print(f"running all at once")
        p.pipeline.all_at_once = True

    if new_data:
        train(blob["data"], p, blob["criterion"], blob["optimizer"],
              blob["vocab_size"], args)
    else:
        ntokens, train_data, val_data, test_data = blob["data"]
        benchmark_language_model(train_data, val_data, test_data, p,
                                 blob["criterion"], blob["optimizer"], ntokens,
                                 args)
Example #5
def run_mp_worker(args, available_workers):

    benchmark_config = create_benchmark_config(args.model_name)
    model_specs = get_model_specs(args.model_name)
    model_config = create_model_config(args,
                                       benchmark_config=benchmark_config,
                                       model_specs=model_specs)
    model = model_config["model"]

    balance = generate_balance(get_pipeline_parallel_group().size(),
                               len(model))
    pipe_model = MultiProcessPipe(
        model,
        balance,
        chunks=args.chunks,
        worker_map=get_worker_map(),
        input_device=torch.device("cuda")
        if torch.cuda.is_available() else torch.device("cpu"),
        checkpoint=args.checkpoint,
        # TODO(anj-s): Do we need to comment this out? loss_fn=benchmark_config["criterion"],
    )
    if torch.cuda.is_available():
        pipe_model = pipe_model.cuda()

    if args.dry_run:
        train(model_config, pipe_model, benchmark_config, model_specs, args)
    else:
        benchmark_language_model(model_config, pipe_model, benchmark_config,
                                 model_specs, args)
Example #6
def rpc_megatron_reuse():

    from fairscale.nn.model_parallel import layers
    from fairscale.nn.model_parallel.initialize import destroy_model_parallel, initialize_model_parallel

    def make_model_simple():
        return [
            layers.ColumnParallelLinear(10, 10),
            nn.ReLU(),
            layers.RowParallelLinear(10, 10),
            nn.ReLU(),
            layers.ColumnParallelLinear(10, 10),
            nn.ReLU(),
            layers.RowParallelLinear(10, 10),
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU(),
        ]

    def make_model_with_reuse():
        column = layers.ColumnParallelLinear(10, 10)
        row = layers.RowParallelLinear(10, 10)
        return [
            column,
            nn.ReLU(),
            row,
            nn.ReLU(),
            column,
            nn.ReLU(),
            row,
            nn.ReLU(),
            nn.Linear(10, 10),
            nn.ReLU(),
        ]

    # Tear down any existing groups, then rebuild them: a "gloo" global group
    # with 2-way model parallelism and a 3-stage pipeline on top of it.
    destroy_model_parallel()
    torch.distributed.destroy_process_group()
    torch.distributed.init_process_group("gloo",
                                         rank=int(os.environ["RANK"]),
                                         world_size=int(os.environ["WORLD_SIZE"]))
    initialize_model_parallel(2,
                              3,
                              model_parallel_backend="nccl",
                              pipeline_backend="mpi")

    init_rpc()
    if get_pipeline_parallel_group().rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    check_pipe_against_reference([4, 4, 2], make_model_simple, "always")
    check_pipe_against_reference([4, 2, 2], make_model_with_reuse)

    rpc.shutdown()
    torch.distributed.barrier()
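Example #6 reads RANK and WORLD_SIZE from the environment and relies on helpers such as init_rpc and check_pipe_against_reference that live in the same test module. Purely as a sketch of how such a worker could be launched per process (assuming those helpers are importable and the required backends are available):

import os

import torch
import torch.multiprocessing as mp

def _worker(rank: int, world_size: int) -> None:
    # Provide the variables rpc_megatron_reuse() reads, plus a rendezvous address.
    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    # Create an initial process group; rpc_megatron_reuse() tears it down and
    # rebuilds the groups itself.
    torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
    rpc_megatron_reuse()

if __name__ == "__main__":
    world_size = 6  # 2-way model parallel x 3-stage pipeline, as in the example
    mp.spawn(_worker, args=(world_size,), nprocs=world_size)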
Example #7
def callback_with_model(callback: Callable[[Any, MultiProcessPipe], None], ctx: Any) -> None:
    try:
        group = get_pipeline_parallel_group()  # FIXME(tom) handle dynamic group
        set_device_based_on_group(group)

        with PipeModel.lock:
            callback(ctx, PipeModel)
    except Exception as e:
        print(f"callback_with_model got {e}")
Example #8
def _register_remote_model(args: List[Any], kwargs: Dict[str, Any]) -> None:
    group = get_pipeline_parallel_group()  # FIXME(tom) handle dynamic group
    set_device_based_on_group(group)
    kwargs["group"] = group
    kwargs["input_device"] = torch.device("cuda", torch.cuda.current_device())
    model = MultiProcessPipe(*args, **kwargs)
    model.cuda()
    global PipeModel
    PipeModel = model
Example #9
    def __init__(self, *args: Any, **kwargs: Any):
        super().__init__()
        self.group = cast(ProcessGroup, kwargs.get("group")) or get_pipeline_parallel_group()
        assert self.group.rank() == 0
        self.lock = Lock()

        # Process groups can't be pickled for the RPC calls made below, so require
        # the default pipeline group here and send `None`; remote workers look the
        # group up locally (see _register_remote_model).
        assert (
            self.group == get_pipeline_parallel_group()
        ), "Can't pickle groups, so group must be `get_pipeline_parallel_group()`"
        kwargs["group"] = None

        kwargs["style"] = MultiProcessPipe.AsyncSchedule
        kwargs["input_device"] = torch.device("cuda", torch.cuda.current_device())

        self.model = MultiProcessPipe(*args, **kwargs)
        self.worker_map = kwargs["worker_map"]
        self._foreach_worker(self._register_remote_model, args=(args, kwargs))
        self.model.cuda()
Example #10
    def _recv_result(model: Pipe, shapes: SizeOrSizes, dtypes: DtypeOrDtypes,
                     message: PipeMessage) -> TensorOrTensors:
        group = get_pipeline_parallel_group()
        set_device_based_on_group(group)

        assert model.pipeline
        transport = model.pipeline.transport

        if isinstance(shapes, torch.Size):
            message.tensor_shapes = [cast(torch.Size, shapes)]
            message.tensor_dtypes = [cast(torch.dtype, dtypes)]
            message = transport.recv_message_tensors(message)
            return message.tensors[0]
        else:
            message.tensor_shapes = cast(List[torch.Size], shapes)
            message.tensor_dtypes = cast(List[torch.dtype], dtypes)
            message = transport.recv_message_tensors(message)
            return message.tensors
Example #11
    def _send_result_and_do_backwards(training: bool, message: PipeMessage, grads_message: PipeMessage) -> None:
        group = get_pipeline_parallel_group()
        set_device_based_on_group(group)
        result = PipeResult
        model = PipeModel

        if isinstance(result, torch.Tensor):
            result = tuple([result])

        message.tensors = tuple(result)
        assert model.pipeline
        transport = model.pipeline.transport
        transport.send_message(message, sync=False, skip_header=True)

        if training:
            grads_message.tensor_shapes = [r.shape for r in result]
            grads_message.tensor_dtypes = [r.dtype for r in result]
            grads_message = transport.recv_message_tensors(grads_message)

            with model.lock:
                torch.autograd.backward(result, grads_message.tensors, retain_graph=True)
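The heart of _send_result_and_do_backwards is calling torch.autograd.backward with gradients that arrived over the transport rather than gradients produced by a local loss. A minimal local illustration of that call pattern (no pipeline or transport involved; the tensors are stand-ins):

import torch

x = torch.randn(4, requires_grad=True)
result = (x * 2.0, x + 1.0)                # stand-ins for the stage outputs
grads = (torch.ones(4), torch.ones(4))     # stand-ins for grads_message.tensors
torch.autograd.backward(result, grads, retain_graph=True)
print(x.grad)                              # each element is 2*1 + 1*1 = 3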
Example #12
def run_test_pipe(rank,
                  world_size,
                  filename,
                  filename_rpc,
                  skip_dist_init=False):
    pipe_world_size = 2

    if world_size == 1:
        return

    if not skip_dist_init:
        dist_init(rank, world_size, filename, filename_rpc)
    else:
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "29502"
        rpc.init_rpc(f"Test{rank}", rank=rank, world_size=world_size)

    mpu.initialize_model_parallel(world_size // pipe_world_size,
                                  pipe_world_size)
    model_parallel_size = mpu.get_model_parallel_world_size()
    if torch.distributed.get_rank() == 0:
        print(
            "> testing Sequential + MultiProcessPipe with model parallel size: {}, pipe: {}"
            .format(model_parallel_size, pipe_world_size))
    chunk_size = 4

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 3
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 7
    output_size = output_size_coeff * model_parallel_size
    batch_size = 3 * chunk_size

    target = torch.rand((batch_size, input_size), requires_grad=True).cuda()
    print(f"target = {target}")

    identity = IdentityLayer2D(batch_size, input_size).cuda()

    pipeline_devices = mpu.get_pipeline_parallel_group()

    set_random_seed(seed)
    model = nn.Sequential(
        layers.ColumnParallelLinear(input_size,
                                    output_size,
                                    keep_master_weight_for_test=True,
                                    bias=False).cuda(),
        nn.ReLU(),
        layers.RowParallelLinear(output_size,
                                 input_size,
                                 keep_master_weight_for_test=True,
                                 bias=False).cuda(),
    )
    set_random_seed(seed)

    reference = [
        nn.Linear(input_size, output_size, bias=False).cuda(),
        nn.ReLU(),
        nn.Linear(output_size, input_size, bias=False).cuda(),
    ]

    print(
        f"setup {reference[0].weight.size()}, {model[0].weight.size()}, {(input_size, output_size)}"
    )
    print(f"setup {reference[2].weight.size()}, {(output_size, input_size)}")

    reference[0].weight = Parameter(
        model[0].get_master_weight().clone().cuda())
    reference[2].weight = Parameter(
        model[2].get_master_weight().clone().cuda())

    reference = nn.Sequential(*reference)

    def grad_graph(depth, grad):
        result = depth * " " + str(grad)
        if grad:
            for x in grad.next_functions:
                result += "\n" + grad_graph(depth + 1, x[0])
        return result

    def check_weights(x, y, key: str, index=None):
        for i in [2, 0]:
            if index is not None and i != index:
                continue
            left = x[i].get_master_weight()
            right = y[i].weight.data
            if not torch.allclose(left, right,
                                  atol=1.0e-6) or index is not None:
                print(
                    f"check_weights {key}-{i}: left = {left}, \nright = {right}"
                )
            if not torch.equal(left, right):
                print(
                    f"check_weights NOT_EQUAL {key}-{i}: left = {left}, \nright = {right}"
                )
            assert torch.allclose(left, right, atol=1.0e-6)

    def dump_opt_params(opt):
        for i, group in enumerate(opt.param_groups):
            for j, p in enumerate(group["params"]):
                print(f"{torch.distributed.get_rank()}:param {(i,j)} = {p}")
                print(
                    f"{torch.distributed.get_rank()}:param.grad {(i,j)} = {p.grad}"
                )

    def forward_model(model_, target, step=False):
        optimizer = torch.optim.SGD(model_.parameters(), lr=0.01, momentum=0.9)
        optimizer.zero_grad()
        model_.zero_grad()
        output = model_(identity())
        loss = nn.MSELoss()
        model_.zero_grad()
        if step:
            loss(output, target).backward()
            saved_weight_0 = model_[0].weight.data.clone()
            saved_weight_2 = model_[2].weight.data.clone()
            dump_opt_params(optimizer)
            optimizer.step()
            assert not torch.allclose(
                saved_weight_0, model_[0].weight.data, atol=1.0e-6)
            assert not torch.allclose(
                saved_weight_2, model_[2].weight.data, atol=1.0e-6)
        return output

    output = forward_model(model, target)
    reference_output = forward_model(reference, target)

    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    output = forward_model(model, target)
    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    check_weights(model, reference, "before")
    saved_weight_0 = model[0].weight.data.clone()
    saved_weight_2 = model[2].weight.data.clone()
    output = forward_model(model, target, step=True)
    error = reference_output.sub(output).max()
    assert error < 1.0e-6
    model[0].weight.data = saved_weight_0
    model[2].weight.data = saved_weight_2

    worker_map = {
        i: f"Test{i}"
        for i in range(torch.distributed.get_world_size())
    }

    if pipe_world_size == 2:
        print("actually doing pipe stuff now")
        assert torch.equal(saved_weight_0, model[0].weight.data)
        assert torch.equal(saved_weight_2, model[2].weight.data)
        pipe_model = MultiProcessPipe(
            model,
            [2, 1],
            group=pipeline_devices,
            worker_map=worker_map,
            input_device=torch.cuda.current_device(),
            chunks=chunk_size,
        ).cuda()
        torch.distributed.barrier()
        pipe_rank = torch.distributed.get_rank(
            group=mpu.get_pipeline_parallel_group())
        print(f"pipe rank is {pipe_rank}")
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
        else:
            if not torch.equal(saved_weight_2, pipe_model[0].weight.data):
                print(
                    f"ne {pipe_rank}: left\n{saved_weight_2}\nright:\n{pipe_model[0].weight.data}"
                )
                assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
        optimizer = torch.optim.SGD(pipe_model.parameters(),
                                    lr=0.01,
                                    momentum=0.9)
        optimizer.zero_grad()
        if pipe_rank == 0:
            assert torch.equal(saved_weight_0, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")
        else:
            assert torch.equal(saved_weight_2, pipe_model[0].weight.data)
            print(f"runner {rank}:\n{pipe_model[0].weight.data}")

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            check_weights(model, reference, "pre-pipe", index=2)
        else:
            check_weights(model, reference, "pre-pipe", index=0)

        pipe_output = pipe_model(identity())
        print(f"exited pipe for {rank}")
        forward_model(reference, target, step=True)

        print(f"pipe_output {rank} = {pipe_output}")
        print(f"reference_output {rank} = {reference_output}")

        torch.distributed.barrier()

        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            error = reference_output.sub(pipe_output.cuda()).max()
            if error >= 1.0e-6:
                print(f"error bad {error}")
            assert error < 1.0e-6

            loss = nn.MSELoss()
            failed = False
            pipe_output.retain_grad()
            with torch.autograd.profiler.profile() as prof:
                try:
                    loss(pipe_output, target).backward()
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            optimizer.step()

            print("calling check_weights on master")
            check_weights(model, reference, "pipe", index=2)
            print(f"waiting for barrier on master, pid={os.getpid()}")
        else:
            print(f"calling backwards on slave, pid={os.getpid()}")
            failed = False
            with torch.autograd.profiler.profile() as prof:
                try:
                    pipe_model.back_helper(pipe_output)
                except Exception as e:
                    failed = True
                    print(f"got {e} while doing backward, deadlock?")
            if failed:
                raise RuntimeError("failed somehow")
            dump_opt_params(optimizer)
            print("calling step on slave")
            optimizer.step()
            print("calling check_weights on slave")
            check_weights(model, reference, "pipe", index=0)
            print("waiting for barrier on slave")

        pipe_model.zero_grad()
        torch.distributed.barrier()

        pipe_model.eval()
        pipe_output = pipe_model(identity())
        updated_ref_output = forward_model(reference, target)
        if torch.distributed.get_rank(mpu.get_pipeline_parallel_group()) == 1:
            error = updated_ref_output.sub(pipe_output.cuda()).max()
            print(
                f"outputs are ref:\n{updated_ref_output}\npipe:\n{pipe_output}"
            )
            assert error < 1.0e-6
        torch.distributed.barrier()

        print(f"finished waiting for barrier on, pid={os.getpid()}")

    print(f"really exited pipe for {rank}")

    rpc.shutdown()
    torch.distributed.destroy_process_group()
Example #13
def run_test_pipe(rank, model_parallel_size):
    pipe_world_size = 2
    dist_init(rank, model_parallel_size)

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print(
            "> testing Sequential + Pipe with model parallel size: {}, pipe: {}"
            .format(model_parallel_size, pipe_world_size))
    model_parallel_size = mpu.get_model_parallel_world_size()
    chunk_size = 8

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7 * chunk_size

    identity = IdentityLayer2D(batch_size, input_size).cuda()

    pipeline_devices = mpu.get_pipeline_parallel_group()
    if pipe_world_size == 2 and len(pipeline_devices) == 1:
        pipeline_devices.append(pipeline_devices[0] + model_parallel_size)

    set_random_seed(seed)
    model = nn.Sequential(
        layers.ColumnParallelLinear(input_size,
                                    output_size,
                                    keep_master_weight_for_test=True,
                                    bias=False).cuda(),
        nn.ReLU(),
        layers.RowParallelLinear(output_size,
                                 input_size,
                                 keep_master_weight_for_test=True,
                                 bias=False).cuda(),
    )

    set_random_seed(seed)
    reference = nn.Sequential(
        nn.Linear(input_size, output_size, bias=False).cuda(),
        nn.ReLU(),
        nn.Linear(output_size, input_size, bias=False).cuda(),
    )

    reference[0].weight.data = model[0].master_weight.cuda()
    reference[-1].weight.data = model[-1].master_weight.cuda()

    loss_weight = torch.randn([batch_size, output_size]).cuda()
    output = model(identity())
    reference_output = reference(identity())

    error = reference_output.sub(output).max()
    torch.distributed.barrier()
    assert error < 1.0e-6

    if pipe_world_size == 2:
        pipe_model = Pipe(model, [2, 1],
                          devices=pipeline_devices,
                          chunks=chunk_size)
        torch.distributed.barrier()
        pipe_output = pipe_model(identity())

        error = reference_output.sub(pipe_output.cuda()).max()
        torch.distributed.barrier()
        assert error < 1.0e-6