Пример #1
0
def test_simple_linears():
    """Check that a Pipe-wrapped model produces the same gradients as the plain model."""

    def total_grad(params):
        # Sum every gradient element over parameters that actually have a grad.
        return sum(p.grad.sum() for p in params if p.grad is not None)

    def clear_grads(params):
        for p in params:
            p.grad = None

    inputs = torch.rand(8, 1)
    model = nn.Sequential(
        nn.Linear(1, 2),
        nn.Linear(2, 4),
        nn.Linear(4, 2),
        nn.Linear(2, 1),
    )

    # Baseline: plain sequential forward/backward.
    loss = model(inputs).mean()
    loss.backward()
    grad_without_pipe = total_grad(model.parameters())

    clear_grads(model.parameters())

    # Same model wrapped in Pipe: two 2-layer partitions on CPU, 4 micro-batches.
    model = Pipe(model, [2, 2], devices=["cpu", "cpu"], chunks=4)
    loss = model(inputs).mean()
    loss.backward()
    grad_with_pipe = total_grad(model.parameters())

    # Both grads should be identical.
    assert torch.allclose(grad_with_pipe, grad_without_pipe)
Пример #2
0
def _train_pipe_model(model, use_fp16=False, checkpoint="never", chunks=1):
    """Train a deep copy of *model* wrapped in Pipe, one layer per visible GPU.

    The copy keeps the caller's model untouched. Returns whatever `_train`
    returns for the Pipe-wrapped model and its SGD optimizer.
    """
    n_gpus = torch.cuda.device_count()
    pipe_model = Pipe(
        copy.deepcopy(model),
        balance=[1] * n_gpus,          # one layer per device
        devices=list(range(n_gpus)),
        chunks=chunks,
        checkpoint=checkpoint,
    )
    sgd = torch.optim.SGD(pipe_model.parameters(), lr=0.001)
    return _train(pipe_model, sgd, use_fp16)
Пример #3
0
def make_model(device, ntokens):
    """Build a half-precision transformer LM wrapped in Pipe.

    Returns (pipe_model, criterion, optimizer, scaler).
    """
    ninp = 50       # embedding dimension
    nhid = 50       # the dimension of the feedforward network model in nn.TransformerEncoder
    nhead = 2       # the number of heads in the multiheadattention models
    dropout = 0
    initrange = 0.1
    lr = 0.001      # learning rate

    model = TransformerLMSequntial(ntokens, ninp, nhead, nhid, dropout, initrange).half().to(device)

    # Balance the layers over at most 4 devices; one micro-batch per partition.
    balance = generate_balance(min(num_devices, 4), len(model))
    p = Pipe(model, balance, chunks=len(balance))

    criterion = nn.CrossEntropyLoss()

    # Prefer a pure-fp16 Adam when the Precision enum exists in this build;
    # otherwise fall back to the plain constructor.
    try:
        optimizer = Adam(p.parameters(), lr=lr, precision=Precision.PURE_FP16)
    except NameError:
        optimizer = Adam(p.parameters(), lr=lr)
    scaler = GradScaler()

    return p, criterion, optimizer, scaler
Пример #4
0
def main():
    """Entry point: parse CLI args, build MNIST loaders, and train a Pipe-split model.

    Fixes vs. original:
    - `--test-batch-size` was parsed but never used (both loaders received
      `args.batch_size`); the test loader now honors it.
    - Typo in the timing log: "TRANING" -> "TRAINING".
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=14, metavar="N",
                        help="number of epochs to train (default: 14)")
    parser.add_argument("--lr", type=float, default=1.0, metavar="LR",
                        help="learning rate (default: 1.0)")
    parser.add_argument("--gamma", type=float, default=0.7, metavar="M",
                        help="Learning rate step gamma (default: 0.7)")
    parser.add_argument("--dry-run", action="store_true", default=False,
                        help="quickly check a single pass")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=10, metavar="N",
                        help="how many batches to wait before logging training status")
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    common_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
    train_kwargs = {"batch_size": args.batch_size, **common_kwargs}
    # BUGFIX: the test loader previously used the training batch size.
    test_kwargs = {"batch_size": args.test_batch_size, **common_kwargs}

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    # Split the module-level `net` into two 6-layer partitions on devices 0 and 1.
    # NOTE(review): devices=[0, 1] requires at least two CUDA devices — confirm.
    model = Pipe(net, balance=[6, 6], devices=[0, 1], chunks=2)
    device = model.devices[0]  # inputs must live on the first partition's device

    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)

    for epoch in range(1, args.epochs + 1):
        tic = time.perf_counter()
        train(args, model, device, train_loader, optimizer, epoch)
        toc = time.perf_counter()
        print(f">>> TRAINING Time {toc - tic:0.4f} seconds")

        tic = time.perf_counter()
        test(model, device, test_loader)
        toc = time.perf_counter()
        print(f">>> TESTING Time {toc - tic:0.4f} seconds")
        scheduler.step()

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")