Example #1
def train(net_D,
          net_G,
          data_iter,
          num_epochs,
          lr,
          latent_dim,
          device=d2l.try_gpu()):
    loss = torch.nn.BCEWithLogitsLoss()
    # Initialize the weights with N(0, 0.02) as in the DCGAN recipe and move
    # both networks to `device`
    for w in net_D.parameters():
        torch.nn.init.normal_(w, 0, 0.02)
    for w in net_G.parameters():
        torch.nn.init.normal_(w, 0, 0.02)
    net_D, net_G = net_D.to(device), net_G.to(device)
    trainer_hp = {'lr': lr, 'betas': [0.5, 0.999]}
    trainer_D = torch.optim.Adam(net_D.parameters(), **trainer_hp)
    trainer_G = torch.optim.Adam(net_G.parameters(), **trainer_hp)
    for epoch in range(1, num_epochs + 1):
        print('Epoch', epoch)
        # Train one epoch
        timer = d2l.Timer()
        metric = d2l.Accumulator(3)  # loss_D, loss_G, num_examples
        for X, _ in data_iter:
            print('Processing batch')
            batch_size = X.shape[0]
            Z = torch.normal(0, 1, size=(batch_size, latent_dim, 1, 1))
            X, Z = X.to(device), Z.to(device)
            metric.add(update_D(X, Z, net_D, net_G, loss, trainer_D),
                       update_G(Z, net_D, net_G, loss, trainer_G), batch_size)
        # Show the losses
        loss_D, loss_G = metric[0] / metric[2], metric[1] / metric[2]
        print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}')
    print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}, '
          f'{metric[2] / timer.stop():.1f} examples/sec on {str(device)}')
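
The loop above relies on `update_D` and `update_G`, which are not shown in this example. A minimal sketch of what such helpers can look like, following the d2l GAN recipe (the exact versions used by the author may differ):

import torch

def update_D(X, Z, net_D, net_G, loss, trainer_D):
    """Update the discriminator on one batch of real and generated images."""
    batch_size = X.shape[0]
    ones = torch.ones((batch_size,), device=X.device)
    zeros = torch.zeros((batch_size,), device=X.device)
    trainer_D.zero_grad()
    real_Y = net_D(X)
    fake_X = net_G(Z)
    # Detach so that no gradients flow into the generator here
    fake_Y = net_D(fake_X.detach())
    loss_D = (loss(real_Y, ones.reshape(real_Y.shape)) +
              loss(fake_Y, zeros.reshape(fake_Y.shape))) / 2
    loss_D.backward()
    trainer_D.step()
    return float(loss_D)

def update_G(Z, net_D, net_G, loss, trainer_G):
    """Update the generator on one batch of latent noise."""
    batch_size = Z.shape[0]
    ones = torch.ones((batch_size,), device=Z.device)
    trainer_G.zero_grad()
    # `fake_Y` must be recomputed because `net_D` has just been updated
    fake_X = net_G(Z)
    fake_Y = net_D(fake_X)
    loss_G = loss(fake_Y, ones.reshape(fake_Y.shape))
    loss_G.backward()
    trainer_G.step()
    return float(loss_G)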
Example #2
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` on the first iteration or when using random sampling
            state = net.begin_state(batch_size=X.shape[0],
                                    device=device)  # initial hidden state H0
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # `state` is a tensor for `nn.GRU`
                state.detach_()
            else:
                # `state` is a tuple of tensors for `nn.LSTM` and for the
                # scratch implementation
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)  # __call__
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            # Since the `mean` function has already been invoked
            updater(batch_size=1)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
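
This function calls `grad_clipping(net, 1)` before every parameter update. A sketch of such a gradient-clipping helper in the d2l style (the surrounding chapter defines its own version):

import torch
from torch import nn

def grad_clipping(net, theta):
    """Clip the gradient so that its global L2 norm is at most `theta`."""
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:  # scratch implementations keep their parameters in `net.params`
        params = net.params
    norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm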
Example #3
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)
    for X, Y in train_iter:
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                state.detach_()
            else:
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            updater(batch_size=1)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
Example #4
def train_epoch_ch8(model, train_iter, loss, updater, device,  #@save
                    use_random_iter):
    """Train a model within one epoch (defined in Chapter 8)."""
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` when either it is the first iteration or
            # using random sampling
            state = model.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(model, nn.Module) and not isinstance(state, tuple):
                # `state` is a tensor for `nn.GRU`
                state.detach_()
            else:
                # `state` is a tuple of tensors for `nn.LSTM` and
                # for our custom scratch implementation
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = model(X, state)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(model, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(model, 1)
            # Since the `mean` function has been invoked
            updater(batch_size=1)
        metric.add(l * d2l.size(y), d2l.size(y))
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
Example #5
def train(net, data_iter, lr, num_epochs, device=d2l.try_gpu()):
    def init_weights(m):
        if type(m) == nn.Embedding:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            xlim=[1, num_epochs])
    metric = d2l.Accumulator(2)  # sum of losses, no. of tokens

    for epoch in range(num_epochs):
        timer, num_batches = d2l.Timer(), len(data_iter)
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            center, context_negative, mask, label = [
                data.to(device) for data in batch
            ]

            pred = skip_gram(center, context_negative, net[0], net[1])
            l = (loss(pred.reshape(label.shape).float(), label.float(), mask) /
                 mask.sum(axis=1) * mask.shape[1])
            l.sum().backward()
            optimizer.step()
            metric.add(l.sum(), l.numel())
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} tokens/sec on {str(device)}')
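
Example #5 assumes that `skip_gram` and a masked binary cross-entropy `loss` are defined elsewhere in the module. A sketch of both, following the d2l word2vec chapter (names and shapes assumed):

import torch
from torch import nn

def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    # center: (batch, 1); contexts_and_negatives: (batch, max_len)
    v = embed_v(center)                      # (batch, 1, embed_size)
    u = embed_u(contexts_and_negatives)      # (batch, max_len, embed_size)
    pred = torch.bmm(v, u.permute(0, 2, 1))  # (batch, 1, max_len)
    return pred

class SigmoidBCELoss(nn.Module):
    """Binary cross-entropy with a mask over padded positions."""
    def forward(self, inputs, target, mask=None):
        out = nn.functional.binary_cross_entropy_with_logits(
            inputs, target, weight=mask, reduction='none')
        return out.mean(dim=1)

loss = SigmoidBCELoss()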
Example #6
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    # Only train the small custom output network
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD(
        (param for param in net.parameters() if param.requires_grad),
        lr=lr,
        momentum=0.9,
        weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)
        for i, (img1s, img2s, labels) in enumerate(train_iter):
            timer.start()

            # push onto gpu
            img1s = img1s.to(devices[0])
            img2s = img2s.to(devices[0])
            labels = labels.to(devices[0])

            trainer.zero_grad()
            # TODO: choose how to combine the image pair, e.g.
            #   torch.cat((img1s, img2s), 1).shape -> torch.Size([4, 6, 224, 224])
            #   torch.cat((img1s, img2s), 0).shape -> torch.Size([8, 3, 224, 224])
            #   torch.cat((img1s, img2s), 3).shape -> torch.Size([4, 3, 224, 448])
            # and prepare the network to accept that input accordingly.
            output1s = net(img1s)
            output2s = net(img2s)
            # Placeholder until the pairing strategy is decided: concatenate
            # the two outputs along the feature dimension
            output = torch.cat((output1s, output2s), 1)
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
Example #7
def train_model(net,
                train_iter,
                test_iter,
                num_epochs,
                lr,
                device=d2l.try_gpu()):
    # for idx, (X, y) in enumerate(train_iter):
    """Train and evaluate a model with CPU or GPU."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)  # Part 2.2

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.BCELoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])

    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X = X.float()
            X, y = X.to(device), y.to(device)
            output = net(X)
            # y_hat = torch.round(torch.exp(output)/(1+torch.exp(output)))
            y_hat = torch.sigmoid(output)
            y = y.to(torch.float)
            y = torch.unsqueeze(y, 1)
            l = loss(y_hat, y.type(torch.float32))  #.type(torch.float32)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss, train_acc = metric[0] / metric[2], metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                animator.add(epoch + i / len(train_iter),
                             (train_loss, train_acc, None))
                print(
                    "BatchNo.=%3i, Epoch No.=%3i, loss=%.3f, train acc=%.3f" %
                    (i + 1, epoch + 1, train_loss, train_acc))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print("test_acc=", test_acc)
        animator.add(epoch + 1, (None, None, test_acc))
    print('loss %.3f, train acc %.3f, test acc %.3f' %
          (train_loss, train_acc, test_acc))
    print('%.1f examples/sec on %s' %
          (metric[2] * num_epochs / timer.sum(), device))
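
`d2l.accuracy` counts the correct predictions accumulated above. For reference, a sketch of that helper in plain PyTorch; note that for the single-logit sigmoid output used in this example, the probabilities would still need rounding before the comparison is meaningful:

import torch

def accuracy(y_hat, y):
    """Number of correct predictions (mirrors `d2l.accuracy`)."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        # Multi-class case: pick the highest-scoring class per row
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())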
Example #8
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """用GPU训练模型(在第六章定义)
    Defined in :numref:`sec_lenet`"""
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    num_batches = len(train_iter)
    multiplier_anim = max(1, num_batches // 5)
    multiplier_save = max(1, num_epochs // 10)
    for epoch in tqdm(range(num_epochs)):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % multiplier_anim == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        if test_iter is not None:
            test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        else:
            test_acc = 0
        animator.add(epoch + 1, (None, None, test_acc))

        # save net state_dict
        if epoch == num_epochs - 1 or (epoch + 1) % multiplier_save == 0:
            net_param_file = 'net_param_{:d}.pth'.format(epoch)
            path_net = os.path.join(net_save_dir, net_param_file)
            torch.save(net.state_dict(), path_net)
            print('net param saved to \n\t{}'.format(path_net))

    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
Example #9
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        first = True
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)

            # 4. In training, replace teacher forcing with feeding the prediction at the previous time step into the
            # decoder. How does this influence the performance?

            if first:
                dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
                first = False
            else:
                dec_input = Y_hat.argmax(dim=2)
                dec_input = dec_input[:X.shape[0]]

            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
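
`MaskedSoftmaxCELoss` masks out the padded positions of each target sequence. A sketch consistent with how it is called above, using the d2l conventions (pred of shape (batch, steps, vocab), label of shape (batch, steps), valid_len of shape (batch,)):

import torch
from torch import nn

def sequence_mask(X, valid_len, value=0):
    """Zero out entries of `X` beyond each sequence's valid length."""
    maxlen = X.size(1)
    mask = torch.arange(maxlen, dtype=torch.float32,
                        device=X.device)[None, :] < valid_len[:, None]
    X[~mask] = value
    return X

class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Softmax cross-entropy that ignores padding tokens."""
    def forward(self, pred, label, valid_len):
        weights = torch.ones_like(label)
        weights = sequence_mask(weights, valid_len)
        self.reduction = 'none'
        unweighted_loss = super().forward(pred.permute(0, 2, 1), label)
        weighted_loss = (unweighted_loss * weights).mean(dim=1)
        return weighted_loss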
Example #10
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.Adam(net.parameters(), lr=1e-3)
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step',
                            ylabel='loss',
                            xlim=[1, num_steps],
                            legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for tokens_X, segments_X, valid_lens_x, pred_positions_X,\
            mlm_weights_X, mlm_Y, nsp_y in train_iter:
            tokens_X = tokens_X.to(devices[0])
            segments_X = segments_X.to(devices[0])
            valid_lens_x = valid_lens_x.to(devices[0])
            pred_positions_X = pred_positions_X.to(devices[0])
            mlm_weights_X = mlm_weights_X.to(devices[0])
            mlm_Y, nsp_y = mlm_Y.to(devices[0]), nsp_y.to(devices[0])
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(net, loss, vocab_size,
                                                   tokens_X, segments_X,
                                                   valid_lens_x,
                                                   pred_positions_X,
                                                   mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            metric.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')
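
Example #10 delegates the per-batch loss to `_get_batch_loss_bert`. A sketch of such a helper in the d2l style (the BERT `net` is assumed to return encoded states, MLM predictions, and NSP predictions, and `loss` is a cross-entropy criterion):

def _get_batch_loss_bert(net, loss, vocab_size, tokens_X, segments_X,
                         valid_lens_x, pred_positions_X, mlm_weights_X,
                         mlm_Y, nsp_y):
    # Forward pass through the BERT model
    _, mlm_Y_hat, nsp_Y_hat = net(tokens_X, segments_X,
                                  valid_lens_x.reshape(-1), pred_positions_X)
    # Masked language modeling loss, weighted so padded positions contribute 0
    mlm_l = loss(mlm_Y_hat.reshape(-1, vocab_size), mlm_Y.reshape(-1)) * \
        mlm_weights_X.reshape(-1, 1)
    mlm_l = mlm_l.sum() / (mlm_weights_X.sum() + 1e-8)
    # Next sentence prediction loss
    nsp_l = loss(nsp_Y_hat, nsp_y)
    l = mlm_l + nsp_l
    return mlm_l, nsp_l, l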
Example #11
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=d2l.try_gpu()):
    """Train a model with a GPU (defined in Chapter 6)."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    plt.show()
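
`evaluate_accuracy_gpu` evaluates the test accuracy on the same device as the model. A sketch in the d2l style:

import torch
from torch import nn
from d2l import torch as d2l

def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of `net` on `data_iter`, using a GPU if available."""
    if isinstance(net, nn.Module):
        net.eval()  # switch off dropout and batch-norm updates
        if not device:
            device = next(iter(net.parameters())).device
    metric = d2l.Accumulator(2)  # no. of correct predictions, no. of predictions
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                X = [x.to(device) for x in X]  # e.g. BERT fine-tuning inputs
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]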
Example #12
def train_func(net,
               train_iter,
               test_iter,
               num_epochs,
               lr,
               device=d2l.try_gpu()):
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                print(f"epoch: {epoch} --- iter: {i} --- of {len(train_iter)}")
                print(f"train loss: {train_loss} --- train acc: {train_acc}")
        test_acc = evaluate_accuracy_gpu(net, test_iter)
    print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
Example #13
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    def xavier_init_weights(m):
        if type(m) is nn.Linear and len(m.weight.shape) > 1:
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(xavier_init_weights)
    optimizer = torch.optim.Adam(net.parameters())
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step',
                            ylabel='loss',
                            xlim=[1, num_steps],
                            legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for batch in train_iter:
            # temp = list((batch[i][j] for j in range(len(batch[0])) for i in range(len(batch))))
            # (tokens_X_shards,segments_X_shards, valid_lens_x_shards,\
            #  pred_positions_X_shards, mlm_weights_X_shards,\
            #  mlm_Y_shards, nsp_y_shards)= tuple(temp[i:i+batch_size] for i in range(0,len(temp),batch_size))
            timer.start()
            optimizer.zero_grad()
            mlm_ls, nsp_ls, ls = _get_batch_loss_bert(batch)
            for l in ls:
                l.backward()
            optimizer.step()
            # Accumulate MLM loss, NSP loss, sentence-pair count, and step count
            metric.add(sum(float(l) for l in mlm_ls),
                       sum(float(l) for l in nsp_ls),
                       batch[0].shape[0], 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break
    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')
Example #14
 def train_epoch(self, net, train_iter, loss, updater, device,
                 use_random_iter):
     """Train a net within one epoch (defined in Chapter 8)."""
     state, timer = None, d2l.Timer()
     metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
     i = 0
     for X, Y in train_iter:
         X = X.to(torch.float32)  # TODO is this necessary?
         Y = Y.to(torch.float32)
         X = X.reshape(-1, *X.shape)  # [direction, batch_size, seq_len]
         if state is None or use_random_iter:
             # Initialize `state` when either it is the first iteration or
             # using random sampling
             state = net.begin_state(batch_size=X.shape[1], device=device)
         else:
             if isinstance(net, nn.Module) and not isinstance(state, tuple):
                 # `state` is a tensor for `nn.GRU`
                 state.detach_()
             else:
                 # `state` is a tuple of tensors for `nn.LSTM` and
                 # for our custom scratch implementation
                 for s in state:
                     s.detach_()
         # y = Y.T.reshape(-1)
         X, Y = X.to(device), Y.to(device)
         y_hat, state = net(X, state)
         l = loss(y_hat, Y).mean()
         if isinstance(updater, torch.optim.Optimizer):
             updater.zero_grad()
             l.backward()
             self.grad_clipping(net, 1)
             updater.step()
         else:
             l.backward()
             self.grad_clipping(net, 1)
             # Since the `mean` function has been invoked
             updater(batch_size=1)
         metric.add(l * d2l.size(Y), d2l.size(Y))
     return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
Example #15
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    # Only train the small custom output network
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD((param for param in net.parameters()
                               if param.requires_grad), lr=lr,
                              momentum=0.9, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            features, labels = features.to(devices[0]), labels.to(devices[0])
            trainer.zero_grad()
            output = net(features)
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches, 
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
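
Examples #6 and #15 assume a module-level `loss` and an `evaluate_loss` helper for the validation set. A minimal sketch consistent with how they are called here, following the d2l fine-tuning recipe (a per-example cross-entropy loss is an assumption):

import torch
from torch import nn

loss = nn.CrossEntropyLoss(reduction='none')  # assumption: per-example loss

def evaluate_loss(data_iter, net, devices):
    """Average the loss of `net` over `data_iter` on the first device."""
    l_sum, n = 0.0, 0
    with torch.no_grad():
        for features, labels in data_iter:
            features = features.to(devices[0])
            labels = labels.to(devices[0])
            outputs = net(features)
            l = loss(outputs, labels)
            l_sum += float(l.sum())
            n += labels.numel()
    return l_sum / n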
Example #16
def train_s2s_ch9(model, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence (defined in Chapter 9)."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    torch.nn.init.xavier_uniform_(m._parameters[param])

    model.apply(xavier_init_weights)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    model.train()
    # animator = d2l.Animator(xlabel='epoch', ylabel='loss',
    #                         xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        # if (epoch + 1) % 10 == 0:
        #     animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
Example #17
def train(resume_training=True):

    EMBEDDING_SIZE = 32
    num_hiddens, num_layers, dropout, batch_size, num_steps = EMBEDDING_SIZE, 2, 0.1, 64, 10
    lr, num_epochs, device = 0.005, 1000, d2lt.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = EMBEDDING_SIZE, 64, 4
    key_size, query_size, value_size = EMBEDDING_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE
    norm_shape = [EMBEDDING_SIZE]

    ### Load data
    data_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(len(src_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)
    decoder = TransformerDecoder(len(tgt_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)

    ### Load model
    model = EncoderDecoder(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Load checkpoint
    if (resume_training and PATH_MODEL.exists()
            and os.path.getsize(PATH_MODEL) > 0):
        model, optimizer, last_epoch = load_checkpoint(model, optimizer)
        print("Continue training from last checkpoint...")
    else:
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        with open(PATH_MODEL, 'w') as fp:
            pass
        print(
            'No prior checkpoint existed, created new save files for checkpoint.'
        )
        model.apply(xavier_init_weights)
        last_epoch = 0

    # model.apply(xavier_init_weights)
    # model.to(device)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Initialize Loss functions
    loss = MaskedSoftmaxCELoss()

    ### Train
    model.train()
    # animator = d2lt.Animator(xlabel='epoch', ylabel='loss',
    # xlim=[10, num_epochs])
    for epoch in range(last_epoch, num_epochs):
        timer = d2lt.Timer()
        metric = d2lt.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2lt.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            # animator.add(epoch + 1, (metric[0] / metric[1],))
            print(f'epoch {epoch + 1} - ' f'loss {metric[0] / metric[1]:.5f}')

        ### Save checkpoint
        save_checkpoint(epoch, model, optimizer)
    print(f'loss {metric[0] / metric[1]:.5f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
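
The checkpointing helpers `save_checkpoint` and `load_checkpoint` are not shown in this example. A hypothetical sketch of what they could look like with `torch.save`/`torch.load`, matching the call sites above (PATH_MODEL comes from the caller; the author's actual implementation may differ):

import torch

def save_checkpoint(epoch, model, optimizer, path=None):
    """Persist model and optimizer state so training can resume later."""
    path = path if path is not None else PATH_MODEL  # PATH_MODEL from the caller
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()}, path)

def load_checkpoint(model, optimizer, path=None):
    """Restore model/optimizer state; returns (model, optimizer, last_epoch)."""
    path = path if path is not None else PATH_MODEL
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return model, optimizer, checkpoint['epoch']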
Example #18


batch_size = 256

def get_dataloader_workers():  #@save
    """Use 4 processes to read the data."""
    return 4

train_iter = data.DataLoader(mnist_train, batch_size, shuffle=True,
                             num_workers=get_dataloader_workers())




timer = d2l.Timer()
for X, y in train_iter:
    continue
f'{timer.stop():.2f} sec'





def load_data_fashion_mnist(batch_size, resize=None):  #@save
    """Download the Fashion-MNIST dataset and then load it into memory."""
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(