Example #1
def train_ch8(net,
              train_iter,
              vocab,
              lr,
              num_epochs,
              device,
              use_random_iter=False):
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='perplexity',
                            legend=['train'],
                            xlim=[10, num_epochs])
    # Initialize
    if isinstance(net, nn.Module):
        updater = torch.optim.Adam(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device)
    # Train and predict
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(net, train_iter, loss, updater, device,
                                     use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))
Example #2
def train_ch8(model,
              train_iter,
              vocab,
              lr,
              num_epochs,
              device,
              use_random_iter=False):
    """Train a model (defined in Chapter 8)."""
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='perplexity',
                            legend=['train'],
                            xlim=[10, num_epochs])
    # Initialize
    if isinstance(model, nn.Module):  # false, skip
        updater = torch.optim.SGD(model.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(model.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, model, vocab, device)
    # Train and predict
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(model, train_iter, loss, updater, device,
                                     use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))
Example #3
def train(wd):
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()
    loss = nn.MSELoss()
    num_epochs, lr = 100, 0.003
    # The bias parameter is not decayed
    trainer = torch.optim.SGD([{
        "params": net[0].weight,
        'weight_decay': wd
    }, {
        "params": net[0].bias
    }],
                              lr=lr)
    animator = d2l.Animator(xlabel='epochs',
                            ylabel='loss',
                            yscale='log',
                            xlim=[5, num_epochs],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with torch.enable_grad():
                trainer.zero_grad()
                l = loss(net(X), y)
            l.backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', net[0].weight.norm().item())
    plt.show()
Example #4
def train_ch8(net,
              train_iter,
              vocab,
              lr,
              num_epochs,
              device,
              use_random_iter=False):
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel="epoch",
                            ylabel="preplexity",
                            legend=["train"],
                            xlim=[10, num_epochs])
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device)
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(net, train_iter, loss, updater, device,
                                     use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict("time traveller"))
            animator.add(epoch + 1, [ppl])
    d2l.plt.show()
    print(f"困惑度 {ppl:.1f}, {speed:.1f} 标记/秒 {str(device)}")
    print(predict("time traveller"))
    print(predict("traveller"))
Example #5
def train(train_features,
          test_features,
          train_labels,
          test_labels,
          num_epochs=400):
    loss = nn.MSELoss()
    input_shape = train_features.shape[-1]
    # Switch off the bias since we already catered for it in the polynomial
    # features
    net = nn.Sequential(nn.Linear(input_shape, 1, bias=False))
    batch_size = min(10, train_labels.shape[0])
    train_iter = d2l.load_array((train_features, train_labels.reshape(-1, 1)),
                                batch_size)
    test_iter = d2l.load_array((test_features, test_labels.reshape(-1, 1)),
                               batch_size,
                               is_train=False)
    trainer = torch.optim.SGD(net.parameters(), lr=0.01)
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            yscale='log',
                            xlim=[1, num_epochs],
                            ylim=[1e-3, 1e2],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        d2l.train_epoch_ch3(net, train_iter, loss, trainer)
        if epoch == 0 or (epoch + 1) % 20 == 0:
            animator.add(epoch + 1, (evaluate_loss(
                net, train_iter, loss), evaluate_loss(net, test_iter, loss)))
    final_training_loss = evaluate_loss(net, train_iter, loss)
    print(f'final training loss: {final_training_loss}')
    print('weight:', net[0].weight.data.numpy())
    return final_training_loss
Example #6
File: RNNModel.py  Project: nikisix/d2l
 def train(self,
           net,
           train_iter,
           lr,
           num_epochs,
           device,
           use_random_iter=False):
     """Train a model (defined in Chapter 8)."""
     loss = nn.MSELoss()
     animator = d2l.Animator(xlabel='epoch',
                             ylabel='perplexity',
                             legend=['train'],
                             xlim=[10, num_epochs])
     # Initialize
     if isinstance(net, nn.Module):
         updater = torch.optim.SGD(net.parameters(), lr)
     else:
         updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
     # Train and predict
     for epoch in range(num_epochs):
         mse, speed = self.train_epoch(net, train_iter, loss, updater,
                                       device, use_random_iter)
         if (epoch + 1) % 10 == 0:
             animator.add(epoch + 1, [mse])
     # plt.show()
     print(
         f'mean squared loss {mse:.1f}, {speed:.1f} tokens/sec on {str(device)}'
     )
Example #7
def train(net, data_iter, lr, num_epochs, device=d2l.try_gpu()):
    def init_weights(m):
        if type(m) == nn.Embedding:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            xlim=[1, num_epochs])
    metric = d2l.Accumulator(2)  # sum of losses, no. of tokens

    for epoch in range(num_epochs):
        timer, num_batches = d2l.Timer(), len(data_iter)
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            center, context_negative, mask, label = [
                data.to(device) for data in batch
            ]

            pred = skip_gram(center, context_negative, net[0], net[1])
            l = (loss(pred.reshape(label.shape).float(), label.float(), mask) /
                 mask.sum(axis=1) * mask.shape[1])
            l.sum().backward()
            optimizer.step()
            metric.add(l.sum(), l.numel())
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} tokens/sec on {str(device)}')
Example #8
File: same_breed.py  Project: nikisix/d2l
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    # Only train the small custom output network
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD(
        (param for param in net.parameters() if param.requires_grad),
        lr=lr,
        momentum=0.9,
        weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)
        for i, (img1s, img2s, labels) in enumerate(train_iter):
            timer.start()

            # push onto gpu
            img1s = img1s.to(devices[0])
            img2s = img2s.to(devices[0])
            labels = labels.to(devices[0])

            trainer.zero_grad()
            import ipdb
            ipdb.set_trace()  # TODO BREAKPOINT

            # TODO YOU ARE HERE - choose how to cat the imgs
            # Then prepare the network to accept...
            # ipdb> torch.cat((img1s, img2s), 1).shape
            # torch.Size([4, 6, 224, 224])
            # ipdb> torch.cat((img1s, img2s), 0).shape
            # torch.Size([8, 3, 224, 224])
            # ipdb> torch.cat((img1s, img2s), 3).shape
            # torch.Size([4, 3, 224, 448])
            output1s = net(img1s)
            output2s = net(img2s)
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
Example #9
def train_model(net,
                train_iter,
                test_iter,
                num_epochs,
                lr,
                device=d2l.try_gpu()):
    # for idx, (X, y) in enumerate(train_iter):
    """Train and evaluate a model with CPU or GPU."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)  # Part 2.2

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.BCELoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])

    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X = X.float()
            X, y = X.to(device), y.to(device)
            output = net(X)
            # y_hat = torch.round(torch.exp(output)/(1+torch.exp(output)))
            y_hat = torch.sigmoid(output)
            y = y.to(torch.float)
            y = torch.unsqueeze(y, 1)
            l = loss(y_hat, y.type(torch.float32))  #.type(torch.float32)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss, train_acc = metric[0] / metric[2], metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                animator.add(epoch + i / len(train_iter),
                             (train_loss, train_acc, None))
                print(
                    "BatchNo.=%3i, Epoch No.=%3i, loss=%.3f, train acc=%.3f" %
                    (i + 1, epoch + 1, train_loss, train_acc))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print("test_acc=", test_acc)
        animator.add(epoch + 1, (None, None, test_acc))
    print('loss %.3f, train acc %.3f, test acc %.3f' %
          (train_loss, train_acc, test_acc))
    print('%.1f examples/sec on %s' %
          (metric[2] * num_epochs / timer.sum(), device))
Example #10
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """用GPU训练模型(在第六章定义)
    Defined in :numref:`sec_lenet`"""
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    num_batches = len(train_iter)
    multiplier_anim = max(1, num_batches // 5)
    multiplier_save = max(1, num_epochs // 10)
    for epoch in tqdm(range(num_epochs)):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % multiplier_anim == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        if test_iter is not None:
            test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        else:
            test_acc = 0
        animator.add(epoch + 1, (None, None, test_acc))

        # save net state_dict
        if epoch == num_epochs - 1 or (epoch + 1) % multiplier_save == 0:
            net_param_file = 'net_param_{:d}.pth'.format(epoch)
            path_net = os.path.join(net_save_dir, net_param_file)
            torch.save(net.state_dict(), path_net)
            print('net param saved to \n\t{}'.format(path_net))

    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
Example #11
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch',
                            ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        first = True
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()  # Clear gradients accumulated by the previous batch
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)

            # 4. In training, replace teacher forcing with feeding the prediction at the previous time step into the
            # decoder. How does this influence the performance?

            if first:
                dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
                first = False
            else:
                dec_input = Y_hat.argmax(dim=2)
                dec_input = dec_input[:X.shape[0]]

            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
Example #12
def train(lambd):
    w, b = init_params()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss
    num_epochs, lr = 100, 0.003
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with torch.enable_grad():
                # The L2 norm penalty term has been added, and broadcasting
                # makes `l2_penalty(w)` a vector whose length is `batch_size`
                l = loss(net(X), y) + lambd * l2_penalty(w)
            l.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', torch.norm(w).item())
Example #13
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=d2l.try_gpu()):
    """Train a model with a GPU (defined in Chapter 6)."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch',
                            xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    plt.show()
Example #14
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.Adam(net.parameters(), lr=1e-3)
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step',
                            ylabel='loss',
                            xlim=[1, num_steps],
                            legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for tokens_X, segments_X, valid_lens_x, pred_positions_X,\
            mlm_weights_X, mlm_Y, nsp_y in train_iter:
            tokens_X = tokens_X.to(devices[0])
            segments_X = segments_X.to(devices[0])
            valid_lens_x = valid_lens_x.to(devices[0])
            pred_positions_X = pred_positions_X.to(devices[0])
            mlm_weights_X = mlm_weights_X.to(devices[0])
            mlm_Y, nsp_y = mlm_Y.to(devices[0]), nsp_y.to(devices[0])
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(net, loss, vocab_size,
                                                   tokens_X, segments_X,
                                                   valid_lens_x,
                                                   pred_positions_X,
                                                   mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            metric.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')
Example #15
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    def xavier_init_weights(m):
        if type(m) is nn.Linear and len(m.weight.shape) > 1:
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(xavier_init_weights)
    optimizer = torch.optim.Adam(net.parameters())
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step',
                            ylabel='loss',
                            xlim=[1, num_steps],
                            legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for batch in train_iter:
            # temp = list((batch[i][j] for j in range(len(batch[0])) for i in range(len(batch))))
            # (tokens_X_shards,segments_X_shards, valid_lens_x_shards,\
            #  pred_positions_X_shards, mlm_weights_X_shards,\
            #  mlm_Y_shards, nsp_y_shards)= tuple(temp[i:i+batch_size] for i in range(0,len(temp),batch_size))
            timer.start()
            optimizer.zero_grad()
            mlm_ls, nsp_ls, ls = _get_batch_loss_bert(batch)
            for l in ls:
                l.backward()
            optimizer.step()
        timer.stop()
        animator.add(step + 1, (metric[0] / metric[3], metric[1] / metric[3]))
        step += 1
        if step == num_steps:
            num_steps_reached = True
            break
    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')
Example #16
def train(λ):
    w, b = init_params()
    # net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss
    net = lambda X: torch.matmul(X, w) + b
    loss = lambda y_hat, y: (y_hat - y)**2

    num_epochs, lr = 20, .01
    animator = d2l.Animator(xlabel='epochs',
                            ylabel='loss',
                            yscale='log',
                            xlim=[5, num_epochs],
                            legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with torch.enable_grad():
                l = (y - (torch.matmul(X, w) + b))**2 + λ * l2_penalty(w)
                l.sum().backward()
                sgd([w, b], lr, batch_size)
        # print(l.sum()/batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', torch.norm(w).item())
Example #17
def train_concise(wd):
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()
    loss = nn.MSELoss(reduction='none')
    num_epochs, lr = 100, 0.003
    # The bias parameter is not decayed
    trainer = torch.optim.SGD([
        {"params":net[0].weight,'weight_decay': wd},
        {"params":net[0].bias}], lr=lr)
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.mean().backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1,
                         (d2l.evaluate_loss(net, train_iter, loss),
                          d2l.evaluate_loss(net, test_iter, loss)))
    print('L2 norm of w:', net[0].weight.norm().item())
Example #18
File: kaggle_dog.py  Project: nikisix/d2l
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    # Only train the small custom output network
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD((param for param in net.parameters()
                               if param.requires_grad), lr=lr,
                              momentum=0.9, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            features, labels = features.to(devices[0]), labels.to(devices[0])
            trainer.zero_grad()
            output = net(features)
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches, 
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
Example #19
def train_s2s_ch9(model, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence (defined in Chapter 9)."""
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    torch.nn.init.xavier_uniform_(m._parameters[param])
    model.apply(xavier_init_weights)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    model.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            optimizer.zero_grad()  # Clear gradients accumulated by the previous batch
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
Example #20
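# Shape of `X_tile`: (`n_train`, `n_train`), where each column contains the
# same training inputs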
X_tile = x_train.repeat((n_train, 1))
# Shape of `Y_tile`: (`n_train`, `n_train`), where each column contains the
# same training outputs
Y_tile = y_train.repeat((n_train, 1))
# Shape of `keys`: ('n_train', 'n_train' - 1)
keys = d2l.reshape(X_tile[(1 - d2l.eye(n_train)).type(torch.bool)],
                   (n_train, -1))
# Shape of `values`: ('n_train', 'n_train' - 1)
values = d2l.reshape(Y_tile[(1 - d2l.eye(n_train)).type(torch.bool)],
                     (n_train, -1))


net = NWKernelRegression()
loss = nn.MSELoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=0.5)
animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[1, 5])

for epoch in range(5):
    trainer.zero_grad()
    # Note: L2 Loss = 1/2 * MSE Loss. PyTorch has MSE Loss which is slightly
    # different from MXNet's L2Loss by a factor of 2. Hence we halve the loss
    l = loss(net(x_train, keys, values), y_train) / 2
    l.sum().backward()
    trainer.step()
    print(f'epoch {epoch + 1}, loss {float(l.sum()):.6f}')
    animator.add(epoch + 1, float(l.sum()))


# Shape of `keys`: (`n_test`, `n_train`), where each column contains the same
# training inputs (i.e., same keys)
keys = x_train.repeat((n_test, 1))
Example #21
                # sgd([w, b, λ], lr, batch_size)  # all nan losses
        training_loss = d2l.evaluate_loss(net, train_iter, loss)
        test_loss = d2l.evaluate_loss(net, test_iter, loss)
        # eval_loss = d2l.evaluate_loss(net, eval_iter, loss)
        # print(training_loss, test_loss, eval_loss, λ)
        print(training_loss, test_loss, λ)
        animator.add(epoch, (training_loss, test_loss))

        # with torch.enable_grad():
            # eval_loss = d2l.evaluate_loss(net, eval_iter, loss)  # float but need tensor
        # eval_loss.backward()
        # with torch.no_grad():
            # λ -= lr * λ.grad
            # λ.grad.zero_()

animator = d2l.Animator(xlabel='lambda', ylabel='loss', yscale='log',
                        xlim=[5, num_epochs], legend=['train', 'test'])
train_and_eval()

"""
5. In Bayesian statistics we use the product of prior and likelihood to arrive at a posterior via P(w∣x) ∝ P(x∣w)P(w) .
How can you identify P(w) with regularization?

FROM:
https://en.wikipedia.org/wiki/Regularization_(mathematics)#cite_note-4
A theoretical justification for regularization is that it attempts to impose Occam's razor on the solution (as depicted
in the figure above, where the green function, the simpler one, may be preferred). From a Bayesian point of view, many
regularization techniques correspond to imposing certain prior distributions on model parameters.
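
Concretely, if the prior is Gaussian, P(w) ∝ exp(-‖w‖² / (2σ²)), then maximizing the posterior
P(w∣x) ∝ P(x∣w)P(w) is the same as minimizing -log P(x∣w) + ‖w‖² / (2σ²), i.e. the usual training
loss plus an L2 (weight-decay) penalty with λ = 1/(2σ²); a Laplace prior gives an L1 penalty
instead. In other words, the regularizer can be read off as the negative log prior -log P(w).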

FROM:
https://en.wikipedia.org/wiki/Tikhonov_regularization#Bayesian_interpretation
Statistically, the prior probability distribution of x is sometimes taken to be a multivariate
Example #22

#%%
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
model = d2l.RNNModelScratch(len(vocab), num_hiddens, device, get_params,
                            init_gru_state, gru)
d2l.train_ch8(model, train_iter, vocab, lr, num_epochs, device)
#%%
vocab_size, num_hiddens, device = len(vocab), 256, d2l.try_gpu()
num_epochs, lr = 500, 1
loss = torch.nn.CrossEntropyLoss()
params = get_params(vocab_size, num_hiddens, device)
optimizer = torch.optim.SGD(params, lr)
animator = d2l.Animator(xlabel='epoch',
                        ylabel='perplexity',
                        legend=['train'],
                        xlim=[10, num_epochs])
for epoch in range(num_epochs):
    timer = d2l.Timer()
    metric = [0.0, 0.0]
    # X, Y : (batch_size, num_steps)
    for X, Y in train_iter:
        state = init_gru_state(batch_size, num_hiddens, device)
        # Y: (batch_size, num_steps) - > (num_steps * batch_size)
        y = Y.T.reshape(-1)
        # X: (batch_size, num_steps) - > (num_steps, batch_size, vocab_size)
        X = F.one_hot(X.T, vocab_size).type(torch.float32)
        X, y = X.to(device), y.to(device)
        y_hat, state = gru(X, state, params)

        # The `input` is expected to contain raw, unnormalized scores for each class.