def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size, columnName):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    d2l.plt.show()
    print(f'train log rmse {float(train_ls[-1]):f}')
    preds = net(test_features).detach().numpy()
    num = test_data['filename'].shape[0]
    d2l.plot(list(range(1, num + 1)), [test_data[columnName], preds],
             xlabel='', ylabel='APS', xlim=[1, num],
             legend=['ground truth', 'prediction'])
    d2l.plt.rcParams['font.sans-serif'] = 'SimHei'
    d2l.plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
    d2l.plt.show()
    submission = pd.concat(
        [test_data['filename'], pd.Series(preds.reshape(1, -1)[0])], axis=1)
    submission.to_csv('submission.csv', index=False)
def plot_kernel_reg(y_hat):
    d2l.plot(x_test, [y_truth, y_hat], 'x', 'y', legend=['Truth', 'Pred'],
             xlim=[0, 5], ylim=[-1, 5])
    d2l.plt.plot(x_train, y_train, 'o', alpha=0.5)
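# `x_train`, `y_train`, `x_test`, and `y_truth` are not defined in this
# snippet; a minimal sketch, assuming the standard d2l Nadaraya-Watson setup:
import torch
n_train = 50
x_train, _ = torch.sort(torch.rand(n_train) * 5)  # random inputs on [0, 5]
def f(x):
    return 2 * torch.sin(x) + x**0.8
y_train = f(x_train) + torch.normal(0.0, 0.5, (n_train,))  # noisy targets
x_test = torch.arange(0, 5, 0.1)  # evaluation grid
y_truth = f(x_test)  # noise-free ground truth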
def test_PositionEncoding():
    encoding_dim, num_steps = 32, 60
    pos_encoding = PositionEncoding(encoding_dim, 0)
    pos_encoding.eval()
    X = pos_encoding(torch.zeros((1, num_steps, encoding_dim)))
    P = pos_encoding.P[:, :X.shape[1], :]
    d2l.plot(torch.arange(num_steps), P[0, :, 6:10].T,
             xlabel='Row (position)', figsize=(6, 3.5),
             legend=["Col %d" % d for d in torch.arange(6, 10)])
    d2l.plt.show()
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse {float(train_ls[-1]):f}')
    # Apply the network to the test set
    preds = net(test_features).detach().numpy()
    # Reformat it to export to Kaggle
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
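# Usage sketch; the hyperparameter values are illustrative defaults (similar
# to the d2l house-price example), not tuned results:
num_epochs, lr, weight_decay, batch_size = 100, 5, 0, 64
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)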
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k
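# Usage sketch (hyperparameters are illustrative assumptions, not tuned):
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')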
#%%
# Vanishing gradient
%matplotlib inline
import torch
from d2l import torch as d2l
from matplotlib import pyplot as plt
import numpy as np

x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.sigmoid(x)
y.backward(torch.ones_like(x))
d2l.plot(x.detach().numpy(), [y.detach().numpy(), x.grad.numpy()],
         legend=['sigmoid', 'gradient'], figsize=(4.5, 2.5))
plt.figure(figsize=(8, 4))
plt.plot(x.detach(), y.detach())
plt.plot(x.detach(), x.grad.detach())
plt.legend(['sigmoid', 'gradient'])
plt.show()

# %%
# Exploding gradient
M = torch.normal(0, 1, size=(4, 4))
print('a single matrix \n', M)
for i in range(100):
    M = torch.mm(M, torch.normal(0, 1, size=(4, 4)))
print('after multiplying 100 matrices\n', M)

# %%
import hashlib
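# %%
# One standard mitigation for both effects (an addition, not part of the
# snippet above): Xavier initialization, which scales initial weights so that
# activation and gradient variances stay roughly constant across layers.
import torch
from torch import nn

lin = nn.Linear(256, 256)
nn.init.xavier_uniform_(lin.weight)  # Var(w) = 2 / (fan_in + fan_out)
print(lin.weight.var())  # should be close to 2 / (256 + 256)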
class PositionalEncoding(nn.Module):
    # The snippet began mid-constructor; this header matches the usage
    # `PositionalEncoding(num_hiddens, dropout)` below
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Create a long enough `P`
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(0, max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
                0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

pe = PositionalEncoding(20, 0)
pe.eval()
Y = pe(torch.zeros((1, 100, 20)))
d2l.plot(torch.arange(100), Y[0, :, 4:8].T, figsize=(6, 2.5),
         legend=["dim %d" % p for p in [4, 5, 6, 7]])

# %%
class EncoderBlock(nn.Module):
    def __init__(self, key_size, query_size, value_size, num_hiddens,
                 norm_shape, ffn_num_input, ffn_num_hiddens, num_heads,
                 dropout, use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(key_size, query_size, value_size,
                                            num_hiddens, num_heads, dropout,
                                            use_bias)
        self.addnorm1 = AddNorm(norm_shape, dropout)
        self.ffn = PositionWiseFFN(ffn_num_input, ffn_num_hiddens,
                                   num_hiddens)
        self.addnorm2 = AddNorm(norm_shape, dropout)

    def forward(self, X, valid_len):
        # Self-attention, then position-wise FFN, each followed by add & norm
        Y = self.addnorm1(X, self.attention(X, X, X, valid_len))
        return self.addnorm2(Y, self.ffn(Y))
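# A minimal shape check for `EncoderBlock` (sizes are illustrative): the
# block preserves the (batch, steps, hiddens) shape of its input.
X = torch.ones((2, 100, 24))
valid_lens = torch.tensor([3, 2])
encoder_blk = EncoderBlock(24, 24, 24, 24, [100, 24], 24, 48, 8, 0.5)
encoder_blk.eval()
print(encoder_blk(X, valid_lens).shape)  # torch.Size([2, 100, 24])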
from d2l import torch as d2l
import torch
import torch.nn as nn

T = 1000  # Generate a total of 1000 points
time = torch.arange(1, T + 1, dtype=torch.float32)
x = torch.sin(0.01 * time) + torch.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

tau = 4
features = torch.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i: T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size, n_train = 16, 600
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# Function for initializing the weights of the network
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)

# A simple MLP
def get_net():
    net = nn.Sequential(nn.Linear(4, 10), nn.ReLU(), nn.Linear(10, 1))
    net.apply(init_weights)
    return net
import random
import torch
from d2l import torch as d2l

# `corpus` is assumed to be a flat list of tokens, as built in the
# time-machine snippet later in this section
vocab = d2l.Vocab(corpus)
freqs = [freq for _, freq in vocab.token_freqs]
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
print(bigram_vocab.token_freqs[:10])
bifreqs = [freq for _, freq in bigram_vocab.token_freqs]
trigram_tokens = [tup for tup in zip(corpus[:-2], corpus[1:-1], corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
print(trigram_vocab.token_freqs[:10])
trifreqs = [freq for _, freq in trigram_vocab.token_freqs]
d2l.plot([freqs, bifreqs, trifreqs], xlabel="token: x",
         ylabel="frequency: n(x)", xscale="log", yscale="log",
         legend=["unigram", "bigram", "trigram"])
d2l.plt.show()

def seq_data_iter_random(corpus, batch_size, num_steps):
    # Start from a random offset so different epochs see different splits
    corpus = corpus[random.randint(0, num_steps - 1):]
    # Subtract 1 to leave room for the shifted labels
    num_subseqs = (len(corpus) - 1) // num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos + num_steps]

    num_batches = num_subseqs // batch_size
    # The generator body was cut off in the original; this completion follows
    # the d2l reference implementation
    for i in range(0, batch_size * num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)
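# Quick demonstration on the toy sequence 0..34: each row of `X` is a
# length-5 subsequence and `Y` is the same row shifted one step ahead.
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X:', X, '\nY:', Y)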
import math
import numpy as np
import torch
from d2l import torch as d2l

def normal(x, mu, sigma):
    p = 1 / math.sqrt(2 * math.pi * sigma**2)
    return p * np.exp(-0.5 / sigma**2 * (x - mu)**2)

# Use numpy again for visualization
x = np.arange(-7, 7, 0.01)
# Mean and standard deviation pairs
params = [(0, 1), (0, 2), (3, 1)]
d2l.plot(x, [normal(x, mu, sigma) for mu, sigma in params], xlabel='x',
         ylabel='p(x)', figsize=(4.5, 2.5),
         legend=[f'mean {mu}, std {sigma}' for mu, sigma in params])

def synthetic_data(w, b, num_examples):  #@save
    """Generate y = Xw + b + noise."""
    X = torch.normal(0, 1, (num_examples, len(w)))
    y = torch.matmul(X, w) + b
    y += torch.normal(0, 0.01, y.shape)
    return X, y.reshape((-1, 1))

true_w = torch.tensor([2, -3.4])
true_b = 4.2
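# Usage sketch: draw 1000 examples from the synthetic linear model and
# visualize the second feature against the labels.
features, labels = synthetic_data(true_w, true_b, 1000)
d2l.set_figsize()
d2l.plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(), 1)
d2l.plt.show()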
def init_weights(m):  # header restored; the snippet began inside this function
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights);

loss = nn.CrossEntropyLoss(reduction='none')
trainer = torch.optim.SGD(net.parameters(), lr=0.1)
num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)

# Multilayer perceptron
x = torch.arange(-8.0, 8.0, 0.1, requires_grad=True)
y = torch.relu(x)
d2l.plot(x.detach(), y.detach(), 'x', 'relu(x)', figsize=(5, 2.5))

def relu(X):
    a = torch.zeros_like(X)
    return torch.max(X, a)

loss = nn.CrossEntropyLoss(reduction='none')
num_epochs, lr = 10, 0.1
updater = torch.optim.SGD(params, lr=lr)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, updater)

net = nn.Sequential(nn.Flatten(), nn.Linear(784, 256), nn.ReLU(),
                    nn.Linear(256, 10))
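# `params` above is used but never defined in this snippet; a sketch of the
# from-scratch MLP parameters it presumably refers to (per the d2l MLP
# chapter, hidden size 256):
num_inputs, num_outputs, num_hiddens = 784, 10, 256
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens, requires_grad=True) * 0.01)
b1 = nn.Parameter(torch.zeros(num_hiddens, requires_grad=True))
W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs, requires_grad=True) * 0.01)
b2 = nn.Parameter(torch.zeros(num_outputs, requires_grad=True))
params = [W1, b1, W2, b2]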
from d2l import torch as d2l
import matplotlib.pyplot as plt
import torch
from torch import nn
from RNNModel import Numeric

T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T,))
ax = plt.axes()
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3), axes=ax)

batch_size = 16
train_seq_len, pred_seq_len = 360, 360
n_train = 500
n_train -= n_train % batch_size  # round down to a multiple of the batch size
τ = train_seq_len
features = d2l.zeros((T - τ, τ))
labels = d2l.zeros((T - τ, τ))
for i in range(τ):
    features[:, i] = x[i:T - τ + i]
    labels[:, i] = x[i + 1:T - τ + i + 1]
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

def get_net_gru(hidden_size, input_size, output_size):
    # The original snippet ended at this header; this completion mirrors the
    # nn.RNN variant later in the section, swapping in nn.GRU per the name
    rnn_layer = nn.GRU(input_size=input_size, hidden_size=hidden_size)
    return Numeric(rnn_layer, output_size=output_size)
#%%
from d2l import torch as d2l
import torch
import random

tokens = d2l.tokenize(d2l.read_time_machine())
# Since each text line is not necessarily a sentence or a paragraph, we
# concatenate all text lines
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]

# %%
freqs = [freq for token, freq in vocab.token_freqs]
d2l.plot(freqs, xlabel='token: x', ylabel='frequency: n(x)', xscale='log',
         yscale='log')

# %%
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]

#%%
trigram_tokens = [triple for triple in zip(corpus[:-2], corpus[1:-1],
                                           corpus[2:])]
trigram_vocab = d2l.Vocab(trigram_tokens)
trigram_vocab.token_freqs[:10]

# %%
bigram_freqs = [freq for token, freq in bigram_vocab.token_freqs]
trigram_freqs = [freq for token, freq in trigram_vocab.token_freqs]
d2l.plot([freqs, bigram_freqs, trigram_freqs], xlabel='token: x',
         ylabel='frequency: n(x)', xscale='log', yscale='log',
         legend=['unigram', 'bigram', 'trigram'])
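# %%
# All three log-log curves are roughly linear, i.e. the n-gram frequencies
# follow Zipf's law. A minimal sketch (assuming numpy) estimating the
# unigram Zipf exponent by a least-squares fit in log-log space:
import numpy as np
ranks = np.arange(1, len(freqs) + 1)
slope, _ = np.polyfit(np.log(ranks), np.log(freqs), 1)
print(f'estimated Zipf exponent: {-slope:.2f}')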
# Used an RNN in an n-gram style :(
# THIS IS PROBABLY NOT OPTIMAL
from d2l import torch as d2l
import matplotlib.pyplot as plt
import torch
from torch import nn
from RNNModel import Numeric

T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

tau = 30
features = d2l.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i:T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size = 16
n_train = 600
n_train -= n_train % batch_size
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)
#%%
%matplotlib inline
import torch
from torch import nn
from d2l import torch as d2l

T = 1000  # Generate a total of 1000 points
time = torch.arange(1, T + 1, dtype=torch.float32)
x = torch.sin(0.01 * time) + torch.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(20, 8))

# %%
tau = 4
features = torch.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i:T - tau + i]
labels = x[tau:].reshape((-1, 1))

batch_size, n_train = 16, 600
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# %%
# Function for initializing the weights of the network
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)

# A simple MLP
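# Completion sketch: the snippet ended at the comment above; this mirrors
# the identical `get_net` defined earlier in this section.
def get_net():
    net = nn.Sequential(nn.Linear(4, 10), nn.ReLU(), nn.Linear(10, 1))
    net.apply(init_weights)
    return net

loss = nn.MSELoss()  # squared loss, assumed as in the d2l sequence chapter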
#%%
from d2l import torch as d2l
import torch
import torch.nn as nn

#%%
T = 1000
time = torch.arange(1, T + 1, dtype=torch.float32)
x = torch.sin(0.01 * time) + torch.normal(0, 0.2, (T,))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

# %%
def train(net, train_iter, loss, epochs, lr):
    trainer = torch.optim.Adam(net.parameters(), lr)
    for epoch in range(epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            trainer.step()
        print(f'epoch: {epoch + 1}, '
              f'loss: {d2l.evaluate_loss(net, train_iter, loss):f}')

net = get_net()
loss = nn.MSELoss()  # not defined in the original snippet; squared loss assumed
train(net, train_iter, loss, 5, 0.01)

# One-step-ahead predictions over the whole series
onestep_preds = net(features)

# Multi-step predictions: beyond `n_train + tau`, the model consumes its own
# previous outputs
multistep_preds = torch.zeros(T)
multistep_preds[:n_train + tau] = x[:n_train + tau]
for i in range(n_train + tau, T):
    multistep_preds[i] = net(multistep_preds[i - tau:i].reshape((1, -1)))

d2l.plot([time, time[tau:], time[n_train + tau:]],
         [x.detach().numpy(), onestep_preds.detach().numpy(),
          multistep_preds[n_train + tau:].detach().numpy()],
         'time', 'x', ['data', '1-step preds', 'multi-step preds'],
         [1, 1000], figsize=(6, 3))
d2l.plt.show()
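# Going one step further, a hedged sketch of k-step-ahead evaluation in the
# style of the d2l sequence chapter; `ms_features` is a name introduced here
# so the `features` tensor above is not clobbered.
max_steps = 64
ms_features = torch.zeros((T - tau - max_steps + 1, tau + max_steps))
# Columns 0..tau-1 hold observations; later columns hold model predictions
for i in range(tau):
    ms_features[:, i] = x[i:i + T - tau - max_steps + 1]
for i in range(tau, tau + max_steps):
    ms_features[:, i] = net(ms_features[:, i - tau:i]).reshape(-1)
steps = (1, 4, 16, 64)
d2l.plot([time[tau + i - 1:T - max_steps + i] for i in steps],
         [ms_features[:, tau + i - 1].detach().numpy() for i in steps],
         'time', 'x', [f'{i}-step preds' for i in steps],
         xlim=[5, 1000], figsize=(6, 3))
d2l.plt.show()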
batch_size = 16
n_train = 600
n_train -= n_train % batch_size  # round down to a multiple of the batch size
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

def get_net_gru(num_hiddens=256):
    # input_size := "feature dimensions"
    rnn_layer = nn.RNN(input_size=1, hidden_size=num_hiddens)
    net = Numeric(rnn_layer, output_size=1)
    return net

net = get_net_gru(256)
# device = d2l.try_gpu()
device = 'cpu'
# `Numeric` (from RNNModel) provides its own train/predict helpers
net.train(net, train_iter, lr=1, num_epochs=10, device=device)

num_preds = 64
preds = net.predict(features[:n_train], num_preds=num_preds, device='cpu')
domain = n_train + num_preds
d2l.plot([time[:domain], time[n_train:domain]],
         [x[:domain], preds.detach().numpy()],
         legend=['orig-seq', 'predictions'], xlim=[0, domain],
         figsize=(6, 3))
plt.show()