def get_params(vocab_size, num_hiddens, device):
    """Create the trainable tensors for a hidden layer and an output layer.

    Both the input width and the output width equal `vocab_size`. Weight
    matrices are filled with standard-normal samples scaled by 0.01; bias
    vectors start at zero. Every tensor is allocated on `device`.

    Returns:
        The list `[W_xh, W_hh, b_h, W_hq, b_q]`, with gradient tracking
        switched on for each entry.
    """
    num_inputs = num_outputs = vocab_size

    def scaled_randn(*shape):
        # Small random values: standard normal draws scaled by 0.01.
        return 0.01 * torch.randn(size=shape, device=device)

    # List entries are evaluated top to bottom, so the random draws happen
    # in the order W_xh, W_hh, W_hq.
    params = [
        scaled_randn(num_inputs, num_hiddens),   # W_xh: input -> hidden
        scaled_randn(num_hiddens, num_hiddens),  # W_hh: hidden -> hidden
        d2l.zeros(num_hiddens, device=device),   # b_h: hidden bias
        scaled_randn(num_hiddens, num_outputs),  # W_hq: hidden -> output
        d2l.zeros(num_outputs, device=device),   # b_q: output bias
    ]
    # Attach gradients in place.
    for tensor in params:
        tensor.requires_grad_(True)
    return params
def get_params(vocab_size, num_hiddens, device):
    """Create the trainable tensors for a gated recurrent layer and its output layer.

    Three (input weight, hidden weight, bias) triples are created, one each
    for the update gate, the reset gate and the candidate hidden state,
    followed by the output-layer weight and bias. Both the input width and
    the output width equal `vocab_size`. Weights are standard-normal samples
    scaled by 0.01; biases start at zero. Every tensor lives on `device`.

    Returns:
        The list `[W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h,
        W_hq, b_q]`, with gradient tracking switched on for each entry.
    """
    num_inputs = num_outputs = vocab_size

    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    def three():
        # One (input-to-hidden weight, hidden-to-hidden weight, bias) triple.
        return (normal((num_inputs, num_hiddens)),
                normal((num_hiddens, num_hiddens)),
                d2l.zeros(num_hiddens, device=device))

    W_xz, W_hz, b_z = three()  # Update gate parameters
    # The reset-gate triple has to be created here: its three names appear
    # in `params` below, and without this assignment building that list
    # raises NameError.
    W_xr, W_hr, b_r = three()  # Reset gate parameters
    W_xh, W_hh, b_h = three()  # Candidate hidden state parameters
    # Output layer parameters
    W_hq = normal((num_hiddens, num_outputs))
    b_q = d2l.zeros(num_outputs, device=device)
    # Attach gradients
    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params
def three():
    """Build one (input weight, hidden weight, bias) triple.

    Relies on `normal`, `num_inputs`, `num_hiddens` and `device` being
    available from the surrounding scope.
    """
    # Created in this order so the two `normal` calls run in the same
    # sequence as a single tuple expression would evaluate them.
    input_weight = normal((num_inputs, num_hiddens))
    hidden_weight = normal((num_hiddens, num_hiddens))
    bias = d2l.zeros(num_hiddens, device=device)
    return (input_weight, hidden_weight, bias)
from d2l import torch as d2l
import matplotlib.pyplot as plt
import torch
from torch import nn

#@tab mxnet, pytorch
# Build the series: sin(0.01 * t) for t = 1..T, plus a noise term drawn with
# d2l.normal(0, 0.2, ...), then plot it.
T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T, ))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))
plt.show()

#@tab mxnet, pytorch
# Column i of `features` is the series shifted by i steps, so every row holds
# `tau` consecutive observations; the matching label is the observation that
# comes right after that window.
tau = 4
features = d2l.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i:T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size = 16
n_train = 600
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array(
    (features[:n_train], labels[:n_train]), batch_size, is_train=True)


def init_weights(m):
    """Xavier-uniform initialise the weight of a module that is exactly `nn.Linear`.

    Modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
def init_rnn_state(batch_size, num_hiddens, device):
    """Return the initial recurrent state as a one-element tuple.

    The single element is an all-zero tensor of shape
    `(batch_size, num_hiddens)` allocated on `device`.
    """
    initial_hidden = d2l.zeros((batch_size, num_hiddens), device=device)
    return (initial_hidden,)
X = self.embedding(X) # In RNN models, the first axis corresponds to time steps X = X.permute(1, 0, 2) # When state is not mentioned, it defaults to zeros output, state = self.rnn(X) # `output` shape: (`num_steps`, `batch_size`, `num_hiddens`) # `state` shape: (`num_layers`, `batch_size`, `num_hiddens`) return output, state encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2) encoder.eval() X = d2l.zeros((4, 7), dtype=torch.long) output, state = encoder(X) output.shape state.shape class Seq2SeqDecoder(d2l.Decoder): """The RNN decoder for sequence to sequence learning.""" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs):
from d2l import torch as d2l
import matplotlib.pyplot as plt
import torch
from torch import nn
from RNNModel import Numeric

#@tab mxnet, pytorch
# Build the series: sin(0.01 * t) for t = 1..T, plus a noise term drawn with
# d2l.normal(0, 0.2, ...), then plot it.
T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T, ))
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3))

#@tab mxnet, pytorch
# Column i of `features` is the series shifted by i steps, so every row holds
# `tau` consecutive observations; the matching label is the observation that
# comes right after that window.
tau = 30
features = d2l.zeros((T - tau, tau))
for i in range(tau):
    features[:, i] = x[i:T - tau + i]
labels = d2l.reshape(x[tau:], (-1, 1))

batch_size = 16
n_train = 600
# Round `n_train` down to a multiple of `batch_size`.
n_train -= n_train % batch_size
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# Function for initializing the weights of the network
def init_weights(m):
# NOTE(review): this property takes `self`, so it presumably belongs to a
# class whose header is not visible in this excerpt - confirm its placement
# and indentation against the full file.
@property
def attention_weights(self):
    """Return the value stored in `self._attention_weights`."""
    return self._attention_weights


# Shape check: run a small encoder/decoder pair, both in eval mode, on an
# all-zero batch of token indices.
encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
                             num_layers=2)
encoder.eval()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
                                  num_layers=2)
decoder.eval()
X = d2l.zeros((4, 7), dtype=torch.long)  # (`batch_size`, `num_steps`)
state = decoder.init_state(encoder(X), None)
output, state = decoder(X, state)
# Bare expression: the tuple is only displayed in an interactive/notebook
# session; as a plain script statement its value is discarded.
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape

# Hyperparameters for the training run below.
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 250, d2l.try_gpu()

# `encoder` and `decoder` are rebound here: the models trained below are
# sized from the loaded vocabularies, not the small ones used above.
train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens,
                             num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(len(tgt_vocab), embed_size, num_hiddens,
                                  num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
from torch import nn
from RNNModel import Numeric

# Build the series: sin(0.01 * t) for t = 1..T, plus a noise term drawn with
# d2l.normal(0, 0.2, ...), then plot it on an explicitly created axes.
T = 1000  # Generate a total of 1000 points
time = d2l.arange(1, T + 1, dtype=d2l.float32)
x = d2l.sin(0.01 * time) + d2l.normal(0, 0.2, (T, ))
ax = plt.axes()
d2l.plot(time, [x], 'time', 'x', xlim=[1, 1000], figsize=(6, 3), axes=ax)

batch_size = 16
train_seq_len, pred_seq_len = 360, 360
n_train = 500
# Round `n_train` down to a multiple of `batch_size`.
n_train -= n_train % batch_size

# Each row of `features` is a window of τ consecutive observations; the same
# row of `labels` is that window shifted one step ahead, so the two have the
# same shape.
τ = train_seq_len
features = d2l.zeros((T - τ, τ))
labels = d2l.zeros((T - τ, τ))
for i in range(τ):
    features[:, i] = x[i:T - τ + i]
    labels[:, i] = x[i + 1:T - τ + i + 1]
# Only the first `n_train` examples are used for training
train_iter = d2l.load_array((features[:n_train], labels[:n_train]),
                            batch_size, is_train=True)

# NOTE(review): the name says "gru" but the layer constructed below is
# `nn.RNN` - confirm which recurrent layer is intended.
# NOTE(review): no `return` is visible in this excerpt; the definition may
# continue beyond it.
def get_net_gru(hidden_size, input_size, output_size):
    # input_size := "feature dimensions"
    rnn_layer = nn.RNN(input_size, hidden_size)
    net = Numeric(rnn_layer, output_size)