# --- Review notes (collapsed notebook cell: BERT pretraining setup) ---
# Loads the WikiText pretraining iterator (batch_size=512, max_len=64),
# builds a BERTModel (token/segment/position widths 128, FFN hidden 256,
# 2 heads, 2 layers, dropout 0.2), grabs all GPUs via d2l.try_all_gpus(),
# and sets up a CrossEntropyLoss.
# `_get_batch_loss_bert(batch)` repacks the batch into a tuple `a`, then
# iterates the seven per-shard fields in lockstep (tokens, segments,
# valid lens, masked positions, mask weights, MLM labels, NSP labels),
# running the net forward on each shard.
# NOTE(review): the three print() calls (type/len of `a`) look like
# leftover debugging output — consider removing them.
# NOTE(review): the function body is TRUNCATED in this view — the loss
# accumulation into mlm_ls/nsp_ls/ls and the return are not visible;
# verify the remainder before editing.
#%% batch_size, max_len = 512, 64 train_iter, vocab = load_data_wiki(batch_size, max_len) #%% net = BERTModel(128, 128, 128, len(vocab), num_hiddens=128, ffn_num_hiddens=256, ffn_num_input=128, norm_shape=[128], num_heads=2, num_layers=2, dropout=0.2) devices = d2l.try_all_gpus() loss = nn.CrossEntropyLoss() def _get_batch_loss_bert(batch): mlm_ls, nsp_ls, ls = [], [], [] a = tuple(batch[i] for i in range(len(batch))) print(type(a)) print(len(a)) print(type(a[0])) for (tokens_X_shard, segments_X_shard, valid_lens_x_shard, pred_positions_X_shards, mlm_weights_X_shard, mlm_Y_shard, nsp_y_shard) in zip(a[0], a[1], a[2], a[3], a[4], a[5], a[6]): _, mlm_hat, nsp_Y_hat = net(tokens_X_shard, segments_X_shard, valid_lens_x_shard.reshape(-1), pred_positions_X_shards)
# --- Review notes (collapsed cells: decomposable attention on SNLI) ---
# forward(): X is a (premises, hypotheses) pair; both are embedded, then
# passed through self.attend -> self.compare -> self.aggregate to produce
# class scores Y_hat. The enclosing class header is outside this view.
# Script: loads SNLI (batch 256, 50 steps), builds DecomposableAttention
# (embed 100, hidden 200), copies pretrained GloVe 100-d vectors into the
# embedding table, then trains 4 epochs with Adam (lr=1e-3) and a
# per-example (reduction="none") CrossEntropyLoss via d2l.train_ch13.
# NOTE(review): `predict_snli` is TRUNCATED here — only net.eval() is
# visible; the tokenization/prediction body lies beyond this view.
def forward(self, X): premises, hypotheses = X A = self.embedding(premises) B = self.embedding(hypotheses) beta, alpha = self.attend(A, B) V_A, V_B = self.compare(A, B, beta, alpha) Y_hat = self.aggregate(V_A, V_B) return Y_hat #%% batch_size, num_steps = 256, 50 train_iter, test_iter, vocab = d2l.load_data_snli(batch_size, num_steps) #%% embed_size, num_hiddens, devices = 100, 200, d2l.try_all_gpus() net = DecomposableAttention(vocab, embed_size, num_hiddens) glove_embedding = d2l.TokenEmbedding('glove.6b.100d') embeds = glove_embedding[vocab.idx_to_token] net.embedding.weight.data.copy_(embeds) #%% lr, num_epochs = 0.001, 4 trainer = torch.optim.Adam(net.parameters(), lr=lr) loss = nn.CrossEntropyLoss(reduction="none") d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices) #%% #@save def predict_snli(net, vocab, premise, hypothesis): net.eval()
# --- Review notes (collapsed cell: train() tail + test-set prediction) ---
# First part is the TAIL of a train() function whose head (epoch/batch
# loops, metric/timer/scheduler setup) is outside this view: it logs
# train loss to the animator, evaluates/logs validation loss when a
# valid_iter is given, steps the LR scheduler, and prints final losses
# plus examples/sec throughput.
# Script part: trains with a held-out valid_iter, then retrains on
# train+valid (valid_iter=None) and predicts over test_iter, collecting
# softmax outputs; `ids` lists the test files to align predictions with.
# BUG(review): torch.nn.functional.softmax(..., dim=0) normalizes across
# the BATCH dimension. Per-class probabilities require dim=1:
#     output = torch.nn.functional.softmax(net(data.to(devices[0])), dim=1)
# As written, each column of the batch sums to 1 instead of each row.
# NOTE(review): the prediction loop runs without net.eval() or
# torch.no_grad(); gradients are tracked needlessly — consider wrapping
# the loop in `with torch.no_grad():`.
animator.add(epoch + (i + 1) / num_batches, (metric[0] / metric[1], None)) if valid_iter is not None: valid_loss = evaluate_loss(valid_iter, net, devices) animator.add(epoch + 1, (None, valid_loss)) scheduler.step() if valid_iter is not None: print(f'train loss {metric[0] / metric[1]:.3f}, ' f'valid loss {valid_loss:.3f}') else: print(f'train loss {metric[0] / metric[1]:.3f}') print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(devices)}') devices, num_epochs, lr, wd = d2l.try_all_gpus(), 5, 0.001, 1e-4 lr_period, lr_decay, net = 10, 0.1, get_net(devices) train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period, lr_decay) net = get_net(devices) train(net, train_valid_iter, None, num_epochs, lr, wd, devices, lr_period, lr_decay) preds = [] for data, label in test_iter: output = torch.nn.functional.softmax(net(data.to(devices[0])), dim=0) preds.extend(output.cpu().detach().numpy()) ids = sorted(os.listdir( os.path.join(data_dir, 'train_valid_test', 'test', 'unknown')))
# Collapsed notebook cells: fully-convolutional network on Pascal VOC.

# Display the bilinear-upsampling demo image produced in the previous cell.
print('output image shape:', out_img.shape)
d2l.plt.imshow(out_img)
d2l.plt.show()

# Seed the transposed convolution with a bilinear-interpolation kernel so
# the network starts from a sensible upsampling operator.
W = bilinear_kernel(num_classes, num_classes, 64)
net.transpose_conv.weight.data.copy_(W)

# VOC segmentation data: 320x480 crops, mini-batches of 32.
batch_size, crop_size = 32, (320, 480)
train_iter, test_iter = d2l.load_data_voc(batch_size, crop_size)


def loss(inputs, targets):
    """Per-image loss: pixel-wise cross-entropy averaged over both
    spatial axes, keeping the batch dimension."""
    return F.cross_entropy(inputs, targets, reduction='none').mean(1).mean(1)


num_epochs, lr, wd, devices = 5, 0.001, 1e-3, d2l.try_all_gpus()
trainer = torch.optim.SGD(net.parameters(), lr=lr, weight_decay=wd)
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)


def predict(img):
    """Return the per-pixel predicted class map for a single image."""
    batch = test_iter.dataset.normalize_image(img).unsqueeze(0)
    labels = net(batch.to(devices[0])).argmax(dim=1)
    return labels.reshape(labels.shape[1], labels.shape[2])


def label2image(pred):
    """Map a class-index map to an RGB image via the VOC colormap."""
    palette = torch.tensor(d2l.VOC_COLORMAP, device=devices[0])
    indices = pred.long()
    return palette[indices, :]
# --- Review notes (collapsed cells: BiRNN sentiment classifier) ---
# __init__: embedding -> num_layers-deep bidirectional LSTM -> linear
# decoder to 2 classes. The decoder input width is num_hiddens*4 because
# forward() concatenates the first and last time-step outputs, each of
# width 2*num_hiddens (forward + backward directions).
# forward(): inputs.T puts the time axis first (nn.LSTM's default
# seq-first layout); flatten_parameters() compacts weights for cuDNN;
# outputs[0] / outputs[-1] are the first/last time steps concatenated
# into the encoding fed to the decoder.
# Script: builds the net (embed 100, hidden 100, 2 layers), applies
# Xavier init to Linear weights and to every LSTM weight tensor (via the
# private _flat_weights_names/_parameters attributes — fragile across
# torch versions), then fetches GloVe 100-d vectors for the vocabulary.
# NOTE(review): the enclosing class header (class BiRNN(...)) is outside
# this view; both defs above are its methods.
# NOTE(review): prefer isinstance(m, nn.Linear) over type(m) == nn.Linear.
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs): super(BiRNN, self).__init__(**kwargs) self.embedding = nn.Embedding(vocab_size, embed_size) self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True) self.decoder = nn.Linear(num_hiddens*4, 2) def forward(self, inputs): embeddings = self.embedding(inputs.T) self.encoder.flatten_parameters() outputs, _ = self.encoder(embeddings) encoding = torch.cat((outputs[0],outputs[-1]), dim=1) outs = self.decoder(encoding) return outs #%% embed_size, num_hiddens, num_layers, devices = 100, 100, 2, d2l.try_all_gpus() net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers) #%% def init_weights(m): if type(m) == nn.Linear: nn.init.xavier_uniform_(m.weight) if type(m) == nn.LSTM: for param in m._flat_weights_names: if "weight" in param: nn.init.xavier_uniform_(m._parameters[param]) net.apply(init_weights) #%% glove_embedding = d2l.TokenEmbedding('glove.6b.100d') embeds = glove_embedding[vocab.idx_to_token]