def train(net_D, net_G, data_iter, num_epochs, lr, latent_dim,
          device=d2l.try_gpu()):
    """Train a discriminator/generator pair with Adam.

    Args:
        net_D: Discriminator module, updated through ``update_D``.
        net_G: Generator module, updated through ``update_G``.
        data_iter: Iterable of ``(X, _)`` batches; the second item is ignored.
        num_epochs: Number of passes over ``data_iter``.
        lr: Learning rate shared by both Adam optimizers.
        latent_dim: Size of axis 1 of the noise tensor ``Z``, which has shape
            ``(batch_size, latent_dim, 1, 1)``.
        device: Device the networks and every batch are moved to.

    Weight initialisation is not performed here; the networks are trained
    from whatever state the caller passes in.
    """
    loss = torch.nn.BCEWithLogitsLoss()
    # Previously `device` was only printed: the networks and the batches were
    # never moved, so training silently stayed wherever the caller's tensors
    # lived. Move the networks once, before the optimizers are built.
    net_D, net_G = net_D.to(device), net_G.to(device)
    trainer_hp = {'lr': lr, 'betas': [0.5, 0.999]}
    trainer_D = torch.optim.Adam(net_D.parameters(), **trainer_hp)
    trainer_G = torch.optim.Adam(net_G.parameters(), **trainer_hp)
    for epoch in range(1, num_epochs + 1):
        print('Epoch', epoch)
        # Train one epoch
        timer = d2l.Timer()
        metric = d2l.Accumulator(3)  # loss_D, loss_G, num_examples
        for X, _ in data_iter:
            print('Processing batch')
            batch_size = X.shape[0]
            # Sample on the CPU and then move, so the noise sequence does not
            # depend on which device is used.
            Z = torch.normal(0, 1, size=(batch_size, latent_dim, 1, 1))
            X, Z = X.to(device), Z.to(device)
            metric.add(update_D(X, Z, net_D, net_G, loss, trainer_D),
                       update_G(Z, net_D, net_G, loss, trainer_G),
                       batch_size)
        # Show the losses
        loss_D, loss_G = metric[0] / metric[2], metric[1] / metric[2]
        print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}')
    print(f'loss_D {loss_D:.3f}, loss_G {loss_G:.3f}, '
          f'{metric[2] / timer.stop():.1f} examples/sec on {str(device)}')
def train_epoch_ch8(model, train_iter, loss, updater, device,  #@save
                    use_random_iter):
    """Run one training epoch of a sequence model (defined in Chapter 8).

    Returns:
        A pair ``(exp(mean loss), tokens processed per second)``.
    """
    hidden, stopwatch = None, d2l.Timer()
    totals = d2l.Accumulator(2)  # summed training loss, number of tokens
    optimizer_driven = isinstance(updater, torch.optim.Optimizer)
    for X, Y in train_iter:
        if hidden is None or use_random_iter:
            # First batch, or random sampling: start from a fresh state.
            hidden = model.begin_state(batch_size=X.shape[0], device=device)
        elif isinstance(model, nn.Module) and not isinstance(hidden, tuple):
            # A single tensor (the `nn.GRU` case): detach it in place.
            hidden.detach_()
        else:
            # A tuple of tensors (`nn.LSTM` or the from-scratch model).
            for part in hidden:
                part.detach_()
        targets = Y.T.reshape(-1)
        X, targets = X.to(device), targets.to(device)
        logits, hidden = model(X, hidden)
        batch_loss = loss(logits, targets.long()).mean()
        if optimizer_driven:
            updater.zero_grad()
        batch_loss.backward()
        grad_clipping(model, 1)
        if optimizer_driven:
            updater.step()
        else:
            # The loss is already a mean, so no further batch scaling.
            updater(batch_size=1)
        token_count = d2l.size(targets)
        totals.add(batch_loss * token_count, token_count)
    return math.exp(totals[0] / totals[1]), totals[1] / stopwatch.stop()
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """Train `net` for one epoch.

    Returns:
        ``(exp(total loss / total targets), targets processed per second)``.
    """
    def cut_history(carried):
        # Detach the carried state in place; it is either a single tensor or
        # an iterable of tensors.
        if isinstance(net, nn.Module) and not isinstance(carried, tuple):
            carried.detach_()
            return
        for piece in carried:
            piece.detach_()

    state = None
    timer = d2l.Timer()
    metric = d2l.Accumulator(2)  # loss sum, target count
    for X, Y in train_iter:
        if state is None or use_random_iter:
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            cut_history(state)
        y = Y.T.reshape(-1)
        X = X.to(device)
        y = y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        with_optimizer = isinstance(updater, torch.optim.Optimizer)
        if with_optimizer:
            updater.zero_grad()
        l.backward()
        grad_clipping(net, 1)
        if with_optimizer:
            updater.step()
        else:
            updater(batch_size=1)
        count = y.numel()
        metric.add(l * count, count)
    perplexity = math.exp(metric[0] / metric[1])
    return perplexity, metric[1] / timer.stop()
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """Train `net` for a single epoch over `train_iter`.

    Returns:
        ``(exp(mean loss per token), tokens per second)``.
    """
    state = None
    timer = d2l.Timer()
    metric = d2l.Accumulator(2)  # sum of training loss, number of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialise `state` on the first iteration or when random
            # sampling is used (this is the initial hidden state).
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            # For `nn.GRU` the state is one tensor; for `nn.LSTM` and the
            # from-scratch model it is a tuple of tensors.
            is_single = (isinstance(net, nn.Module)
                         and not isinstance(state, tuple))
            for tensor in ((state,) if is_single else state):
                tensor.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)  # invokes `net.__call__`
        l = loss(y_hat, y.long()).mean()
        if not isinstance(updater, torch.optim.Optimizer):
            l.backward()
            grad_clipping(net, 1)
            # `mean` has already been applied to the loss.
            updater(batch_size=1)
        else:
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        num_tokens = y.numel()
        metric.add(l * num_tokens, num_tokens)
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
def train(net, data_iter, lr, num_epochs, device=d2l.try_gpu()):
    """Train an embedding model with Adam and plot the running loss.

    Args:
        net: Indexable container whose items 0 and 1 are passed to
            ``skip_gram``; its ``nn.Embedding`` layers are re-initialised
            with Xavier-uniform weights.
        data_iter: Sized iterable of ``(center, context_negative, mask,
            label)`` batches.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over ``data_iter``.
        device: Device the model and batches are moved to.

    ``skip_gram`` and ``loss`` are looked up from the enclosing module.
    """
    def init_weights(m):
        if type(m) == nn.Embedding:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs])
    metric = d2l.Accumulator(2)  # sum of losses, no. of loss entries
    num_batches = len(data_iter)
    # With fewer than 5 batches `num_batches // 5` is 0 and the modulo below
    # would raise ZeroDivisionError; plot on every batch in that case.
    plot_every = max(1, num_batches // 5)
    # `metric` accumulates over all epochs, so the timer must span all epochs
    # too; otherwise the reported throughput is inflated.
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        for i, batch in enumerate(data_iter):
            optimizer.zero_grad()
            center, context_negative, mask, label = [
                data.to(device) for data in batch
            ]
            pred = skip_gram(center, context_negative, net[0], net[1])
            # Rescale each row by (row length / number of unmasked entries).
            l = (loss(pred.reshape(label.shape).float(), label.float(), mask)
                 / mask.sum(axis=1) * mask.shape[1])
            l.sum().backward()
            optimizer.step()
            metric.add(l.sum(), l.numel())
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, '
          f'{metric[1] / timer.stop():.1f} tokens/sec on {str(device)}')
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Return the accuracy of `net` over every batch in `data_iter`.

    Args:
        net: Model to evaluate; it is switched to evaluation mode.
        data_iter: Iterable of ``(X, y)`` batches.
        device: Device to run on. When falsy, the device of the first model
            parameter is used.

    Returns:
        Sum of ``d2l.accuracy`` over the batches divided by the total number
        of label elements.
    """
    # Function-scope import: the original body never referenced `torch`.
    import torch

    net.eval()
    if not device:
        device = next(iter(net.parameters())).device
    metric = d2l.Accumulator(2)  # accuracy sum, number of labels
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every forward pass.
    with torch.no_grad():
        for X, y in data_iter:
            X, y = X.to(device), y.to(device)
            metric.add(d2l.accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period, lr_decay):
    """Train `net` on batches of image pairs with SGD and a step LR schedule.

    NOTE(review): this function is unfinished. It stops at an `ipdb`
    breakpoint on every batch, and the loss is computed on `output`, which is
    never assigned here (only `output1s` and `output2s` are). Unless a global
    named `output` exists, the loss line raises NameError.

    `loss` and `evaluate_loss` are not defined in this function and are
    looked up from the enclosing module.
    """
    # Wrap the model for multi-device execution; only parameters with
    # `requires_grad` set are handed to the optimizer.
    # Only train the small custom output network
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD(
        (param for param in net.parameters() if param.requires_grad),
        lr=lr, momentum=0.9, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)  # loss sum, number of examples
        for i, (img1s, img2s, labels) in enumerate(train_iter):
            timer.start()
            # push onto gpu
            img1s = img1s.to(devices[0])
            img2s = img2s.to(devices[0])
            labels = labels.to(devices[0])
            trainer.zero_grad()
            # NOTE(review): debugger breakpoint left in the training loop.
            import ipdb
            ipdb.set_trace()  # TODO BREAKPOINT
            # TODO YOU ARE HERE - choose how to cat the imgs
            # Then prepare the network to accept...
            # ipdb> torch.cat((img1s, img2s), 1).shape
            # torch.Size([4, 6, 224, 224])
            # ipdb> torch.cat((img1s, img2s), 0).shape
            # torch.Size([8, 3, 224, 224])
            # ipdb> torch.cat((img1s, img2s), 3).shape
            # torch.Size([4, 3, 224, 448])
            output1s = net(img1s)
            output2s = net(img2s)
            # NOTE(review): `output` is undefined; how `output1s` and
            # `output2s` should be combined has not been decided yet.
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            # NOTE(review): `num_batches // 5` is 0 when there are fewer than
            # 5 batches, which makes this modulo raise ZeroDivisionError.
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
def evaluate_loss(net, data_iter, loss):  #@save
    """Return the mean loss of `net` over all batches in `data_iter`.

    Labels are reshaped to the shape of the model output before `loss` is
    applied; the result is the summed loss divided by the number of loss
    elements.
    """
    totals = d2l.Accumulator(2)  # running loss sum, running element count
    for features, targets in data_iter:
        predictions = net(features)
        batch_loss = loss(predictions, targets.reshape(predictions.shape))
        totals.add(batch_loss.sum(), batch_loss.numel())
    loss_sum, count = totals[0], totals[1]
    return loss_sum / count
def train_model(net, train_iter, test_iter, num_epochs, lr,
                device=d2l.try_gpu()):
    """Train and evaluate a single-logit binary classifier with CPU or GPU.

    Args:
        net: Model whose output is passed through a sigmoid and compared with
            labels reshaped to ``(batch, 1)``.
        train_iter: Sized iterable of ``(X, y)`` training batches.
        test_iter: Iterable passed to ``evaluate_accuracy_gpu`` each epoch.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD.
        device: Device the model and batches are moved to.
    """
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.BCELoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(3)  # train_loss, train_acc, num_examples
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X = X.float()
            X, y = X.to(device), y.to(device)
            output = net(X)
            y_hat = torch.sigmoid(output)
            # Labels as a float column so they line up with `y_hat`.
            y = torch.unsqueeze(y.to(torch.float), 1)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # `d2l.accuracy` compares single-column predictions with the
                # labels element-wise, so probabilities must be thresholded
                # to 0/1 first; comparing raw sigmoid outputs almost never
                # matches and reported near-zero training accuracy.
                metric.add(l * X.shape[0],
                           d2l.accuracy(torch.round(y_hat), y), X.shape[0])
            timer.stop()
            train_loss, train_acc = metric[0] / metric[2], metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                animator.add(epoch + i / len(train_iter),
                             (train_loss, train_acc, None))
                print(
                    "BatchNo.=%3i, Epoch No.=%3i, loss=%.3f, train acc=%.3f" %
                    (i + 1, epoch + 1, train_loss, train_acc))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        print("test_acc=", test_acc)
        animator.add(epoch + 1, (None, None, test_acc))
    print('loss %.3f, train acc %.3f, test acc %.3f' %
          (train_loss, train_acc, test_acc))
    print('%.1f examples/sec on %s' %
          (metric[2] * num_epochs / timer.sum(), device))
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Return the accuracy of a single-logit binary classifier on a dataset.

    The model output is passed through a sigmoid, rounded to 0/1 and compared
    with the labels. Correct predictions are counted over *all* batches; the
    previous version returned the accuracy of the last batch only.

    Args:
        net: Model to evaluate. A ``torch.nn.Module`` is switched to
            evaluation mode.
        data_iter: Iterable of ``(X, y)`` batches.
        device: Device to run on. When falsy, the device of the first model
            parameter is used.

    Returns:
        Number of correct predictions divided by the number of labels.

    Raises:
        ValueError: If ``data_iter`` yields no examples.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()
    if not device:
        device = next(iter(net.parameters())).device
    num_correct, num_examples = 0, 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for X, y in data_iter:
            X, y = X.float().to(device), y.to(device)
            # Reshape to the label shape so `==` compares element-wise
            # instead of broadcasting when the output has a trailing axis.
            preds = torch.round(torch.sigmoid(net(X))).reshape(y.shape)
            num_correct += (preds == y).sum().item()
            num_examples += y.numel()
    if num_examples == 0:
        raise ValueError('data_iter yielded no examples')
    return num_correct / num_examples
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6) and save checkpoints.

    Defined in :numref:`sec_lenet`

    Args:
        net: Model to train.
        train_iter: Sized iterable of ``(X, y)`` training batches.
        test_iter: Test batches, or ``None`` to skip evaluation (the test
            accuracy is then reported as 0).
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD.
        device: Device the model and batches are moved to.

    The model's ``state_dict`` is written to ``net_save_dir`` roughly every
    tenth of the run and after the last epoch. ``net_save_dir`` is not a
    parameter; it is read from the enclosing module.
    """
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    num_batches = len(train_iter)
    # `max(1, ...)` keeps the modulo tests below valid for short runs.
    multiplier_anim = max(1, num_batches // 5)
    multiplier_save = max(1, num_epochs // 10)
    for epoch in tqdm(range(num_epochs)):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % multiplier_anim == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        if test_iter is not None:
            test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        else:
            test_acc = 0
        animator.add(epoch + 1, (None, None, test_acc))
        # Save the net state_dict. The original test used the undefined name
        # `epochs`; the last-epoch check belongs to `num_epochs`.
        if epoch == num_epochs - 1 or (epoch + 1) % multiplier_save == 0:
            net_param_file = 'net_param_{:d}.pth'.format(epoch)
            path_net = os.path.join(net_save_dir, net_param_file)
            torch.save(net.state_dict(), path_net)
            print('net param saved to \n\t{}'.format(path_net))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence.

    Only the first batch of each epoch uses teacher forcing; every later
    batch feeds the decoder the arg-max of the previous batch's output.

    Args:
        net: Encoder-decoder model called as ``net(X, dec_input, X_valid_len)``.
        data_iter: Iterable of ``(X, X_valid_len, Y, Y_valid_len)`` batches.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over ``data_iter``.
        tgt_vocab: Target vocabulary; must map ``'<bos>'`` to an index.
        device: Device the model and batches are moved to.
    """
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])

    net.apply(xavier_init_weights)
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    net.train()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[10, num_epochs])
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        first = True
        for batch in data_iter:
            # Clear gradients left over from the previous batch; without this
            # every step applied the sum of all gradients seen so far.
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            # 4. In training, replace teacher forcing with feeding the
            # prediction at the previous time step into the decoder. How does
            # this influence the performance?
            if first:
                bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                                   device=device).reshape(-1, 1)
                dec_input = d2l.concat([bos, Y[:, :-1]], 1)  # Teacher forcing
                first = False
            else:
                # NOTE(review): `Y_hat` here comes from the *previous batch*,
                # i.e. from different sentences, and the slice below only
                # handles a smaller current batch, not a larger one. Confirm
                # this is the intended experiment.
                dec_input = Y_hat.argmax(dim=2)
                dec_input = dec_input[:X.shape[0]]
            Y_hat, _ = net(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(net, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            animator.add(epoch + 1, (metric[0] / metric[1], ))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Compute the accuracy for a model on a dataset using a GPU.

    Args:
        net: Model to evaluate. A ``torch.nn.Module`` is switched to
            evaluation mode.
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a list of
            tensors.
        device: Device to run on. When falsy and ``net`` is a module, the
            device of its first parameter is used.

    Returns:
        Sum of ``d2l.accuracy`` over the batches divided by the total number
        of label elements.
    """
    if isinstance(net, torch.nn.Module):
        net.eval()  # Set the model to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device
    # No. of correct predictions, no. of predictions
    metric = d2l.Accumulator(2)
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every forward pass.
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                # Required for BERT Fine-tuning (to be covered later)
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Compute the accuracy of a model on a dataset using a GPU.

    Returns the sum of ``d2l.accuracy`` over all batches divided by the
    total number of label elements. Runs with autograd disabled.
    """
    if isinstance(net, nn.Module):
        net.eval()  # switch to evaluation mode
        if not device:
            first_param = next(iter(net.parameters()))
            device = first_param.device
    tally = d2l.Accumulator(2)  # correct predictions, total predictions
    with torch.no_grad():
        for inputs, labels in data_iter:
            if not isinstance(inputs, list):
                inputs = inputs.to(device)
            else:
                # Needed for BERT fine-tuning (introduced later).
                inputs = [part.to(device) for part in inputs]
            labels = labels.to(device)
            hits = d2l.accuracy(net(inputs), labels)
            tally.add(hits, labels.numel())
    return tally[0] / tally[1]
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    """Train `net` for `num_steps` optimizer steps, cycling over `train_iter`.

    The model is wrapped in ``nn.DataParallel`` over `devices`, and batches
    are moved to ``devices[0]``. The running mean of the two losses returned
    by ``_get_batch_loss_bert`` is plotted after every step.
    """
    main_device = devices[0]
    net = nn.DataParallel(net, device_ids=devices).to(main_device)
    trainer = torch.optim.Adam(net.parameters(), lr=1e-3)
    step = 0
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss',
                            xlim=[1, num_steps], legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence
    # prediction losses, no. of sentence pairs, no. of steps
    totals = d2l.Accumulator(4)
    while step < num_steps:
        for batch in train_iter:
            (tokens_X, segments_X, valid_lens_x, pred_positions_X,
             mlm_weights_X, mlm_Y, nsp_y) = batch
            tokens_X = tokens_X.to(main_device)
            segments_X = segments_X.to(main_device)
            valid_lens_x = valid_lens_x.to(main_device)
            pred_positions_X = pred_positions_X.to(main_device)
            mlm_weights_X = mlm_weights_X.to(main_device)
            mlm_Y = mlm_Y.to(main_device)
            nsp_y = nsp_y.to(main_device)
            trainer.zero_grad()
            timer.start()
            mlm_l, nsp_l, l = _get_batch_loss_bert(
                net, loss, vocab_size, tokens_X, segments_X, valid_lens_x,
                pred_positions_X, mlm_weights_X, mlm_Y, nsp_y)
            l.backward()
            trainer.step()
            totals.add(mlm_l, nsp_l, tokens_X.shape[0], 1)
            timer.stop()
            mean_mlm = totals[0] / totals[3]
            mean_nsp = totals[1] / totals[3]
            animator.add(step + 1, (mean_mlm, mean_nsp))
            step += 1
            if step == num_steps:
                # Budget reached; the `while` test then ends the outer loop.
                break

    print(f'MLM loss {totals[0] / totals[3]:.3f}, '
          f'NSP loss {totals[1] / totals[3]:.3f}')
    print(f'{totals[2] / timer.sum():.1f} sentences pairs/sec on '
          f'{str(devices)}')
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=d2l.try_gpu()):
    """Train a model with a GPU (defined in Chapter 6).

    Args:
        net: Model to train; its ``nn.Linear`` and ``nn.Conv2d`` layers are
            re-initialised with Xavier-uniform weights.
        train_iter: Sized iterable of ``(X, y)`` training batches.
        test_iter: Iterable passed to ``evaluate_accuracy_gpu`` each epoch.
        num_epochs: Number of passes over ``train_iter``.
        lr: Learning rate for SGD.
        device: Device the model and batches are moved to.
    """
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)

    net.apply(init_weights)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    # With fewer than 5 batches `num_batches // 5` is 0 and the modulo below
    # would raise ZeroDivisionError; plot on every batch in that case.
    plot_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
    plt.show()
def train_func(net, train_iter, test_iter, num_epochs, lr,
               device=d2l.try_gpu()):
    """Train `net` with SGD and cross-entropy, printing progress to stdout.

    Linear and Conv2d layers are re-initialised with Xavier-uniform weights.
    Running loss and accuracy are printed every 50 batches, and test
    accuracy is evaluated after each epoch.
    """
    def xavier(layer):
        if type(layer) in (nn.Linear, nn.Conv2d):
            torch.nn.init.xavier_uniform_(layer.weight)

    net.apply(xavier)
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        # loss sum, accuracy sum, number of examples
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            net.train()
            optimizer.zero_grad()
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = criterion(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                n = X.shape[0]
                metric.add(l * n, d2l.accuracy(y_hat, y), n)
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % 50 != 0:
                continue
            print(f"epoch: {epoch} --- iter: {i} --- of {len(train_iter)}")
            print(f"train loss: {train_loss} --- train acc: {train_acc}")
        test_acc = evaluate_accuracy_gpu(net, test_iter)
    print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
def train_bert(train_iter, net, loss, vocab_size, devices, num_steps):
    """Train `net` for `num_steps` optimizer steps, cycling over `train_iter`.

    Args:
        train_iter: Iterable of batches handed whole to
            ``_get_batch_loss_bert``.
        net: Model to train; its ``nn.Linear`` weights are re-initialised
            with Xavier-uniform values.
        loss: Not used by this body.
        vocab_size: Not used by this body.
        devices: Only used in the final report line.
        num_steps: Number of optimizer steps to run.
    """
    def xavier_init_weights(m):
        if type(m) is nn.Linear and len(m.weight.shape) > 1:
            torch.nn.init.xavier_uniform_(m.weight)

    net.apply(xavier_init_weights)
    optimizer = torch.optim.Adam(net.parameters())
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss',
                            xlim=[1, num_steps], legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for batch in train_iter:
            timer.start()
            optimizer.zero_grad()
            mlm_ls, nsp_ls, ls = _get_batch_loss_bert(batch)
            for l in ls:
                l.backward()
            optimizer.step()
            # `metric` was never updated, so the divisions by `metric[3]`
            # below raised ZeroDivisionError on the very first step.
            # Assumes `mlm_ls` / `nsp_ls` are non-empty iterables of scalar
            # losses (like `ls`) and that `len(batch[0])` is the number of
            # sentence pairs in the batch - TODO confirm against
            # `_get_batch_loss_bert` and the data loader.
            mlm_values = [float(l) for l in mlm_ls]
            nsp_values = [float(l) for l in nsp_ls]
            metric.add(sum(mlm_values) / len(mlm_values),
                       sum(nsp_values) / len(nsp_values),
                       len(batch[0]), 1)
            timer.stop()
            animator.add(step + 1,
                         (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break

    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')
def train_epoch(self, net, train_iter, loss, updater, device, use_random_iter):
    """Train `net` for one epoch (adapted from Chapter 8).

    Inputs and targets are cast to float32, and the input gets one extra
    leading axis before it is fed to the network.

    Returns:
        ``(exp(accumulated loss / accumulated target count), targets per second)``.
    """
    hidden = None
    stopwatch = d2l.Timer()
    totals = d2l.Accumulator(2)  # summed training loss, number of targets
    optimizer_driven = isinstance(updater, torch.optim.Optimizer)
    for X, Y in train_iter:
        X = X.to(torch.float32)  # TODO is this necessary?
        Y = Y.to(torch.float32)
        # Prepend one axis; the original note labels the resulting layout
        # [direction, batch_size, seq_len].
        X = X.reshape(-1, *X.shape)
        if hidden is None or use_random_iter:
            # First batch, or random sampling: start from a fresh state.
            hidden = net.begin_state(batch_size=X.shape[1], device=device)
        elif isinstance(net, nn.Module) and not isinstance(hidden, tuple):
            # A single tensor (the `nn.GRU` case).
            hidden.detach_()
        else:
            # A tuple of tensors (`nn.LSTM` or the from-scratch model).
            for part in hidden:
                part.detach_()
        X = X.to(device)
        Y = Y.to(device)
        y_hat, hidden = net(X, hidden)
        batch_loss = loss(y_hat, Y).mean()
        if optimizer_driven:
            updater.zero_grad()
        batch_loss.backward()
        self.grad_clipping(net, 1)
        if optimizer_driven:
            updater.step()
        else:
            # The loss is already a mean, so no further batch scaling.
            updater(batch_size=1)
        n_targets = d2l.size(Y)
        totals.add(batch_loss * n_targets, n_targets)
    return math.exp(totals[0] / totals[1]), totals[1] / stopwatch.stop()
def train(net, train_iter, valid_iter, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    """Train the trainable parameters of `net` with SGD and a step LR schedule.

    Args:
        net: Model; wrapped in ``nn.DataParallel`` over `devices`.
        train_iter: Sized iterable of ``(features, labels)`` batches.
        valid_iter: Validation batches, or ``None`` to skip validation.
        num_epochs: Number of passes over ``train_iter``.
        lr: Initial learning rate.
        wd: Weight decay.
        devices: Device list; batches are moved to ``devices[0]``.
        lr_period: ``step_size`` of the ``StepLR`` scheduler.
        lr_decay: ``gamma`` of the ``StepLR`` scheduler.

    ``loss`` and ``evaluate_loss`` are looked up from the enclosing module.
    """
    # Only train the small custom output network: the optimizer receives
    # just the parameters that have `requires_grad` set.
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    trainer = torch.optim.SGD(
        (param for param in net.parameters() if param.requires_grad),
        lr=lr, momentum=0.9, weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(trainer, lr_period, lr_decay)
    num_batches, timer = len(train_iter), d2l.Timer()
    # With fewer than 5 batches `num_batches // 5` is 0 and the modulo below
    # would raise ZeroDivisionError; plot on every batch in that case.
    plot_every = max(1, num_batches // 5)
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'valid loss'])
    for epoch in range(num_epochs):
        metric = d2l.Accumulator(2)  # loss sum, number of examples
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            features, labels = features.to(devices[0]), labels.to(devices[0])
            trainer.zero_grad()
            output = net(features)
            l = loss(output, labels).sum()
            l.backward()
            trainer.step()
            metric.add(l, labels.shape[0])
            timer.stop()
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[1], None))
        if valid_iter is not None:
            valid_loss = evaluate_loss(valid_iter, net, devices)
            animator.add(epoch + 1, (None, valid_loss))
        scheduler.step()
    if valid_iter is not None:
        print(f'train loss {metric[0] / metric[1]:.3f}, '
              f'valid loss {valid_loss:.3f}')
    else:
        print(f'train loss {metric[0] / metric[1]:.3f}')
    print(f'{metric[1] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')
def train_s2s_ch9(model, data_iter, lr, num_epochs, tgt_vocab, device):
    """Train a model for sequence to sequence (defined in Chapter 9).

    Args:
        model: Encoder-decoder called as ``model(X, dec_input, X_valid_len)``.
        data_iter: Iterable of ``(X, X_valid_len, Y, Y_valid_len)`` batches.
        lr: Learning rate for Adam.
        num_epochs: Number of passes over ``data_iter``.
        tgt_vocab: Target vocabulary; must map ``'<bos>'`` to an index.
        device: Device the model and batches are moved to.
    """
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            torch.nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    torch.nn.init.xavier_uniform_(m._parameters[param])

    model.apply(xavier_init_weights)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    model.train()
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            # Clear gradients left over from the previous batch; without this
            # every step applied the sum of all gradients seen so far.
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            # Teacher forcing: the decoder input is the target sequence
            # shifted right by one, starting with `<bos>`.
            dec_input = torch.cat([bos, Y[:, :-1]], 1)
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2l.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')
def train(resume_training=True):
    """Build and train the Transformer translation model, with checkpoints.

    Args:
        resume_training: When true and ``PATH_MODEL`` exists and is not
            empty, the model, optimizer and starting epoch are restored via
            ``load_checkpoint``. Otherwise ``PATH_MODEL`` is created or
            truncated and training starts from freshly initialised weights.

    ``PATH_MODEL``, ``checkpoint_path``, ``load_checkpoint``,
    ``save_checkpoint``, ``xavier_init_weights`` and the model classes are
    looked up from the enclosing module.
    """
    EMBEDDING_SIZE = 32
    num_hiddens, num_layers, dropout, batch_size, num_steps = (
        EMBEDDING_SIZE, 2, 0.1, 64, 10)
    lr, num_epochs, device = 0.005, 1000, d2lt.try_gpu()
    ffn_num_input, ffn_num_hiddens, num_heads = EMBEDDING_SIZE, 64, 4
    key_size, query_size, value_size = (
        EMBEDDING_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE)
    norm_shape = [EMBEDDING_SIZE]

    ### Load data
    data_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps)
    encoder = TransformerEncoder(len(src_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)
    decoder = TransformerDecoder(len(tgt_vocab), key_size, query_size,
                                 value_size, num_hiddens, norm_shape,
                                 ffn_num_input, ffn_num_hiddens, num_heads,
                                 num_layers, dropout)

    ### Load model
    model = EncoderDecoder(encoder, decoder).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    ### Load checkpoint
    if (resume_training and PATH_MODEL.exists()
            and os.path.getsize(PATH_MODEL) > 0):
        model, optimizer, last_epoch = load_checkpoint(model, optimizer)
        print("Continue training from last checkpoint...")
    else:
        os.makedirs(checkpoint_path, exist_ok=True)
        # Create (or truncate) the checkpoint file.
        with open(PATH_MODEL, 'w'):
            pass
        print(
            'No prior checkpoint existed, created new save files for checkpoint.'
        )
        model.apply(xavier_init_weights)
        last_epoch = 0

    ### Initialize Loss functions
    loss = MaskedSoftmaxCELoss()

    ### Train
    model.train()
    # Stay `None` when the loop body never runs (the restored epoch is
    # already >= num_epochs) so the final report can be skipped instead of
    # raising NameError.
    timer = metric = None
    for epoch in range(last_epoch, num_epochs):
        timer = d2lt.Timer()
        metric = d2lt.Accumulator(2)  # Sum of training loss, no. of tokens
        for batch in data_iter:
            # Clear gradients left over from the previous batch; without this
            # every step applied the sum of all gradients seen so far.
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            bos = torch.tensor([tgt_vocab['<bos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1)  # Teacher forcing
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            l.sum().backward()  # Make the loss scalar for `backward`
            d2lt.grad_clipping(model, 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        if (epoch + 1) % 10 == 0:
            print(f'epoch {epoch + 1} - '
                  f'loss {metric[0] / metric[1]:.5f}')
            ### Save checkpoint
            # NOTE(review): saving together with the progress line (every
            # 10th epoch) is an assumption; confirm the intended cadence and
            # that `load_checkpoint` resumes from the epoch after this one.
            save_checkpoint(epoch, model, optimizer)
    if metric is not None:
        print(f'loss {metric[0] / metric[1]:.5f}, '
              f'{metric[1] / timer.stop():.1f} '
              f'tokens/sec on {str(device)}')