import tensorflow as tf


def train_step(inp, tar):
    # shift the target: the decoder is fed all tokens except the last and is
    # trained to predict the sequence shifted one position to the right
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp,
                                     True,
                                     enc_padding_mask,
                                     combined_mask,
                                     dec_padding_mask)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(tar_real, predictions)
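# A minimal sketch of an outer loop that drives train_step above. The names
# train_batches (a tf.data.Dataset of (inp, tar) pairs) and EPOCHS are
# placeholders, and train_loss / train_accuracy are assumed to be the Keras
# metrics referenced inside train_step; none of these are defined here.
import time

EPOCHS = 20  # placeholder

for epoch in range(EPOCHS):
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()

    for batch, (inp, tar) in enumerate(train_batches):
        train_step(inp, tar)
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} '
                  f'Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} took {time.time() - start:.2f} secs')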
import time

import torch.nn.functional as F


def train_model(model, optimizer, train_itr, EN_TEXT, FR_TEXT, epochs=10000, print_every=100):
    # put the model in training mode, because dropout and batch norm behave
    # differently during training and evaluation
    model.train()

    start = time.time()
    prev = start
    total_loss = 0

    # index of the padding token in the target vocabulary, so padded positions
    # do not contribute to the loss (assumes FR_TEXT is a torchtext Field)
    tar_pad = FR_TEXT.vocab.stoi['<pad>']

    for ep in range(epochs):
        for i, batch in enumerate(train_itr):
            src = batch.English.transpose(0, 1)
            tar = batch.French.transpose(0, 1)

            # the French sentence we input has all words except the last,
            # as it is using each word to predict the next
            tar_input = tar[:, :-1]
            targets = tar[:, 1:].contiguous().view(-1)

            src_mask, tar_mask = create_masks(batch, EN_TEXT, FR_TEXT)
            preds = model(src, tar_input, src_mask, tar_mask)

            optimizer.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                   targets, ignore_index=tar_pad)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters"
                      % ((time.time() - start) // 60, ep + 1, i + 1,
                         loss_avg, time.time() - prev, print_every))
                total_loss = 0
                prev = time.time()
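# create_masks is called in train_model above but not defined here. A minimal
# sketch of what it could look like, assuming EN_TEXT / FR_TEXT are torchtext
# Fields and the batch exposes .English / .French tensors as in the loop above:
import torch


def create_masks(batch, EN_TEXT, FR_TEXT):
    src_seq = batch.English.transpose(0, 1)
    tar_seq = batch.French.transpose(0, 1)[:, :-1]

    # source mask: hide padding positions
    src_pad = EN_TEXT.vocab.stoi['<pad>']
    src_mask = (src_seq != src_pad).unsqueeze(-2)

    # target mask: hide padding positions and future (not-yet-predicted) positions
    tar_pad = FR_TEXT.vocab.stoi['<pad>']
    tar_mask = (tar_seq != tar_pad).unsqueeze(-2)
    size = tar_seq.size(1)
    nopeak = torch.triu(torch.ones(1, size, size), diagonal=1).bool()
    tar_mask = tar_mask & ~nopeak

    return src_mask, tar_mask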
def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]

    # the input sentence is Portuguese, hence adding the start and end token
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)

    # as the target is English, the first word given to the transformer should
    # be the English start token
    decoder_input = [tokenizer_en.vocab_size]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(cst.MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)

        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)

        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        # return the result if the predicted_id is equal to the end token
        if predicted_id == tokenizer_en.vocab_size + 1:
            return tf.squeeze(output, axis=0), attention_weights

        # concatenate the predicted_id to the output, which is given to the
        # decoder as its input
        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights
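# A small, hypothetical wrapper around evaluate() showing how the returned
# token ids would typically be turned back into text. It assumes tokenizer_en
# is the same subword tokenizer used above, whose decode() accepts a list of
# ids below vocab_size; the start/end ids added during decoding are skipped.
def translate(sentence):
    result, attention_weights = evaluate(sentence)

    # drop the start/end tokens, which lie outside the subword vocabulary
    predicted_ids = [int(i) for i in result if i < tokenizer_en.vocab_size]
    predicted_sentence = tokenizer_en.decode(predicted_ids)

    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    return predicted_sentence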
import torch
from tqdm import tqdm


def train_model(transformer, eventer, dataset, test_dataset, epochs, criterion,
                optimizer, SRC, TRG):
    print("training model...")
    # eval_loss1 = eval(transformer, eventer, test_dataset)
    # eval_loss2 = shuffle_eval(transformer, eventer, test_dataset)
    for epoch in range(epochs):
        transformer.train()
        eventer.train()
        cur_lr = get_lr(optimizer)
        print("Current lr ", cur_lr)
        total_loss = []
        for index, (srcs, srcs_len, trgs, trgs_len, mask_tok, root) in enumerate(tqdm(dataset)):
            sent_num = srcs.size(1)
            srcs = srcs.cuda()
            mask_tok = mask_tok.cuda()
            root = root.cuda()  # B * S
            trgs = trgs.cuda()
            trg_input = trgs[:, :-1]
            trg_input = trg_input.cuda()

            # build one padding mask per source sentence, plus the target mask
            src_masks = [None] * sent_num
            trg_mask = None
            for i in range(sent_num):
                src_masks[i], trg_mask = create_masks(srcs[:, i], trg_input)
            for i in range(sent_num):
                src_masks[i] = src_masks[i].squeeze().cuda()
            src_masks = torch.stack([m for m in src_masks], dim=1)
            src_word_masks = src_masks.view(src_masks.size(0), 1, -1)
            # print(src_word_masks.size())
            src_word_tok_masks = src_word_masks.repeat(1, src_word_masks.size(2), 1)
            # print("word_mask", src_word_tok_masks[0][0])
            # print("mask_tok", mask_tok[0][0])
            mask_tok = mask_tok * src_word_tok_masks.long()
            # print("mask_tok", mask_tok[0][0])
            trg_mask = trg_mask.cuda()

            # encode every source sentence separately, then run the eventer
            # over the concatenated sentence encodings
            events = [None] * sent_num
            for i in range(sent_num):
                events[i] = transformer.encoder(srcs[:, i], src_masks[:, i].unsqueeze(1))
                # print(events[i].size())
                # print(src_masks[0, i])
                # events[i] = pool(events[i], src_masks[:, i])
            eventers = torch.cat([e for e in events], dim=-2)
            eventers = eventer(eventers, mask_tok)
            eventers = eventers.view(eventers.size(0), sent_num, -1, eventers.size(2))

            # gather the features and mask of the root sentence
            feat_root = root.view(-1, 1, 1, 1).expand(-1, 1, eventers.size(2), eventers.size(3))
            root_feat = torch.gather(eventers, 1, feat_root).squeeze(1)
            mask_root = root.view(-1, 1, 1).expand(-1, 1, src_masks.size(2))
            root_masks = torch.gather(src_masks, 1, mask_root)
            # print(root_feat.size(), root_masks.size())

            # pred = transformer.out(transformer.decoder(trg_input, sent_feats, mask_sent.unsqueeze(1), trg_mask)[0])
            pred = transformer.out(
                transformer.decoder(trg_input, root_feat, root_masks, trg_mask)[0])

            ys = trgs[:, 1:].contiguous().view(-1).cuda()
            optimizer.zero_grad()
            loss = criterion(pred.view(-1, pred.size(-1)), ys)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(transformer.parameters(), 0.1)
            torch.nn.utils.clip_grad_norm_(eventer.parameters(), 0.1)
            optimizer.step()
            total_loss.append(loss.item())

        print(f"Epoch {epoch} training loss : ", sum(total_loss) / len(total_loss))
        eval_loss1 = eval(transformer, eventer, test_dataset)
        eval_loss2 = shuffle_eval(transformer, eventer, test_dataset)
        print(f"Epoch {epoch} evaluation loss : ", eval_loss1, eval_loss2)
        torch.save(transformer.state_dict(), f'models/transformer{epoch}.pth')
        torch.save(eventer.state_dict(), f'models/eventer{epoch}.pth')
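# get_lr is referenced in train_model above but not defined here. A minimal
# sketch, assuming a standard torch.optim optimizer, reads the learning rate
# off the first parameter group:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']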
import math

import torch
import torch.nn.functional as F


def init_vars(src, root, mask_tok, transformer, eventer, SRC, TRG, beam_size, max_len):
    # index of the start-of-sentence token in the target vocab
    init_tok = TRG.vocab.stoi['<sos>']
    outputs = torch.LongTensor([[init_tok]])
    outputs = outputs.cuda()
    trg_mask = nopeak_mask(1)
    trg_mask = trg_mask.cuda()

    ##################################################
    # pad the source sentences into a single 1 x sent_num x max_src_len tensor
    src_lens_flat = [len(s) for s in src]
    src_toks = torch.zeros(1, len(src), max(src_lens_flat)).long()
    for i, s in enumerate(src):
        end = src_lens_flat[i]
        src_toks[0, i, :end] = torch.LongTensor(s[:end])
    src = src_toks
    src = src.cuda()
    sent_num = src.size(1)

    mask_tok = torch.LongTensor(mask_tok).unsqueeze(0)
    # print(mask_tok.size())
    mask_tok = tile(mask_tok, 1, max(src_lens_flat))
    mask_tok = tile(mask_tok, 2, max(src_lens_flat))
    mask_tok = mask_tok.cuda()  # B * S

    src_masks = [None] * sent_num
    for i in range(sent_num):
        src_masks[i], _ = create_masks(src[:, i], None)
    for i in range(sent_num):
        src_masks[i] = src_masks[i].squeeze(1).cuda()
    src_masks = torch.stack([m for m in src_masks], dim=1)
    src_word_masks = src_masks.view(src_masks.size(0), 1, -1)
    # print("src_word_masks", src_word_masks.size())

    # encode each source sentence, then run the eventer over the concatenation
    events = [None] * sent_num
    for i in range(sent_num):
        events[i] = transformer.encoder(src[:, i], src_masks[:, i].unsqueeze(1))
    eventers = torch.cat([e for e in events], dim=-2)
    eventers = eventer(eventers, mask_tok)
    eventers = eventers.view(eventers.size(0), sent_num, -1, eventers.size(2))
    # print(eventers.size())

    # gather the features and mask of the root sentence
    root = torch.LongTensor([root]).cuda()
    feat_root = root.view(-1, 1, 1, 1).expand(-1, 1, eventers.size(2), eventers.size(3))
    root_feat = torch.gather(eventers, 1, feat_root).squeeze(1)
    mask_root = root.view(-1, 1, 1).expand(-1, 1, src_masks.size(2))
    root_masks = torch.gather(src_masks, 1, mask_root)
    # print(root_feat.size(), root_masks.size())

    # pred = transformer.out(transformer.decoder(trg_input, sent_feats, mask_sent.unsqueeze(1), trg_mask)[0])
    # print(root_masks.data.cpu().numpy())
    pred = transformer.out(
        transformer.decoder(outputs, root_feat, root_masks, trg_mask)[0])

    # take the top beam_size candidates for the first decoded token
    out = F.softmax(pred, dim=-1)
    probs, ix = out[:, -1].data.topk(beam_size)
    log_scores = torch.Tensor([math.log(prob) for prob in probs.data[0]]).unsqueeze(0)

    outputs = torch.zeros(beam_size, max_len).long()
    outputs = outputs.cuda()
    outputs[:, 0] = init_tok
    outputs[:, 1] = ix[0]

    e_outputs = torch.zeros(beam_size, eventers.size(-2), eventers.size(-1))
    e_outputs = e_outputs.cuda()
    e_outputs[:, :] = root_feat
    return outputs, e_outputs, log_scores, root_masks
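# nopeak_mask is referenced in init_vars above but not defined here. A minimal
# sketch of the usual "no peeking ahead" (subsequent-position) mask, assuming
# the decoder expects a boolean mask of shape 1 x size x size that is True
# where attention is allowed:
import numpy as np
import torch


def nopeak_mask(size):
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    return torch.from_numpy(np_mask) == 0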