def validate(model, epoch, val_iterator, src_vocab, tgt_vocab, args, writer):
    model.eval()
    losses = 0
    correct_words = 0
    total_words = 0
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(val_iterator), total=len(val_iterator)):
            device = args.device
            src = batch.src.transpose(0, 1).to(device)
            tgt = batch.tgt.transpose(0, 1).to(device)
            src_mask = padding_mask(src, src_vocab)
            # Look up the pad index in the *target* vocabulary, matching
            # train() (the original passed src_vocab here).
            tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
                tgt[:, :-1]).to(device)
            out = model(src, tgt[:, :-1], src_mask, tgt_mask)
            labels = tgt[:, 1:].contiguous().view(-1)
            loss, n_correct = cal_performance(out, labels, tgt_vocab)
            losses += loss.item()
            total_words += tgt[:, 1:].ne(
                tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
            correct_words += n_correct
    print('(Validation) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %'.format(
        ppl=math.exp(losses / total_words),
        accu=100 * correct_words / total_words))
    writer.add_scalar('val_loss', losses / total_words, epoch)
def train(model, epoch, train_iterator, optimizer, src_vocab, tgt_vocab, args, writer):
    model.train()
    losses = 0
    correct_words = 0
    total_words = 0
    for batch_idx, batch in tqdm(enumerate(train_iterator), total=len(train_iterator)):
        device = args.device
        src = batch.src.transpose(0, 1).to(device)
        tgt = batch.tgt.transpose(0, 1).to(device)
        src_mask = padding_mask(src, src_vocab)
        tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
            tgt[:, :-1]).to(device)
        out = model(src, tgt[:, :-1], src_mask, tgt_mask)
        optimizer.zero_grad()
        labels = tgt[:, 1:].contiguous().view(-1)
        loss, n_correct = cal_performance(out, labels, tgt_vocab)
        loss.backward()
        optimizer.step()
        losses += loss.item()
        total_words += tgt[:, 1:].ne(
            tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
        correct_words += n_correct
    print('(Training) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %'.format(
        ppl=math.exp(losses / total_words),
        accu=100 * correct_words / total_words))
    writer.add_scalar('train_loss', losses / total_words, epoch)
    return correct_words / total_words
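# train()/validate()/test() above call a padding_mask(seq, vocab) helper that
# is not shown in these snippets. A minimal sketch, assuming vocab.stoi and
# CONSTANTS['pad'] as used above and a boolean keep-mask of shape (B, 1, L)
# that broadcasts against the (L, L) subsequent mask (note: the decoder
# forward() further below uses a different padding_mask convention):
def padding_mask(seq, vocab):
    # True where the token is not padding; the extra dim lets the mask
    # broadcast over the query positions of the attention scores.
    return (seq != vocab.stoi[CONSTANTS['pad']]).unsqueeze(-2)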
def test(model, test_iterator, src_vocab, tgt_vocab, args, writer):
    model.eval()
    losses = 0
    correct_words = 0
    total_words = 0
    references = []
    hypotheses = []
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(test_iterator), total=len(test_iterator)):
            device = args.device
            src = batch.src.transpose(0, 1).to(device)
            tgt = batch.tgt.transpose(0, 1).to(device)
            src_mask = padding_mask(src, src_vocab)
            # Look up the pad index in the *target* vocabulary, matching
            # train() (the original passed src_vocab here).
            tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
                tgt[:, :-1]).to(device)
            out = model(src, tgt[:, :-1], src_mask, tgt_mask)
            labels = tgt[:, 1:].contiguous().view(-1)
            loss, n_correct = cal_performance(out, labels, tgt_vocab)
            losses += loss.item()
            total_words += tgt[:, 1:].ne(
                tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
            correct_words += n_correct
            # Convert each target sentence into a reference token list
            for idxs in tgt.tolist():
                references.append([[
                    idx for idx in idxs
                    if idx != tgt_vocab.stoi[CONSTANTS['start']]
                    and idx != tgt_vocab.stoi[CONSTANTS['pad']]
                ]])
            # Convert each prediction into a hypothesis token list
            word_idxs = torch.max(out, dim=-1)[1]
            for idxs in word_idxs.tolist():
                hypotheses.append([
                    idx for idx in idxs
                    if idx != tgt_vocab.stoi[CONSTANTS['start']]
                    and idx != tgt_vocab.stoi[CONSTANTS['pad']]
                ])
    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    print('(Test) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f}%, '
          'BLEU-1: {bleu1:3.3f}, BLEU-2: {bleu2:3.3f}'.format(
              ppl=math.exp(losses / total_words),
              accu=100 * correct_words / total_words,
              bleu1=bleu_1,
              bleu2=bleu_2))
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(
            memory, src_mask, Variable(ys),
            Variable(subsequent_mask(ys.size(1)).type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat(
            [ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys
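# Hypothetical usage of greedy_decode, assuming `model` is a trained
# encoder-decoder exposing encode/decode/generator, with made-up token ids
# (1 = <s>, 2 = </s>); a sketch only, not part of the original code:
src = torch.tensor([[1, 5, 9, 4, 2]])
src_mask = torch.ones(1, 1, src.size(1))  # toy batch with no padding
decoded = greedy_decode(model, src, src_mask, max_len=20, start_symbol=1)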
def __init__(self, num_chars, num_words, num_classes, bos_token, **kwargs):
    super().__init__()
    self.transformer_size = kwargs.pop('transformer_size', 512)
    self.bos_token = bos_token
    with self.init_scope():
        self.feature_extractor = ResNet(kwargs.pop('num_layers', 18))
        self.transformer = get_conv_feature_encoder_decoder(
            num_classes, N=1, model_size=self.transformer_size)
        self.classifier = L.Linear(self.transformer_size, num_classes)
    self.mask = subsequent_mask(self.transformer_size)
    self.num_chars = num_chars
    self.num_words = num_words
    chainer.global_config.user_text_recognition_grayscale_input = False
def predict_word(self, dec_seq, src_seq, enc_output, n_active_inst):
    src_mask = padding_mask(src_seq, self.src_vocab)
    tgt_mask = padding_mask(
        dec_seq, self.tgt_vocab) & subsequent_mask(dec_seq).to(self.device)
    dec_seq = self.model.tgt_embedding(dec_seq) * math.sqrt(self.model.d_model)
    dec_seq = self.model.positional_encoder2(dec_seq)
    dec_output = self.model.decoder(dec_seq, enc_output, src_mask, tgt_mask)
    dec_output = dec_output[:, -1, :]
    word_prob = F.log_softmax(self.model.linear(dec_output), dim=1)
    word_prob = word_prob.view(n_active_inst, self.beam_size, -1)
    return word_prob
def __init__(self, vocab_size, max_len, start_symbol, transformer_size=512):
    super().__init__()
    self.vocab_size = vocab_size
    self.max_len = max_len
    self.start_symbol = start_symbol
    self.transformer_size = transformer_size
    with self.init_scope():
        self.model = get_encoder_decoder(
            vocab_size, vocab_size, N=2, model_size=transformer_size)
        self.mask = subsequent_mask(self.transformer_size)
        self.classifier = L.Linear(transformer_size, vocab_size)
def __init__(self, *args, **kwargs):
    self.transformer_size = kwargs.pop('transformer_size', 512)
    super().__init__(*args, **kwargs)
    with self.init_scope():
        positional_encoding, decoder = build_transform_param_decoder(
            N=1, model_size=self.transformer_size)
        self.positional_encoding = positional_encoding
        self.decoder = decoder
        self.param_predictor = L.Linear(self.transformer_size, 6)
        params = self.param_predictor.b.data
        # params[...] = 0
        # params[[0, 4]] = 0.8
        # self.param_predictor.W.data[...] = 0
        self.param_embedder = L.Linear(6, self.transformer_size)
        self.mask = subsequent_mask(self.transformer_size)
def make_std_mask(target: Tensor, pad) -> Tensor:
    """
    Create a mask for `target` hiding both the padding (specified by `pad`)
    and the subsequent words (prevents the token at position i from attending
    to positions > i).

    :param target: Tensor to create a mask for.
    :param pad: token corresponding to padding elements.
    :return: Mask hiding both padding and subsequent elements in target.
    """
    # hide padding
    target_mask = (target != pad).unsqueeze(-2)
    # hide padding and future words
    target_mask = (target_mask
                   & subsequent_mask(target.shape[-1]).type_as(target_mask.data))
    return target_mask
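# subsequent_mask itself is not defined in these snippets. A minimal sketch
# for the size-based calls such as subsequent_mask(target.shape[-1]) above,
# following the common (1, size, size) boolean convention (other snippets
# pass a tensor instead, so treat this signature as an assumption):
import torch

def subsequent_mask(size):
    # Zero out the strict upper triangle so that position i can only
    # attend to positions <= i.
    ones = torch.ones((1, size, size), dtype=torch.uint8)
    return torch.triu(ones, diagonal=1) == 0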
def test_forward(self):
    # Parameters
    batch_size = 64
    sequence_length = 10
    d_k = d_v = d_model = input_size = 512
    d_ff = 2048
    nb_of_decoder_layers = 6

    # Initialize decoder
    decoder_layer = DecoderLayer(
        size=input_size,
        self_attn=MultiHeadAttention(n_head=8, d_model=d_model, d_k=d_k,
                                     d_v=d_v, dropout=0.1),
        memory_attn=MultiHeadAttention(n_head=8, d_model=d_model, d_k=d_k,
                                       d_v=d_v, dropout=0.1),
        feed_forward=PositionwiseFeedForward(d_model=d_model, d_ff=d_ff,
                                             dropout=0.1),
        dropout=0.1)
    decoder = Decoder(layer=decoder_layer, N=nb_of_decoder_layers)

    # Initialize input and memory
    x = torch.ones((batch_size, sequence_length, input_size))
    memory = torch.ones((batch_size, sequence_length, input_size))

    # Subsequent mask: hide all positions > i
    decoder_mask = subsequent_mask(sequence_length)

    # Forward pass with dummy input and memory (identical here)
    out = decoder.forward(x, memory, decoder_mask, None)

    # Unit tests
    self.assertIsInstance(out, torch.Tensor)
    self.assertEqual(out.shape, x.shape)
    self.assertEqual(out.shape, memory.shape)
    self.assertEqual(x.shape, memory.shape)
    self.assertEqual(torch.isnan(out).sum(), 0)
def forward(self, tgt_seq, tgt_pos, src_seq, enc_outputs):
    # In this snippet the masks follow the additive convention (a value of 1
    # marks a position to ignore), so the padding and subsequent masks are
    # unioned with gt(sum, 0).
    slf_attn_mask = torch.gt(
        padding_mask(tgt_seq, tgt_seq) + subsequent_mask(tgt_seq), 0)
    inter_attn_mask = padding_mask(tgt_seq, src_seq)
    embedded = self.emb(tgt_seq)
    embedded += self.pos_enc(tgt_pos)
    outputs, slf_attns, inter_attns = [], [], []
    output = embedded
    for layer, enc_output in zip(self.layers, enc_outputs):
        output, slf_attn, inter_attn = layer(
            output, enc_output,
            slf_attn_mask=slf_attn_mask,
            inter_attn_mask=inter_attn_mask)
        outputs += [output]
        slf_attns += [slf_attn]
        inter_attns += [inter_attn]
    return outputs, slf_attns, inter_attns
def __init__(self, flist, modules, consts, options):
    self.batch_size = len(flist)
    self.x = np.zeros((self.batch_size, consts["len_x"]), dtype=np.int64)
    self.x_ext = np.zeros((self.batch_size, consts["len_x"]), dtype=np.int64)
    self.px = np.zeros((self.batch_size, consts["len_x"]), dtype=np.int64)
    self.pxs = np.zeros((self.batch_size, consts["len_x"]), dtype=np.int64)
    self.y = np.zeros((self.batch_size, consts["len_y"]), dtype=np.int64)
    self.y_ext = np.zeros((self.batch_size, consts["len_y"]), dtype=np.int64)
    self.y_inp = np.zeros((self.batch_size, consts["len_y"]), dtype=np.int64)
    self.py = np.zeros((self.batch_size, consts["len_y"]), dtype=np.int64)
    self.pys = np.zeros((self.batch_size, consts["len_y"]), dtype=np.int64)
    self.x_mask = np.zeros((self.batch_size, 1, consts["len_x"]), dtype=np.int64)
    self.y_mask = np.zeros((self.batch_size, 1, consts["len_y"]), dtype=np.int64)
    self.y_mask_tri = np.zeros(
        (self.batch_size, consts["len_y"], consts["len_y"]), dtype=np.int64)
    self.len_x = []
    self.len_y = []
    self.original_contents = []
    self.original_summarys = []
    self.x_ext_words = []
    self.max_ext_len = 0

    w2i = modules["w2i"]
    i2w = modules["i2w"]
    dict_size = len(w2i)

    for idx_doc in range(len(flist)):
        if len(flist[idx_doc]) == 2:
            contents, summarys = flist[idx_doc]
        else:
            print("ERROR!")
            return
        content, original_content = contents
        summary, original_summary = summarys
        self.original_contents.append(original_content)
        self.original_summarys.append(original_summary)

        xi_oovs = []
        send_id = 1
        num_word = 0
        for idx_word in range(len(content)):
            # some sentences in DUC are longer than len_x
            if idx_word == consts["len_x"]:
                break
            w = content[idx_word]
            num_word += 1
            if idx_word > 0 and content[idx_word - 1] == "." and num_word >= 10:
                send_id += 1
                num_word = 1
            if w not in w2i:  # OOV
                if w not in xi_oovs:
                    xi_oovs.append(w)
                self.x_ext[idx_doc, idx_word] = dict_size + xi_oovs.index(w)  # 500005, 51000
                w = i2w[modules["lfw_emb"]]
            else:
                self.x_ext[idx_doc, idx_word] = w2i[w]
            self.x[idx_doc, idx_word] = w2i[w]
            self.x_mask[idx_doc, 0, idx_word] = 1
            self.px[idx_doc, idx_word] = idx_word + 1  # num_word
            self.pxs[idx_doc, idx_word] = send_id
        self.len_x.append(np.sum(self.x_mask[idx_doc, :, :]))
        self.x_ext_words.append(xi_oovs)
        if self.max_ext_len < len(xi_oovs):
            self.max_ext_len = len(xi_oovs)

        if options["has_y"]:
            send_id = 1
            num_word = 0
            for idx_word in range(len(summary)):
                w = summary[idx_word]
                num_word += 1
                if idx_word > 0 and summary[idx_word - 1] == "." and num_word >= 10:
                    send_id += 1
                    num_word = 1
                if w not in w2i:
                    if w in xi_oovs:
                        self.y_ext[idx_doc, idx_word] = dict_size + xi_oovs.index(w)
                    else:
                        self.y_ext[idx_doc, idx_word] = w2i[i2w[modules["lfw_emb"]]]  # unk
                    w = i2w[modules["lfw_emb"]]
                else:
                    self.y_ext[idx_doc, idx_word] = w2i[w]
                self.y[idx_doc, idx_word] = w2i[w]
                if (idx_word + 1) < len(summary):
                    self.y_inp[idx_doc, idx_word + 1] = w2i[w]  # teacher forcing
                self.py[idx_doc, idx_word] = idx_word  # num_word; 1st: 0
                self.pys[idx_doc, idx_word] = send_id
                if not options["is_predicting"]:
                    self.y_mask[idx_doc, 0, idx_word] = 1
            len_summ = len(summary)
            self.len_y.append(len_summ)
            self.y_mask_tri[idx_doc, :len_summ, :len_summ] = subsequent_mask(len_summ)
        else:
            self.y = self.y_mask = self.y_mask_tri = None

    max_len_x = int(np.max(self.len_x))
    self.x = self.x[:, 0:max_len_x]
    self.x_ext = self.x_ext[:, 0:max_len_x]
    self.x_mask = self.x_mask[:, :, 0:max_len_x]
    self.px = self.px[:, 0:max_len_x]
    self.pxs = self.pxs[:, 0:max_len_x]
    # Only truncate the target-side arrays when targets exist (the original
    # sliced self.y unconditionally, which fails when has_y is False).
    if options["has_y"]:
        max_len_y = int(np.max(self.len_y))
        self.y = self.y[:, 0:max_len_y]
        self.y_ext = self.y_ext[:, 0:max_len_y]
        self.y_inp = self.y_inp[:, 0:max_len_y]
        self.y_mask = self.y_mask[:, :, 0:max_len_y]
        self.y_mask_tri = self.y_mask_tri[:, 0:max_len_y, 0:max_len_y]
        self.py = self.py[:, 0:max_len_y]
        self.pys = self.pys[:, 0:max_len_y]
def greedy_decode(self, src: torch.Tensor, src_mask: torch.Tensor, trg_vocab,
                  start_symbol="<s>", stop_symbol="</s>",
                  max_length=100) -> str:
    """
    Returns the prediction for `src` using greedy decoding for simplicity:

    - Feed `src` (after embedding) into the Encoder to get the "memory",
    - Feed an initial tensor (filled with start_symbol) into the Decoder,
      together with the "memory" and the appropriate corresponding mask,
    - Get the predictions of the model, take the max to get the next token,
      concatenate it to the previous predictions, and iterate.

    :param src: sample for which to produce predictions.
    :param src_mask: associated `src` mask.
    :param trg_vocab: Vocabulary set of the target sentences.
    :type trg_vocab: torchtext.vocab.Vocab
    :param start_symbol: symbol used as initial value for the Decoder.
        Should correspond to start_token="<s>" in the dataset vocab.
    :param stop_symbol: symbol used to represent an end of sentence,
        e.g. "</s>" (in the dataset vocab).
    :param max_length: maximum sequence length of the prediction.
    """
    # 0. Ensure inference mode
    self.eval()
    # 1. Embed src
    embedded = self.src_embeddings(src.type(LongTensor))
    # 2. Encode embedded inputs
    memory = self.encoder(src=embedded, mask=src_mask)
    # 3. Create initial input for decoder
    decoder_in = torch.ones(
        src.shape[0], 1).type(FloatTensor) * trg_vocab.stoi[start_symbol]
    for i in range(max_length):
        # 4. Embed decoder_in
        decoder_in_embed = self.trg_embeddings(decoder_in.type(LongTensor))
        # 5. Go through decoder
        out = self.decoder(x=decoder_in_embed,
                           memory=memory,
                           self_mask=subsequent_mask(decoder_in.shape[1]),
                           memory_mask=src_mask)
        # 6. Classifier: TODO: Why only last word?
        logits = self.classifier(out[:, -1])
        # 7. Get predicted token for each sample in the batch
        _, next_token = logits.max(dim=1, keepdim=True)
        # 8. Concatenate predicted token with previous predictions
        decoder_in = torch.cat(
            [decoder_in, next_token.type(FloatTensor)], dim=1)
    # cast to int tensors
    decoder_in = decoder_in.type(IntTensor)
    # 9. Retrieve words from tokens in the target vocab
    translation = ""
    for i in range(decoder_in.shape[1]):
        sym = trg_vocab.itos[decoder_in[0, i]]
        translation += sym + " "
        # compare symbols, not indices (itos yields a string; the original
        # compared against trg_vocab.stoi[stop_symbol], an int, which never matches)
        if sym == stop_symbol:
            break
    # 10. Return prediction
    return translation
def make_std_mask(tgt, pad):
    """Create a mask to hide padding and future words."""
    tgt_mask = (tgt != pad).unsqueeze(-2)
    tgt_mask = tgt_mask & Variable(
        subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
    return tgt_mask
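# Quick illustration of make_std_mask, assuming pad index 0 and the
# subsequent_mask sketched earlier; the token ids are made up:
tgt = torch.tensor([[2, 7, 4, 0, 0]])   # one sequence, two trailing pads
mask = make_std_mask(tgt, pad=0)        # boolean mask of shape (1, 5, 5)
# row i is True only where j <= i and token j is not padding, e.g.
# mask[0, 2] == tensor([True, True, True, False, False])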