def validate(model, epoch, val_iterator, src_vocab, tgt_vocab, args, writer):
    model.eval()

    losses = 0
    correct_words = 0
    total_words = 0

    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(val_iterator),
                                     total=len(val_iterator)):
            device = args.device
            src = batch.src.transpose(0, 1).to(device)
            tgt = batch.tgt.transpose(0, 1).to(device)
            src_mask = padding_mask(src, src_vocab)
            tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
                tgt[:, :-1]).to(device)

            out = model(src, tgt[:, :-1], src_mask, tgt_mask)
            labels = tgt[:, 1:].contiguous().view(-1)
            loss, n_correct = cal_performance(out, labels, tgt_vocab)

            losses += loss.item()
            total_words += tgt[:, 1:].ne(
                tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
            correct_words += n_correct

    print('(Validation) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %'.format(
        ppl=math.exp(losses / total_words),
        accu=100 * correct_words / total_words))
    writer.add_scalar('val_loss', losses / total_words, epoch)

def train(model, epoch, train_iterator, optimizer, src_vocab, tgt_vocab, args,
          writer):
    model.train()

    losses = 0
    correct_words = 0
    total_words = 0

    for batch_idx, batch in tqdm(enumerate(train_iterator),
                                 total=len(train_iterator)):
        device = args.device
        src = batch.src.transpose(0, 1).to(device)
        tgt = batch.tgt.transpose(0, 1).to(device)
        src_mask = padding_mask(src, src_vocab)
        tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
            tgt[:, :-1]).to(device)

        out = model(src, tgt[:, :-1], src_mask, tgt_mask)
        optimizer.zero_grad()

        labels = tgt[:, 1:].contiguous().view(-1)
        loss, n_correct = cal_performance(out, labels, tgt_vocab)
        loss.backward()
        optimizer.step()

        losses += loss.item()
        total_words += tgt[:, 1:].ne(
            tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
        correct_words += n_correct

    print('(Training) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f} %'.format(
        ppl=math.exp(losses / total_words),
        accu=100 * correct_words / total_words))
    writer.add_scalar('train_loss', losses / total_words, epoch)
    return correct_words / total_words
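
The padding_mask and subsequent_mask helpers used by these training and validation loops are not shown in this example. Below is a minimal sketch of what they might look like for this call pattern (both built from a (batch, seq_len) tensor and a torchtext-style vocab, then combined with &). The names, the default pad token, and the signatures are assumptions, and some later examples pass an integer length to subsequent_mask instead, so the real helpers may differ.

import torch

# Sketch only: assumed signatures matching the calls in train()/validate() above.
# The originals look the pad token up via vocab.stoi[CONSTANTS['pad']].
def padding_mask(seq, vocab, pad_token='<pad>'):
    """(batch, seq_len) -> (batch, 1, seq_len) boolean mask, True for non-pad tokens."""
    return (seq != vocab.stoi[pad_token]).unsqueeze(-2)

def subsequent_mask(seq):
    """(batch, seq_len) -> (1, seq_len, seq_len) mask blocking attention to future positions."""
    size = seq.size(1)
    return torch.tril(torch.ones(1, size, size)).bool()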
Example #3
def test(model, test_iterator, src_vocab, tgt_vocab, args, writer):
    model.eval()

    losses = 0
    correct_words = 0
    total_words = 0

    references = []
    hypotheses = []
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(test_iterator),
                                     total=len(test_iterator)):
            device = args.device
            src = batch.src.transpose(0, 1).to(device)
            tgt = batch.tgt.transpose(0, 1).to(device)
            src_mask = padding_mask(src, src_vocab)
            tgt_mask = padding_mask(tgt[:, :-1], tgt_vocab) & subsequent_mask(
                tgt[:, :-1]).to(device)

            out = model(src, tgt[:, :-1], src_mask, tgt_mask)
            labels = tgt[:, 1:].contiguous().view(-1)
            loss, n_correct = cal_performance(out, labels, tgt_vocab)

            losses += loss.item()
            total_words += tgt[:, 1:].ne(
                tgt_vocab.stoi[CONSTANTS['pad']]).sum().item()
            correct_words += n_correct

            # Collect target token indices (excluding start/pad) as BLEU references
            for idxs in tgt.tolist():
                references.append([[
                    idx for idx in idxs
                    if idx != tgt_vocab.stoi[CONSTANTS['start']]
                    and idx != tgt_vocab.stoi[CONSTANTS['pad']]
                ]])

            # Collect predicted token indices (excluding start/pad) as hypotheses
            word_idxs = torch.max(out, dim=-1)[1]
            for idxs in word_idxs.tolist():
                hypotheses.append([
                    idx for idx in idxs
                    if idx != tgt_vocab.stoi[CONSTANTS['start']]
                    and idx != tgt_vocab.stoi[CONSTANTS['pad']]
                ])

    bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    bleu_2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    print(
        '(Test) ppl: {ppl: 8.5f}, accuracy: {accu:3.3f}%, BLEU-1: {bleu1:3.3f}, BLEU-2: {bleu2:3.3f}'
        .format(ppl=math.exp(losses / total_words),
                accu=100 * correct_words / total_words,
                bleu1=bleu_1,
                bleu2=bleu_2))
Example #4
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len - 1):
        out = model.decode(
            memory, src_mask, Variable(ys),
            Variable(subsequent_mask(ys.size(1)).type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat(
            [ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys
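
A hypothetical usage sketch for greedy_decode, assuming a trained Annotated-Transformer-style model object with encode/decode/generator methods and that token index 1 is the start symbol (both assumptions, not part of the original example):

import torch

# model: trained transformer exposing encode/decode/generator (assumed to exist)
src = torch.LongTensor([[1, 5, 8, 4, 2]])   # (1, src_len) toy source batch
src_mask = torch.ones(1, 1, src.size(1))    # no padding in this toy example
ys = greedy_decode(model, src, src_mask, max_len=10, start_symbol=1)
# ys is a (1, 10) tensor of generated token indices, beginning with the start symbol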
Example #5
    def __init__(self, num_chars, num_words, num_classes, bos_token, **kwargs):
        super().__init__()
        self.transformer_size = kwargs.pop('transformer_size', 512)
        self.bos_token = bos_token

        with self.init_scope():
            self.feature_extractor = ResNet(kwargs.pop('num_layers', 18))
            self.transformer = get_conv_feature_encoder_decoder(
                num_classes, N=1, model_size=self.transformer_size)
            self.classifier = L.Linear(self.transformer_size, num_classes)
            self.mask = subsequent_mask(self.transformer_size)

        self.num_chars = num_chars
        self.num_words = num_words
        chainer.global_config.user_text_recognition_grayscale_input = False
Example #6
    def predict_word(self, dec_seq, src_seq, enc_output, n_active_inst):
        src_mask = padding_mask(src_seq, self.src_vocab)
        tgt_mask = padding_mask(
            dec_seq, self.tgt_vocab) & subsequent_mask(dec_seq).to(self.device)

        dec_seq = self.model.tgt_embedding(dec_seq) * math.sqrt(
            self.model.d_model)
        dec_seq = self.model.positional_encoder2(dec_seq)

        dec_output = self.model.decoder(dec_seq, enc_output, src_mask,
                                        tgt_mask)
        dec_output = dec_output[:, -1, :]
        word_prob = F.log_softmax(self.model.linear(dec_output), dim=1)
        word_prob = word_prob.view(n_active_inst, self.beam_size, -1)

        return word_prob
Example #7
    def __init__(self, vocab_size, max_len, start_symbol, transformer_size=512):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.start_symbol = start_symbol
        self.transformer_size = transformer_size

        with self.init_scope():
            model = get_encoder_decoder(
                vocab_size,
                vocab_size,
                N=2,
                model_size=transformer_size,
            )
            self.model = model
            self.mask = subsequent_mask(self.transformer_size)
            self.classifier = L.Linear(transformer_size, vocab_size)
Example #8
    def __init__(self, *args, **kwargs):
        self.transformer_size = kwargs.pop('transformer_size', 512)

        super().__init__(*args, **kwargs)

        with self.init_scope():
            positional_encoding, decoder = build_transform_param_decoder(N=1, model_size=self.transformer_size)
            self.positional_encoding = positional_encoding
            self.decoder = decoder
            self.param_predictor = L.Linear(self.transformer_size, 6)

            params = self.param_predictor.b.data
            # params[...] = 0
            # params[[0, 4]] = 0.8
            # self.param_predictor.W.data[...] = 0

            self.param_embedder = L.Linear(6, self.transformer_size)
            self.mask = subsequent_mask(self.transformer_size)
Example #9
    def make_std_mask(target: Tensor, pad) -> Tensor:
        """
        Create a mask for `target` hiding both the padding (specified by `pad`) and the subsequent words
        (prevent token at position i to attend to positions > i).

        :param target: Tensor to create a mask for.

        :param pad: token corresponding to padding elements.

        :return: Mask hiding both padding and subsequent elements in target.
        """
        # hide padding
        target_mask = (target != pad).unsqueeze(-2)

        # hide padding and future words
        target_mask = (target_mask & subsequent_mask(target.shape[-1]).type_as(target_mask.data))

        return target_mask
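
An illustrative sketch of what make_std_mask produces for a toy target batch, assuming pad index 0 and an Annotated-Transformer-style subsequent_mask(size) helper that returns a (1, size, size) lower-triangular mask:

import torch

target = torch.tensor([[4, 7, 9, 0, 0]])   # one sequence with two trailing pad tokens
mask = make_std_mask(target, pad=0)        # boolean mask of shape (1, 5, 5)
# mask[0, i, j] is True only when j <= i and target[0, j] is not padding, so each
# position attends to earlier, non-padded positions only.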
Example #10
    def test_forward(self):
        # Parameters
        batch_size = 64
        sequence_length = 10
        d_k = d_v = d_model = input_size = 512
        d_ff = 2048
        nb_of_decoder_layers = 6

        # Initialize decoder
        decoder_layer = DecoderLayer(
            size=input_size,
            self_attn=MultiHeadAttention(n_head=8,
                                         d_model=d_model,
                                         d_k=d_k,
                                         d_v=d_v,
                                         dropout=0.1),
            memory_attn=MultiHeadAttention(n_head=8,
                                           d_model=d_model,
                                           d_k=d_k,
                                           d_v=d_v,
                                           dropout=0.1),
            feed_forward=PositionwiseFeedForward(d_model=d_model,
                                                 d_ff=d_ff,
                                                 dropout=0.1),
            dropout=0.1)

        decoder = Decoder(layer=decoder_layer, N=nb_of_decoder_layers)

        # Initialize input and memory
        x = torch.ones((batch_size, sequence_length, input_size))
        memory = torch.ones((batch_size, sequence_length, input_size))

        # Subsequent mask: prevent each position i from attending to positions j > i
        decoder_mask = subsequent_mask(sequence_length)

        # Forward pass with dummy input and memory (identical tensors here)
        out = decoder.forward(x, memory, decoder_mask, None)

        # Unit Tests
        self.assertIsInstance(out, torch.Tensor)
        self.assertEqual(out.shape, x.shape)
        self.assertEqual(out.shape, memory.shape)
        self.assertEqual(x.shape, memory.shape)
        self.assertEqual(torch.isnan(out).sum(), 0)
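
As a quick sanity check outside the unit test, the subsequent mask used above should be lower triangular, assuming the Annotated-Transformer-style subsequent_mask(size) that returns a (1, size, size) mask:

import torch

mask = subsequent_mask(4)
reference = torch.tril(torch.ones(1, 4, 4)).to(mask.dtype)
assert torch.equal(mask, reference)  # position i may only attend to positions j <= i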
Example #11
    def forward(self, tgt_seq, tgt_pos, src_seq, enc_outputs):
        slf_attn_mask = torch.gt(
            padding_mask(tgt_seq, tgt_seq) + subsequent_mask(tgt_seq), 0)
        inter_attn_mask = padding_mask(tgt_seq, src_seq)

        embedded = self.emb(tgt_seq)
        embedded += self.pos_enc(tgt_pos)

        outputs, slf_attns, inter_attns = [], [], []
        output = embedded
        for layer, enc_output in zip(self.layers, enc_outputs):
            output, slf_attn, inter_attn = layer(
                output,
                enc_output,
                slf_attn_mask=slf_attn_mask,
                inter_attn_mask=inter_attn_mask)
            outputs += [output]
            slf_attns += [slf_attn]
            inter_attns += [inter_attn]

        return outputs, slf_attns, inter_attns
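
Note that this decoder uses the opposite masking convention from the boolean keep-masks in the earlier examples: here padding_mask and subsequent_mask appear to flag positions to exclude, and torch.gt(..., 0) keeps a position masked if either flags it. A toy illustration of that merge (the values below are made up):

import torch

pad_part = torch.tensor([[0, 0, 1]])      # toy row: last key position is padding
future_part = torch.tensor([[0, 1, 1]])   # toy row: positions after the query are future
combined = torch.gt(pad_part + future_part, 0)
# tensor([[False, True, True]]) -> only the first key position remains visible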
Example #12
    def __init__(self, flist, modules, consts, options):
        self.batch_size = len(flist) 
        self.x = np.zeros((self.batch_size, consts["len_x"]), dtype = np.int64)
        self.x_ext = np.zeros((self.batch_size, consts["len_x"]), dtype = np.int64)
        self.px = np.zeros((self.batch_size, consts["len_x"]), dtype = np.int64)
        self.pxs = np.zeros((self.batch_size, consts["len_x"]), dtype = np.int64)
        self.y = np.zeros((self.batch_size, consts["len_y"]), dtype = np.int64)
        self.y_ext = np.zeros((self.batch_size, consts["len_y"]), dtype = np.int64)
        self.y_inp = np.zeros((self.batch_size, consts["len_y"]), dtype = np.int64)
        self.py = np.zeros((self.batch_size, consts["len_y"]), dtype = np.int64)
        self.pys = np.zeros((self.batch_size, consts["len_y"]), dtype = np.int64)
        self.x_mask = np.zeros((self.batch_size, 1, consts["len_x"]), dtype = np.int64)
        self.y_mask = np.zeros((self.batch_size, 1, consts["len_y"]), dtype = np.int64)
        self.y_mask_tri = np.zeros((self.batch_size, consts["len_y"], consts["len_y"]), dtype = np.int64)
        self.len_x = []
        self.len_y = []
        self.original_contents = []
        self.original_summarys = []
        self.x_ext_words = []
        self.max_ext_len = 0

        w2i = modules["w2i"]
        i2w = modules["i2w"]
        dict_size = len(w2i)

        for idx_doc in range(len(flist)):
            if len(flist[idx_doc]) == 2:
                contents, summarys = flist[idx_doc]
            else:
                print "ERROR!"
                return
            
            content, original_content = contents
            summary, original_summary = summarys
            self.original_contents.append(original_content)
            self.original_summarys.append(original_summary)
            xi_oovs = []

            send_id = 1
            num_word = 0
            for idx_word in range(len(content)):
                    # some sentences in duc is longer than len_x
                    if idx_word == consts["len_x"]:
                        break
                    w = content[idx_word]
                    
                    num_word += 1
                    if idx_word > 0 and content[idx_word - 1] == "." and num_word >= 10:
                        send_id += 1
                        num_word = 1
            
                    if w not in w2i: # OOV
                        if w not in xi_oovs:
                            xi_oovs.append(w)
                        self.x_ext[idx_doc, idx_word] = dict_size + xi_oovs.index(w) # 500005, 51000
                        w = i2w[modules["lfw_emb"]]
                    else:
                        self.x_ext[idx_doc, idx_word] = w2i[w]
                    
                    self.x[idx_doc, idx_word] = w2i[w]
                    self.x_mask[idx_doc, 0, idx_word] = 1
                    self.px[idx_doc, idx_word] = idx_word + 1#num_word
                    self.pxs[idx_doc, idx_word] = send_id

            self.len_x.append(np.sum(self.x_mask[idx_doc, :, :]))
            self.x_ext_words.append(xi_oovs)
            if self.max_ext_len < len(xi_oovs):
                self.max_ext_len = len(xi_oovs)

            if options["has_y"]:
                send_id = 1 
                num_word = 0  
                for idx_word in range(len(summary)):
                    w = summary[idx_word]
                    
                    num_word += 1
                    if idx_word > 0 and summary[idx_word - 1] == "." and num_word >= 10:
                        send_id += 1
                        num_word = 1

                    if w not in w2i:
                        if w in xi_oovs:
                            self.y_ext[idx_doc, idx_word] = dict_size + xi_oovs.index(w)
                        else:
                            self.y_ext[idx_doc, idx_word] = w2i[i2w[modules["lfw_emb"]]] # unk
                        w = i2w[modules["lfw_emb"]] 
                    else:
                        self.y_ext[idx_doc, idx_word] =  w2i[w]
                    self.y[idx_doc, idx_word] = w2i[w]
                    if (idx_word + 1) < len(summary):
                        self.y_inp[idx_doc, idx_word + 1] = w2i[w] # teacher forcing
                    self.py[idx_doc, idx_word] = idx_word #num_word # 1st:0 
                    self.pys[idx_doc, idx_word] = send_id

                    if not options["is_predicting"]:
                        self.y_mask[idx_doc, 0, idx_word] = 1
                len_summ = len(summary)
                self.len_y.append(len_summ)
                self.y_mask_tri[idx_doc,:len_summ, :len_summ] = subsequent_mask(len_summ)
            else:
                self.y = self.y_mask = self.y_mask_tri = None

        max_len_x = int(np.max(self.len_x))
        max_len_y = int(np.max(self.len_y))
        
        self.x = self.x[:, 0:max_len_x]
        self.x_ext = self.x_ext[:, 0:max_len_x]
        self.x_mask = self.x_mask[:, :, 0:max_len_x]
        self.px = self.px[:, 0:max_len_x]
        self.pxs = self.pxs[:, 0:max_len_x]
        self.y = self.y[:, 0:max_len_y]
        self.y_ext = self.y_ext[:, 0:max_len_y]
        self.y_inp = self.y_inp[:, 0:max_len_y]
        self.y_mask = self.y_mask[:, :, 0:max_len_y]
        self.y_mask_tri = self.y_mask_tri[:, 0:max_len_y, 0:max_len_y]
        self.py = self.py[:, 0:max_len_y]
        self.pys = self.pys[:, 0:max_len_y]
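
For reference, the block written into y_mask_tri for each document is a causal (lower-triangular) mask over the summary tokens. A tiny sketch of the assumed layout, taking subsequent_mask(n) to yield an n-by-n 0/1 lower-triangular array:

import numpy as np

len_summ = 3
block = np.tril(np.ones((len_summ, len_summ), dtype=np.int64))
# array([[1, 0, 0],
#        [1, 1, 0],
#        [1, 1, 1]])  -> token t of the summary may attend to tokens 0..t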
Example #13
    def greedy_decode(self,
                      src: torch.Tensor,
                      src_mask: torch.Tensor,
                      trg_vocab,
                      start_symbol="<s>",
                      stop_symbol="</s>",
                      max_length=100) -> str:
        """
        Returns the prediction for `src` using greedy decoding for simplicity:

            - Feed `src` (after embedding) in the Encoder to get the "memory",
            - Feed an initial tensor (filled with start_symbol) in the Decoder, with the "memory" and the appropriate corresponding mask
            - Get the predictions of the model, makes a max to get the next token, cat it to the previous prediction and iterate


        :param src: sample for which to produce predictions.

        :param src_mask: Associated `src` mask

        :param trg_vocab: Vocabulary set of the target sentences.
        :type trg_vocab: torchtext.vocab.Vocab

        :param start_symbol: Symbol used as initial value for the Decoder. Should correspond to start_token="<s>" in the dataset vocab).

        :param stop_symbol: Symbol used to represent an end of sentence, e.g. "</s>" (in the dataset vocab).

        :param max_length: Maximum sequence length of the prediction.

        """
        # 0. Ensure inference mode
        self.eval()

        # 1. Embed src
        embedded = self.src_embeddings(src.type(LongTensor))

        # 2. Encode embedded inputs
        memory = self.encoder(src=embedded, mask=src_mask)

        # 3. Create initial input for decoder
        decoder_in = torch.ones(
            src.shape[0], 1).type(FloatTensor) * trg_vocab.stoi[start_symbol]

        for i in range(max_length):

            # 4. Embed decoder_in
            decoder_in_embed = self.trg_embeddings(decoder_in.type(LongTensor))

            # 5. Go through decoder
            out = self.decoder(x=decoder_in_embed,
                               memory=memory,
                               self_mask=subsequent_mask(decoder_in.shape[1]),
                               memory_mask=src_mask)

            # 6. Classifier on the last position only: its logits are all that is
            #    needed to pick the next token
            logits = self.classifier(out[:, -1])

            # 7. Get predicted token for each sample in the batch
            _, next_token = logits.max(dim=1, keepdim=True)

            # 8. Concatenate predicted token with previous predictions
            decoder_in = torch.cat(
                [decoder_in, next_token.type(FloatTensor)], dim=1)

        # cast to int tensors
        decoder_in = decoder_in.type(IntTensor)
        # 9. retrieve words from tokens in the target vocab
        translation = ""
        for i in range(decoder_in.shape[1]):
            sym = trg_vocab.itos[decoder_in[0, i]]
            translation += sym + " "

            if sym == stop_symbol:
                break

        # 10. return prediction
        return translation
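
A hypothetical usage sketch for this method, assuming a trained instance named transformer and a torchtext vocabulary trg_vocab (both assumptions, not defined in the example):

import torch

src = torch.LongTensor([[2, 14, 53, 7, 3]])                 # (1, src_len) toy source
src_mask = torch.ones(1, 1, src.size(1), dtype=torch.bool)  # no padding here
translation = transformer.greedy_decode(
    src, src_mask, trg_vocab, start_symbol="<s>", stop_symbol="</s>", max_length=20)
# translation is a space-joined string of target-vocabulary tokens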
Example #14
    def make_std_mask(tgt, pad):
        """ Create a mask to hide padding and future words. """
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask