Example no. 1
0
    def predict(self, batch, unsort=True):
        """Run NER inference on a batch and return per-sentence tag sequences.

        Args:
            batch: a packed batch as produced by the data loader.
            unsort: if True, restore the original (pre-sorting) sentence order.

        Returns:
            A list of tag-string lists, one list per sentence.
        """
        inputs, orig_idx, word_orig_idx, char_orig_idx, sentlens, wordlens, charlens, charoffsets = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, chars, tags = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, logits, trans = self.model(word, word_mask, wordchars,
                                      wordchars_mask, tags, word_orig_idx,
                                      sentlens, wordlens, chars, charoffsets,
                                      charlens, char_orig_idx)

        # Viterbi decoding happens on CPU over numpy arrays.
        transition_scores = trans.data.cpu().numpy()
        emission_scores = logits.data.cpu().numpy()
        tag_seqs = []
        for sent_idx in range(logits.size(0)):
            best_path, _ = viterbi_decode(
                emission_scores[sent_idx, :sentlens[sent_idx]],
                transition_scores)
            # uppercased tags, dirty hack to have unified NER tags, to be
            # removed once training datasets are corrected
            tag_seqs.append(
                [t.upper() for t in self.vocab['tag'].unmap(best_path)])

        if unsort:
            tag_seqs = utils.unsort(tag_seqs, orig_idx)
        return tag_seqs
Example no. 2
0
    def predict(self, batch, unsort=True):
        """Predict dependency heads and relation labels for a batch.

        Args:
            batch: a packed batch as produced by the data loader.
            unsort: if True, restore the original (pre-sorting) sentence order.

        Returns:
            For each sentence, a list of [head_index_str, deprel_str] pairs,
            one per word (the artificial root is excluded).
        """
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, lemma, head, deprel,
                              word_orig_idx, sentlens, wordlens)

        # MST decoding on each adjacency matrix; index 0 is the artificial
        # root, so its attachment is dropped with [1:].
        head_seqs = []
        for adj, slen in zip(preds[0], sentlens):
            head_seqs.append(chuliu_edmonds_one_root(adj[:slen, :slen])[1:])

        # Look up the relation score row for each word's chosen head and
        # unmap the label ids to strings.
        deprel_seqs = []
        for sent_idx, heads in enumerate(head_seqs):
            label_ids = [
                preds[1][sent_idx][word_idx + 1][h]
                for word_idx, h in enumerate(heads)
            ]
            deprel_seqs.append(self.vocab['deprel'].unmap(label_ids))

        pred_tokens = [[[str(head_seqs[i][j]), deprel_seqs[i][j]]
                        for j in range(sentlens[i] - 1)]
                       for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
Example no. 3
0
    def predict(self, batch, unsort=True):
        """Predict UPOS, XPOS, and morphological features for a batch.

        Args:
            batch: a packed batch as produced by the data loader.
            unsort: if True, restore the original (pre-sorting) sentence order.

        Returns:
            For each sentence, a list of [upos, xpos, feats] string triples,
            one per word.
        """
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, word_orig_idx,
                              sentlens, wordlens)

        # Unmap each prediction head (upos / xpos / feats) through its vocab.
        decoded = {}
        for field, pred in zip(('upos', 'xpos', 'feats'), preds):
            decoded[field] = [
                self.vocab[field].unmap(sent) for sent in pred.tolist()
            ]

        pred_tokens = [[[
            decoded['upos'][i][j], decoded['xpos'][i][j],
            decoded['feats'][i][j]
        ] for j in range(sentlens[i])] for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
Example no. 4
0
 def process(self, doc):
     """Parse *doc* and write the predicted heads/deprels back into its conll.

     Batches the document for evaluation, predicts each batch, restores the
     original sentence order, then stores the flattened predictions.
     """
     loader = DataLoader(
         doc, self.config['batch_size'], self.config, self.pretrain, vocab=self.vocab, evaluation=True,
         sort_during_eval=True)
     preds = []
     for b in loader:
         preds += self.trainer.predict(b)
     # undo the length-based sorting applied during evaluation
     preds = unsort(preds, loader.data_orig_idx)
     loader.conll.set(['head', 'deprel'], [tok for sent in preds for tok in sent])
Example no. 5
0
    def predict(self, batch, beam_size=1):
        """Decode lemmas (and, optionally, edit classes) for a batch.

        Args:
            batch: a packed batch as produced by the data loader.
            beam_size: beam width for the seq2seq decoder.

        Returns:
            (pred_tokens, edits): decoded lemma strings in original order, and
            the argmax edit-class ids (or None when edit prediction is off).
        """
        inputs, orig_idx = unpack_batch(batch, self.use_cuda)
        src, src_mask, tgt, tgt_mask, pos, edits = inputs

        self.model.eval()
        batch_size = src.size(0)
        preds, edit_logits = self.model.predict(src, src_mask, pos=pos, beam_size=beam_size)
        # unmap char ids, strip special symbols, then join chars into tokens
        char_seqs = [self.vocab['char'].unmap(ids) for ids in preds]
        char_seqs = utils.prune_decoded_seqs(char_seqs)
        pred_tokens = utils.unsort(["".join(seq) for seq in char_seqs], orig_idx)
        if not self.args.get('edit', False):
            edits = None
        else:
            assert edit_logits is not None
            edit_ids = np.argmax(edit_logits.data.cpu().numpy(),
                                 axis=1).reshape([batch_size]).tolist()
            edits = utils.unsort(edit_ids, orig_idx)
        return pred_tokens, edits
Example no. 6
0
 def get_representation(self, chars, charoffsets, charlens, char_orig_idx):
     """Return char-LM representations gathered at word offsets.

     Runs the forward pass without gradients, selects the hidden states at
     each sentence's character offsets, restores original sentence order,
     and packs (optionally pads) the result.
     """
     with torch.no_grad():
         hidden, _, _ = self.forward(chars, charlens)
         # pick the hidden state at each word-boundary offset, per sentence
         gathered = [hidden[i, offsets] for i, offsets in enumerate(charoffsets)]
         rep = pack_sequence(unsort(gathered, char_orig_idx))
         if self.pad:
             rep = pad_packed_sequence(rep, batch_first=True)[0]
     return rep
Example no. 7
0
    def predict(self, batch, unsort=True):
        """Beam-decode a batch into token strings.

        Args:
            batch: a packed batch as produced by the data loader.
            unsort: if True, restore the original (pre-sorting) order.

        Returns:
            A list of decoded token strings.
        """
        inputs, orig_idx = unpack_batch(batch, self.use_cuda)
        src, src_mask, tgt, tgt_mask = inputs

        self.model.eval()
        batch_size = src.size(0)
        preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
        # map char ids back to symbols, drop special tokens, join into tokens
        decoded = [self.vocab.unmap(ids) for ids in preds]
        decoded = utils.prune_decoded_seqs(decoded)
        pred_tokens = ["".join(chars) for chars in decoded]
        return utils.unsort(pred_tokens, orig_idx) if unsort else pred_tokens
Example no. 8
0
 def process(self, doc):
     """Parse *doc*, printing per-batch eval loss, and store heads/deprels.

     NOTE(review): each batch is also run through trainer.update(eval=True)
     and its loss printed — this looks like debugging instrumentation left in
     the inference path; confirm before relying on it.
     """
     loader = DataLoader(doc,
                         self.config['batch_size'],
                         self.config,
                         self.pretrain,
                         vocab=self.vocab,
                         evaluation=True,
                         sort_during_eval=True)
     preds = []
     print("Running through {} batches".format(len(loader)))
     for b in loader:
         loss = self.trainer.update(b, eval=True)
         print("In processor, loss = {}".format(loss))
         preds += self.trainer.predict(b)
     # undo the length-based sorting applied during evaluation
     preds = unsort(preds, loader.data_orig_idx)
     loader.conll.set(['head', 'deprel'], [tok for sent in preds for tok in sent])
Example no. 9
0
    def predict(self, batch, unsort=True):
        """Predict k-best head/deprel sequences for a batch.

        Currently a stub: the single MST decode is duplicated ``kbest`` times
        and only candidate ``k`` is emitted (see TODOs below).

        Args:
            batch: a packed batch as produced by the data loader.
            unsort: if True, restore the original (pre-sorting) order.

        Returns:
            For each sentence, a list of [head_index_str, deprel_str] pairs
            taken from the k-th candidate parse.
        """
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs

        #TODO: how to pass them with config
        kbest = 2
        k = 0

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, lemma, head, deprel,
                              word_orig_idx, sentlens, wordlens)

        head_seqs = []
        deprel_seqs = []
        for sent_idx, (adj, slen) in enumerate(zip(preds[0], sentlens)):
            # remove attachment for the root
            mst = chuliu_edmonds_one_root(adj[:slen, :slen])[1:]
            #TODO: use model with kbest
            candidates = [mst] * kbest
            sent_heads = []
            sent_deprels = []
            for cand in candidates:
                sent_heads.append(cand)
                label_ids = [
                    preds[1][sent_idx][word_idx + 1][parent_idx]
                    for word_idx, parent_idx in enumerate(cand)
                ]
                sent_deprels.append(self.vocab['deprel'].unmap(label_ids))
            head_seqs.append(sent_heads)
            deprel_seqs.append(sent_deprels)

        pred_tokens = [[[str(head_seqs[i][k][j]), deprel_seqs[i][k][j]]
                        for j in range(sentlens[i] - 1)]
                       for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens