def predict(self, batch, unsort=True):
    """Run NER inference on a batch and return per-sentence tag sequences.

    Args:
        batch: one batch from the NER DataLoader.
        unsort: if True, restore the original (pre-length-sorting) order.

    Returns:
        A list (one entry per sentence) of lists of tag strings.
    """
    inputs, orig_idx, word_orig_idx, char_orig_idx, sentlens, wordlens, charlens, charoffsets = unpack_batch(
        batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, chars, tags = inputs
    self.model.eval()
    _, logits, trans = self.model(word, word_mask, wordchars, wordchars_mask, tags,
                                  word_orig_idx, sentlens, wordlens, chars,
                                  charoffsets, charlens, char_orig_idx)
    # Viterbi-decode each sentence on CPU from the emission scores and the
    # learned transition matrix.
    trans = trans.data.cpu().numpy()
    scores = logits.data.cpu().numpy()
    batch_size = logits.size(0)
    tag_seqs = []
    for i in range(batch_size):
        # Decode only the valid (unpadded) positions of sentence i.
        # NOTE: loop variable renamed from `tags` to `tag_ids` so it no longer
        # shadows the `tags` tensor unpacked from `inputs` above.
        tag_ids, _ = viterbi_decode(scores[i, :sentlens[i]], trans)
        # Uppercased tags: dirty hack to have unified NER tags, to be removed
        # once training datasets are corrected.
        tag_seqs.append([t.upper() for t in self.vocab['tag'].unmap(tag_ids)])
    if unsort:
        tag_seqs = utils.unsort(tag_seqs, orig_idx)
    return tag_seqs
def predict(self, batch, unsort=True):
    """Predict dependency heads and relations for one batch.

    Returns, for each sentence, a list of [head, deprel] pairs (heads as
    strings); sentence order is restored to the original when `unsort` is True.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
        batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs
    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos,
                          ufeats, pretrained, lemma, head, deprel, word_orig_idx,
                          sentlens, wordlens)

    head_seqs = []
    deprel_seqs = []
    for sent_idx, (adj, length) in enumerate(zip(preds[0], sentlens)):
        # Decode a single-rooted tree over the valid positions, then drop the
        # attachment for the artificial root at index 0.
        heads = chuliu_edmonds_one_root(adj[:length, :length])[1:]
        head_seqs.append(heads)
        # Relation label scored for each (dependent, chosen head) pair;
        # dep + 1 skips the root row.
        rel_ids = [preds[1][sent_idx][dep + 1][h] for dep, h in enumerate(heads)]
        deprel_seqs.append(self.vocab['deprel'].unmap(rel_ids))

    pred_tokens = []
    for i in range(batch_size):
        sent = []
        for j in range(sentlens[i] - 1):
            sent.append([str(head_seqs[i][j]), deprel_seqs[i][j]])
        pred_tokens.append(sent)

    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def predict(self, batch, unsort=True):
    """Predict UPOS, XPOS and morphological features for one batch.

    Returns, for each sentence, a list of [upos, xpos, feats] triples; the
    original sentence order is restored when `unsort` is True.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
        batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs
    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos,
                          ufeats, pretrained, word_orig_idx, sentlens, wordlens)
    # preds holds index tensors for upos / xpos / feats, in that order; unmap
    # each one back to label strings through the matching vocab.
    unmapped = {}
    for name, pred in zip(('upos', 'xpos', 'feats'), preds):
        unmapped[name] = [self.vocab[name].unmap(sent) for sent in pred.tolist()]

    pred_tokens = []
    for i in range(batch_size):
        # Keep only the valid (unpadded) positions of sentence i.
        pred_tokens.append([[unmapped['upos'][i][j],
                             unmapped['xpos'][i][j],
                             unmapped['feats'][i][j]]
                            for j in range(sentlens[i])])
    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def process(self, doc):
    """Parse `doc`: run the trainer over evaluation batches and write the
    predicted heads and dependency relations back into its CoNLL data."""
    batch = DataLoader(doc, self.config['batch_size'], self.config,
                       self.pretrain, vocab=self.vocab, evaluation=True,
                       sort_during_eval=True)
    preds = []
    for b in batch:
        preds.extend(self.trainer.predict(b))
    # Batches are length-sorted for efficiency; restore document order.
    preds = unsort(preds, batch.data_orig_idx)
    flattened = [token for sent in preds for token in sent]
    batch.conll.set(['head', 'deprel'], flattened)
def predict(self, batch, beam_size=1):
    """Decode output tokens (and optionally edit classes) for one batch.

    Returns (pred_tokens, edits); `edits` is None unless the 'edit'
    classification option is enabled in `self.args`.
    """
    inputs, orig_idx = unpack_batch(batch, self.use_cuda)
    src, src_mask, tgt, tgt_mask, pos, edits = inputs
    self.model.eval()
    batch_size = src.size(0)
    preds, edit_logits = self.model.predict(src, src_mask, pos=pos,
                                            beam_size=beam_size)
    # Map predicted char ids back to characters, prune EOS/padding, then glue
    # each character sequence into a single token string.
    char_seqs = utils.prune_decoded_seqs(
        [self.vocab['char'].unmap(ids) for ids in preds])
    pred_tokens = utils.unsort(["".join(chars) for chars in char_seqs], orig_idx)

    if not self.args.get('edit', False):
        edits = None
    else:
        assert edit_logits is not None
        # Argmax over edit-class logits, one class id per input.
        edit_ids = np.argmax(edit_logits.data.cpu().numpy(), axis=1)
        edits = utils.unsort(edit_ids.reshape([batch_size]).tolist(), orig_idx)
    return pred_tokens, edits
def get_representation(self, chars, charoffsets, charlens, char_orig_idx):
    """Return character-model hidden states gathered at the given offsets.

    The result is a PackedSequence, or a padded batch-first tensor when
    ``self.pad`` is set. Runs under ``torch.no_grad()`` — inference only.
    """
    with torch.no_grad():
        hidden, _, _ = self.forward(chars, charlens)
        # For each sequence, pick out the hidden states at its offsets, then
        # restore the original (pre-sorting) sequence order.
        gathered = []
        for i, offsets in enumerate(charoffsets):
            gathered.append(hidden[i, offsets])
        result = pack_sequence(unsort(gathered, char_orig_idx))
        if self.pad:
            result = pad_packed_sequence(result, batch_first=True)[0]
    return result
def predict(self, batch, unsort=True):
    """Beam-decode output token strings for one batch.

    Args:
        batch: one batch from the DataLoader.
        unsort: if True, restore the original (pre-length-sorting) order.

    Returns:
        A list of decoded token strings, one per input.
    """
    inputs, orig_idx = unpack_batch(batch, self.use_cuda)
    src, src_mask, tgt, tgt_mask = inputs
    self.model.eval()
    # NOTE: the unused local `batch_size = src.size(0)` was removed.
    preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
    # ids -> chars, prune EOS/padding, then join chars into token strings.
    pred_seqs = utils.prune_decoded_seqs(
        [self.vocab.unmap(ids) for ids in preds])
    pred_tokens = ["".join(seq) for seq in pred_seqs]
    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def process(self, doc):
    """Parse `doc`: run the trainer over evaluation batches and write the
    predicted heads and dependency relations back into its CoNLL data.

    NOTE: a previous version also ran ``self.trainer.update(b, eval=True)``
    and printed the loss for every batch; that debug pass doubled inference
    cost per batch and has been removed.
    """
    batch = DataLoader(doc, self.config['batch_size'], self.config,
                       self.pretrain, vocab=self.vocab, evaluation=True,
                       sort_during_eval=True)
    preds = []
    for b in batch:
        preds += self.trainer.predict(b)
    # Batches are length-sorted for efficiency; restore document order.
    preds = unsort(preds, batch.data_orig_idx)
    batch.conll.set(['head', 'deprel'], [y for x in preds for y in x])
def predict(self, batch, unsort=True):
    """Predict dependency heads and relations, with a placeholder k-best API.

    Currently the single Chu-Liu/Edmonds tree is replicated `kbest` times and
    only candidate index `k` is emitted; a real k-best decoder is still TODO.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
        batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs
    # TODO: how to pass them with config
    kbest = 2
    k = 0
    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos,
                          ufeats, pretrained, lemma, head, deprel, word_orig_idx,
                          sentlens, wordlens)
    head_seqs = []
    deprel_seqs = []
    for sent_idx, (adj, length) in enumerate(zip(preds[0], sentlens)):
        # Decode one tree over the valid positions; drop the attachment for
        # the artificial root at index 0.
        scores = adj[:length, :length]
        best = chuliu_edmonds_one_root(scores)[1:]
        # TODO: use model with kbest — for now replicate the 1-best tree.
        candidates = [best] * kbest
        sent_heads = []
        sent_deprels = []
        for mst in candidates:
            sent_heads.append(mst)
            # Relation scored for each (dependent, chosen parent) pair;
            # word_index + 1 skips the root row.
            rel_ids = [preds[1][sent_idx][word_index + 1][parent_index]
                       for word_index, parent_index in enumerate(mst)]
            sent_deprels.append(self.vocab['deprel'].unmap(rel_ids))
        head_seqs.append(sent_heads)
        deprel_seqs.append(sent_deprels)
    pred_tokens = [[[str(head_seqs[i][k][j]), deprel_seqs[i][k][j]]
                    for j in range(sentlens[i] - 1)]
                   for i in range(batch_size)]
    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens