Пример #1
0
 def predictions_to_human(self,
                          predictions,
                          outputs,
                          data,
                          use_pos,
                          conll=True):
     """Convert raw prediction indices into human-readable parse structures.

     Args:
         predictions: Iterable of ``(arcs, rels)`` pairs, one per sentence,
             where ``arcs`` are head indices and ``rels`` are relation-label
             ids looked up in ``self.vocabs['rel']``.
         outputs: List extended in place with one converted sentence each.
         data: Input sentences; each cell is ``(token, pos)`` when
             ``use_pos`` is true, otherwise a bare token.
         use_pos: Whether each input cell carries a POS tag.
         conll: If true, build ``CoNLLSentence``/``CoNLLWord`` objects;
             otherwise emit plain ``(head, relation)`` tuples per token.
     """
     if conll:
         for d, (arcs, rels) in zip(data, predictions):
             sent = CoNLLSentence()
             for idx, (cell, a, r) in enumerate(zip(d, arcs, rels)):
                 if use_pos:
                     token, pos = cell
                 else:
                     token, pos = cell, None
                 sent.append(
                     CoNLLWord(idx + 1,
                               token,
                               cpos=pos,
                               head=a,
                               deprel=self.vocabs['rel'][r]))
             outputs.append(sent)
     else:
         # Idiom fix: the original inner loop used enumerate() with an unused
         # index and appended one tuple at a time; a comprehension builds the
         # identical (head, relation) list directly. zip(data, predictions)
         # is kept so the output length still tracks `data`.
         for _, (arcs, rels) in zip(data, predictions):
             outputs.append([(a, self.vocabs['rel'][r])
                             for a, r in zip(arcs, rels)])
Пример #2
0
 def XY_to_inputs_outputs(self,
                          X: Union[tf.Tensor, Tuple[tf.Tensor]],
                          Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                          gold=False,
                          inputs=None,
                          conll=True) -> Iterable:
     """Convert batched network tensors into per-sentence parse structures.

     Args:
         X: ``(words, feats, mask)`` tensors fed to the network.
         Y: ``(arc_preds, rel_preds)`` predicted head / relation tensors.
         gold: Kept for interface compatibility; not consulted here.
         inputs: Human-readable input sentences, each token a
             ``(form, cpos)`` pair.
         conll: Truthy to build ``CoNLLSentence`` objects, falsy to emit
             bare ``[heads, labels]`` pairs per token.

     Returns:
         A list with one converted sentence per input sentence.
     """
     (words, feats, mask), (arc_preds, rel_preds) = X, Y
     decoded = self.Y_to_outputs((arc_preds, rel_preds, mask))
     results = []
     for sent_input, sent_output in zip(inputs, decoded):
         sent = CoNLLSentence()
         position = 0
         for (form, cpos), candidates in zip(sent_input, sent_output):
             position += 1
             # Each token may carry several (head, label) candidates;
             # collect them into parallel lists, as the original did.
             heads = [candidate[0] for candidate in candidates]
             labels = [candidate[1] for candidate in candidates]
             if conll:
                 sent.append(CoNLLWord(id=position,
                                       form=form,
                                       cpos=cpos,
                                       head=heads,
                                       deprel=labels))
             else:
                 sent.append([heads, labels])
         results.append(sent)
     return results
Пример #3
0
 def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                          gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
     """Turn batched network inputs/outputs into CoNLL-style sentences.

     Args:
         X: ``(words, feats, mask)`` tensors fed to the network.
         Y: ``(arc_preds, rel_preds)`` predicted head / relation tensors.
         gold: Not consulted in this body; presumably kept for interface
             compatibility with sibling transforms — verify against callers.
         inputs: Human-readable inputs; recomputed via ``X_to_inputs(X)``
             when ``None``.
         conll: Dual-purpose flag: any truthy value builds CoNLL sentences,
             and the exact string ``'.conll'`` selects ``CoNLLWord``
             (CoNLL-X) while any other truthy value selects ``CoNLLUWord``
             (CoNLL-U). Falsy emits bare ``[head, deprel]`` pairs.
         arc_scores: Not consulted in this body — TODO confirm intent.
         rel_scores: Not consulted in this body — TODO confirm intent.

     Returns:
         A list with one converted sentence per input sentence.
     """
     (words, feats, mask), (arc_preds, rel_preds) = X, Y
     if inputs is None:
         inputs = self.X_to_inputs(X)
     ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
     sents = []
     for x, y in zip(inputs, ys):
         sent = CoNLLSentence()
         for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
             # With POS enabled and no joint POS tagging configured, each
             # cell is a (form, pos) pair; otherwise the cell is a bare form.
             if self.use_pos and not self.config.get('joint_pos', None):
                 form, cpos = cell
             else:
                 form, cpos = cell, None
             if conll:
                 # NOTE: `conll` is compared to the string '.conll' to pick
                 # the CoNLL-X word type; any other truthy value means CoNLL-U.
                 sent.append(
                     CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel) if conll == '.conll'
                     else CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
             else:
                 sent.append([head, deprel])
         sents.append(sent)
     return sents
Пример #4
0
def _make_ptm():
    """Split the raw PTM corpus into train/dev/test ``.conllx`` files.

    Reads the raw resource (blank-line-separated records of four lines:
    tokens, POS tags, relations, head indices), converts each record into a
    ``CoNLLSentence``, and writes the three splits next to the raw file.
    Does nothing if all three split files already exist.
    """
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    # Skip the conversion when every split has already been generated.
    if all(os.path.isfile(os.path.join(home, f'{part}.conllx'))
           for part in ['train', 'dev', 'test']):
        return
    sents = []

    def _flush(buffer):
        # Convert one buffered 4-line record into a CoNLLSentence.
        if buffer:
            tok, pos, rel, arc = [x.split() for x in buffer]
            sent = CoNLLSentence()
            for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
                sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
            sents.append(sent)
            buffer.clear()

    # NOTE(review): `open(raw)` uses the locale default encoding — presumably
    # the corpus is UTF-8; confirm before pinning encoding= explicitly.
    with open(raw) as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
            else:
                _flush(buffer)
        # BUG FIX: the original only flushed on blank lines, so a final
        # sentence with no trailing blank line was silently dropped.
        _flush(buffer)

    prev_offset = 0
    # Sentences 12001-13000 and 13001-14463 are used as the development and test set, respectively. The remaining
    # sentences are used as training data.
    for part, offset in zip(['train', 'dev', 'test'], [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences [cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset