def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
    if conll:
        # Emit CoNLL sentences: one CoNLLWord per token, with the predicted
        # head index and the relation id decoded through the rel vocabulary.
        for d, (arcs, rels) in zip(data, predictions):
            sent = CoNLLSentence()
            for idx, (cell, a, r) in enumerate(zip(d, arcs, rels)):
                if use_pos:
                    token, pos = cell
                else:
                    token, pos = cell, None
                sent.append(
                    CoNLLWord(idx + 1, token, cpos=pos, head=a, deprel=self.vocabs['rel'][r]))
            outputs.append(sent)
    else:
        # Emit bare (head, relation) tuples per token instead of CoNLL rows.
        for d, (arcs, rels) in zip(data, predictions):
            sent = []
            for idx, (a, r) in enumerate(zip(arcs, rels)):
                sent.append((a, self.vocabs['rel'][r]))
            outputs.append(sent)
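
# A minimal, hedged sketch (not part of the original module) of how the conll
# branch above turns (token, pos) cells and (arc, rel) predictions into rows.
# All names and sample values below are hypothetical.
def _sketch_predictions_to_human():
    rel_vocab = ['<pad>', 'nsubj', 'root', 'dobj']  # stands in for self.vocabs['rel']
    data = [[('I', 'PN'), ('love', 'VV'), ('parsing', 'NN')]]
    predictions = [([2, 0, 2], [1, 2, 3])]  # one (arcs, rels) pair per sentence
    for d, (arcs, rels) in zip(data, predictions):
        for idx, ((token, pos), a, r) in enumerate(zip(d, arcs, rels)):
            print(idx + 1, token, pos, a, rel_vocab[r])
    # 1 I PN 2 nsubj
    # 2 love VV 0 root
    # 3 parsing NN 2 dobj
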
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                         gold=False, inputs=None, conll=True) -> Iterable:
    (words, feats, mask), (arc_preds, rel_preds) = X, Y
    xs = inputs
    ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
    sents = []
    for x, y in zip(xs, ys):
        sent = CoNLLSentence()
        for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
            # Each token may carry several (head, rel) pairs, so both columns
            # are collected as parallel lists.
            head = [p[0] for p in pred]
            deprel = [p[1] for p in pred]
            if conll:
                sent.append(
                    CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
            else:
                sent.append([head, deprel])
        sents.append(sent)
    return sents
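
# Hedged sketch (hypothetical values): in this variant each token may have
# multiple governors, as in semantic dependency graphs, which is why head and
# deprel above are lists rather than single values.
def _sketch_multi_head_pred():
    pred = [(2, 'Agt'), (4, 'Cont')]  # made-up (head, rel) pairs for one token
    head = [p[0] for p in pred]       # -> [2, 4]
    deprel = [p[1] for p in pred]     # -> ['Agt', 'Cont']
    return head, deprel
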
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                         gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
    (words, feats, mask), (arc_preds, rel_preds) = X, Y
    if inputs is None:
        inputs = self.X_to_inputs(X)
    ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
    sents = []
    for x, y in zip(inputs, ys):
        sent = CoNLLSentence()
        for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
            # Cells carry (form, cpos) pairs only when POS is a separate input;
            # with joint POS tagging (or no POS at all) the cell is the form.
            if self.use_pos and not self.config.get('joint_pos', None):
                form, cpos = cell
            else:
                form, cpos = cell, None
            if conll:
                # conll doubles as the target file extension: '.conll' selects
                # CoNLL-X rows, anything else CoNLL-U rows.
                sent.append(
                    CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel)
                    if conll == '.conll' else
                    CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
            else:
                sent.append([head, deprel])
        sents.append(sent)
    return sents
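
# Hedged sketch of the three output shapes the overload above can produce; the
# mapping is read from the code, the sample values are invented:
#   conll == '.conll'  -> CoNLL-X rows (CoNLLWord with a cpos column)
#   any other truthy   -> CoNLL-U rows (CoNLLUWord with a upos column)
#   falsy conll        -> bare [head, deprel] pairs per token
def _sketch_output_shapes():
    y = [(2, 'nsubj'), (0, 'root')]  # hypothetical (head, deprel) per token
    return [[head, deprel] for head, deprel in y]  # [[2, 'nsubj'], [0, 'root']]
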
def _make_ptm():
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    # Skip the conversion if every split has already been generated.
    done = True
    for part in ['train', 'dev', 'test']:
        if not os.path.isfile(os.path.join(home, f'{part}.conllx')):
            done = False
            break
    if done:
        return
    sents = []
    with open(raw) as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
            else:
                if buffer:
                    # Each blank-line-terminated block holds four whitespace-
                    # separated lines: tokens, POS tags, dependency relations
                    # and head indices.
                    tok, pos, rel, arc = [x.split() for x in buffer]
                    sent = CoNLLSentence()
                    for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
                        sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
                    sents.append(sent)
                    buffer.clear()
    prev_offset = 0
    # Sentences 12001-13000 and 13001-14463 are used as the development and
    # test set, respectively. The remaining sentences are used as training data.
    for part, offset in zip(['train', 'dev', 'test'], [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences '
                   f'[cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset
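
# Hedged illustration of the raw PTM block format parsed above: each sentence
# is four whitespace-separated lines (tokens, POS tags, dependency relations,
# head indices) terminated by a blank line. The sentence below is invented.
def _sketch_ptm_block():
    buffer = ['我 喜欢 解析', 'PN VV NN', 'nsubj root dobj', '2 0 2']
    tok, pos, rel, arc = [x.split() for x in buffer]
    for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
        print(i + 1, t, p, a, r)
    # 1 我 PN 2 nsubj
    # 2 喜欢 VV 0 root
    # 3 解析 NN 2 dobj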