def _build_morphs(doc, begin, end, base_idx):
    # Build one Morph per token in doc[begin:end], with character offsets
    # relative to base_idx (where the sentence starts in doc.text), then
    # wire the intra-sentence dependency arcs between them.
    span = doc[begin:end]
    morphs = [
        Morph(
            i,
            t.idx - base_idx,
            t.orth_,
            t.lemma_,
            t.pos_,
            t.tag_,
            ex_attr(t).inf,
            t.whitespace_,
        )
        for i, t in enumerate(span)
    ]
    for morph, token in zip(morphs, span):
        # token.head.i is a doc-level index; shift by begin to index morphs.
        morph.dep_morph = morphs[token.head.i - begin]
        morph.dep_label = token.dep_.lower()
    return morphs


def create_parsed_sentences(doc, separate_sentences=True):
    """Convert a spaCy Doc into a list of ParsedSentence objects.

    When separate_sentences is True the doc is split at sentence-start
    tokens; otherwise the whole doc becomes one ParsedSentence.  An empty
    doc yields a single empty sentence.
    """
    if len(doc) == 0:
        return [ParsedSentence('', [])]
    sentences = []
    begin = 0
    base_idx = doc[0].idx
    if separate_sentences:
        for token in doc:
            if token.sent_start and token.i > 0:
                end = token.i
                morphs = _build_morphs(doc, begin, end, base_idx)
                sentences.append(
                    ParsedSentence(doc.text[base_idx:token.idx], morphs))
                begin = end
                base_idx = token.idx
    if begin < len(doc):
        # Trailing sentence (or the whole doc when not splitting).
        morphs = _build_morphs(doc, begin, len(doc), base_idx)
        sentences.append(ParsedSentence(doc.text[base_idx:], morphs))
    return sentences
def rewrite_with_tokens(self, rewriting_morph_index, tokens):
    """Replace the morph at rewriting_morph_index with the given spaCy tokens.

    The first token overwrites the original morph in place; any remaining
    tokens are inserted immediately after it as new morphs that depend on
    the rewritten one.  Ids of the following morphs are shifted by the
    number of inserted morphs.
    """
    origin = self.morphs[rewriting_morph_index]
    origin_pos = origin.pos  # remember the POS before it is overwritten
    t = tokens[0]
    origin.surface = t.orth_
    origin.lemma = t.lemma_
    origin.pos = t.pos_
    origin.tag = t.tag_
    origin.inf = ex_attr(t).inf
    origin.trailing_space = t.whitespace_
    if origin_pos != origin.pos:
        # Record the pre-rewrite POS in the dependency label.
        origin.dep_label = '{}_as_{}'.format(origin.dep_label, origin_pos)
    if len(tokens) == 1:
        return
    label = 'as_{}'.format(origin.pos)
    others = [
        Morph(
            rewriting_morph_index + i + 1,
            # NOTE(review): this offset value is recomputed in the loop
            # below, so the expression here is effectively provisional.
            origin.offset + t.idx - tokens[0].idx,
            t.orth_,
            t.lemma_,
            t.pos_,
            t.tag_,
            ex_attr(t).inf,
            t.whitespace_,
        )
        for i, t in enumerate(tokens[1:])
    ]
    # Lay the inserted morphs out after the rewritten one.
    # NOTE(review): the base does not add len(origin.surface) — confirm
    # offsets are really meant to start at origin.offset (+1 for a trailing
    # space) rather than after the origin's surface text.
    offset = origin.offset
    if origin.trailing_space:
        offset += 1
    for m in others:
        m.offset = offset
        offset += len(m.surface)
        if m.trailing_space:
            offset += 1
        # Every inserted morph depends on the rewritten head morph.
        m.dep_morph = origin
        m.dep_label = label
    # Shift ids of all morphs after the insertion point, then insert.
    for m in self.morphs[rewriting_morph_index + 1:]:
        m.id += len(others)
    self.morphs[rewriting_morph_index + 1:rewriting_morph_index + 1] = others
def to_doc(self, vocab, is_parsed=False):
    """Build a spaCy Doc from this sentence's morphs.

    Copies tag/pos/inflection/lemma onto each token; when is_parsed is
    True, also restores the dependency arcs recorded on the morphs.
    """
    surfaces = []
    trailing = []
    for m in self.morphs:
        surfaces.append(m.surface)
        trailing.append(m.trailing_space)
    doc = Doc(vocab, words=surfaces, spaces=trailing)
    for tok, m in zip(doc, self.morphs):
        tok.tag_ = m.tag
        tok.pos_ = m.pos
        ex_attr(tok).inf = m.inf
        # work around: lemma_ must be set after tag_ (spaCy's bug)
        tok.lemma_ = m.lemma
        if is_parsed and m.dep_label:
            tok.dep_ = m.dep_label
            tok.head = doc[m.dep_morph.id]
    return doc
def unify_range(gold_tokens, start, end, replacing_token, extend_dep_labels):
    """Collapse gold_tokens[start:end] into a single token overwritten by
    replacing_token, preserving the surrounding dependency structure.

    gold_tokens is a list of dicts with 'id', 'head' (relative offset),
    'dep', 'orth', 'lemma', 'pos', 'tag', 'inf', 'whitespace' keys.
    Returns True on success; returns False (leaving gold_tokens unchanged)
    when the range has more than one attachment to the outside.
    """
    dep_outer_id = None
    dep_outer_label = None
    head_pos = None
    for g in gold_tokens[start:end]:
        head_id = g['id'] + g['head']  # 'head' is relative; make it absolute
        # Arcs leaving the range (or the sentence root, head == 0) must all
        # agree on one target, otherwise the range cannot be unified.
        if head_id < start or end <= head_id or g['head'] == 0:
            if dep_outer_id is None:
                dep_outer_id = head_id
                dep_outer_label = g['dep']
                head_pos = g['pos']
            elif dep_outer_id != head_id:
                return False
    if dep_outer_id is None:
        print(gold_tokens[start:end], file=sys.stderr)
        raise Exception('unexpected state')
    elif start < dep_outer_id < end:
        # The root lay inside the range; the merged token (at start) is it.
        dep_outer_id = start
    g = gold_tokens[start]
    g['orth'] = replacing_token.orth_
    g['lemma'] = replacing_token.lemma_
    g['pos'] = replacing_token.pos_
    g['tag'] = replacing_token.tag_
    g['inf'] = ex_attr(replacing_token).inf
    g['whitespace'] = replacing_token.whitespace_ != ''
    g['head'] = dep_outer_id - start
    if dep_outer_label.startswith('as_'):
        g['dep'] = dep_outer_label
    else:
        dep = dep_outer_label.split('_as_')[0]
        g['dep'] = dep if not extend_dep_labels or head_pos == g[
            'pos'] else '{}_as_{}'.format(dep, head_pos)
    # Fix up arcs of tokens outside the merged range.  end-start-1 is the
    # number of tokens that will be deleted.
    for g in gold_tokens:
        if g['id'] <= start and end <= g['id'] + g['head']:
            # arc from the left spanning over the range: shrink its length
            g['head'] -= end - start - 1
        elif g['id'] <= start < g['id'] + g['head']:
            # arc from the left pointing into the range: retarget to start
            g['head'] = start - g['id']
        elif g['id'] + g['head'] <= start and end <= g['id']:
            # arc from the right spanning over the range: compensate for the
            # id renumbering applied below
            g['head'] += end - start - 1
        elif g['id'] + g['head'] < end <= g['id']:
            # arc from the right pointing into the range: retarget so that,
            # after renumbering, it points at the merged token
            g['head'] = end - g['id'] - 1
    # Renumber everything after the removed span, then drop the merged part.
    for g in gold_tokens[end:]:
        g['id'] -= end - start - 1
    del gold_tokens[start + 1:end]
    return True
def unify_range(self, start, end, replacing_token):
    """Merge morphs[start:end] into the single morph at start, overwriting
    it with replacing_token.

    Succeeds only when the range attaches to the outside through exactly
    one arc (or contains the sentence root); otherwise returns False
    without modifying anything.  Returns True on success.
    """
    dep_outer_id = None
    dep_outer_label = None
    head = None
    for m in self.morphs[start:end]:
        if start <= m.dep_morph.id < end:
            if m.dep_morph.id == m.id:
                # A self-arc marks the sentence root.  Use explicit None
                # checks throughout: morph id 0 is a valid head id and must
                # not be read as "no head found yet" (the original truthiness
                # tests mishandled id 0; the sibling gold-token unify_range
                # already uses `is None`).
                if dep_outer_id is not None:
                    return False
                dep_outer_id = m.id
                dep_outer_label = m.dep_label
                head = m
        elif dep_outer_id is not None:
            if dep_outer_id == m.dep_morph.id:
                head = m
            else:
                # A second, different outside head: cannot unify.
                return False
        else:
            dep_outer_id = m.dep_morph.id
            dep_outer_label = m.dep_label
            head = m
    if dep_outer_id is None:
        raise Exception('unexpected state')
    elif start < dep_outer_id < end:
        # The root lay inside the range; the merged morph becomes the head.
        dep_outer_id = start
    origin = self.morphs[start]
    origin.surface = replacing_token.orth_
    origin.lemma = replacing_token.lemma_
    origin.pos = replacing_token.pos_
    origin.tag = replacing_token.tag_
    origin.inf = ex_attr(replacing_token).inf
    origin.trailing_space = replacing_token.whitespace_
    origin.dep_morph = self.morphs[dep_outer_id]
    origin.dep_label = dep_outer_label if origin.pos == head.pos else \
        '{}_as_{}'.format(dep_outer_label, head.pos)
    # Re-point arcs that targeted the removed morphs at the merged one.
    for m in self.morphs:
        if start < m.dep_morph.id < end:
            m.dep_morph = origin
    del self.morphs[start + 1:end]
    # Renumber the morphs that followed the removed span.
    for m in self.morphs:
        if m.id >= end:
            m.id -= end - start - 1
    return True