def test_contains_cycle():
    tree = [1, 2, 2, 4, 5, 2, 2]
    cyclic_tree = [1, 2, 2, 4, 5, 3, 2]
    partial_tree = [1, 2, 2, 4, 5, None, 2]
    multirooted_tree = [3, 2, 0, 3, 3, 7, 7, 3, 7, 10, 7, 10, 11, 12, 18, 16, 18, 17, 12, 3]
    assert contains_cycle(tree) is None
    assert contains_cycle(cyclic_tree) == {3, 4, 5}
    assert contains_cycle(partial_tree) is None
    assert contains_cycle(multirooted_tree) is None
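
# The test above exercises a contains_cycle() helper that is defined elsewhere
# (spaCy ships one in its nonproj module). A minimal sketch with the behavior
# the assertions expect, assuming heads[i] is the head index of token i, roots
# point at themselves, and None marks an unattached token:
def contains_cycle_sketch(heads):
    """Return the set of nodes on the first cycle found, else None."""
    for start in range(len(heads)):
        seen = set()
        node = start
        while node is not None and node not in seen:
            seen.add(node)
            head = heads[node]
            # a self-loop is a root, so the chain ends cleanly
            node = None if head == node else head
        if node is not None:
            # the walk re-entered a visited node, which must lie on a cycle
            cycle = set()
            current = node
            while True:
                cycle.add(current)
                current = heads[current]
                if current == node:
                    return cycle
    return None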
def convert_lines(path, lines, tokenizer, paragraph_id_regex, n_sents):
    paragraphs = []
    raw = ''
    sentences = []
    paragraph_id = None
    sentence_id = None
    sentence = ''
    tokens = []
    ents = []
    ent_start_char = None
    ent_label = None
    offset = 0
    state = 'sid'

    def error_line(_state, _path, _line_index, _sentence_id, _sentence, _line):
        print('Illegal format: state={}, file={} ({}), sent_id={}, {}'.format(
            _state, _path, _line_index + 1, _sentence_id, _sentence
        ), file=sys.stderr)
        print(_line, file=sys.stderr)
        raise ValueError

    # State machine over the CoNLL-U lines: 'sid' expects a sent_id comment,
    # 'text' the raw sentence text, and 'ios' consumes token lines until a
    # blank line closes the sentence.
    for line_index, line in enumerate(lines):
        line = line.rstrip()
        if state == 'sid':
            m = SID_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            sentence_id = m.group(1)
            m = re.match(paragraph_id_regex, sentence_id)
            if m:
                new_paragraph_id = m.group(1)
            else:
                new_paragraph_id = ''
            # a change of paragraph id flushes the sentences collected so far
            if paragraph_id is None or paragraph_id != new_paragraph_id:
                paragraph_id = new_paragraph_id
                if sentences:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []
            state = 'text'
        elif state == 'text':
            m = TEXT_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            sentence = m.group(1)
            raw += sentence
            state = 'ios'
        elif state == 'ios' and line != '':
            m = TOKEN_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            token_id = int(m.group(1)) - 1
            orth = m.group(2)
            lemma = m.group(3)
            pos = m.group(4)
            tag = m.group(5)
            head_id = int(m.group(7)) - 1
            if head_id < 0:  # the CoNLL-U root (head 0) points at itself
                head_id = token_id
            dep = m.group(8)
            options = m.group(10)
            whitespace = options.find('SpaceAfter=No') < 0
            tokens.append({
                'id': token_id,
                'orth': orth,
                'lemma': lemma,
                'pos': pos,
                'tag': tag,
                'dep': dep,
                'head': head_id - token_id,
                'whitespace': whitespace,
            })
            # collect NE spans as character offsets; they are mapped back to
            # the (possibly retokenized) tokens when the sentence is closed
            m = re.search(r'NE=([^|]+)', options)
            if m:
                label = m.group(1)
                if label.startswith('B-'):
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': offset,
                            'label': ent_label,
                        })
                    ent_start_char = offset
                    ent_label = label[2:]
                elif not label.startswith('I-') or not ent_label:
                    raise Exception('Bad NE label: ' + line)
            elif ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': offset,
                    'label': ent_label,
                })
                ent_start_char = None
                ent_label = None
            offset += len(orth)
            if whitespace:
                offset += 1
        elif state == 'ios' and line == '':
            if len(tokens) == 0:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            if ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': offset,
                    'label': ent_label,
                })
            heads = [t['id'] + t['head'] for t in tokens]
            if is_nonproj_tree(heads):
                print(file=sys.stderr)
                print('skip(non-projective):', path, sentence_id, file=sys.stderr)
            elif contains_cycle(heads):
                print(file=sys.stderr)
                print('skip(cyclic)', path, sentence_id, file=sys.stderr)
            else:
                if tokenizer:
                    retokenize(tokens, tokenizer(
                        ''.join([t['orth'] + (' ' if t['whitespace'] else '') for t in tokens])
                    ))
                # assign BILUO ner tags by re-walking the character offsets
                offset = 0
                ent_label = None
                ent_end = 0
                ent_queue = []
                for t in tokens:
                    end = offset + len(t['orth'])
                    if t['whitespace']:
                        end += 1
                    if ent_end > 0:
                        if offset < ent_end:
                            ent_queue.append(t)
                            offset = end
                            continue
                        if end >= ent_end:
                            if len(ent_queue) == 1:
                                ent_queue[0]['ner'] = 'U-' + ent_label
                            else:
                                ent_queue[0]['ner'] = 'B-' + ent_label
                                for et in ent_queue[1:-1]:
                                    et['ner'] = 'I-' + ent_label
                                ent_queue[-1]['ner'] = 'L-' + ent_label
                            ent_label = None
                            ent_end = 0
                            ent_queue.clear()
                    for ent in ents:
                        if ent['start'] < end and offset < ent['end']:
                            ent_label = ent['label']
                            ent_end = ent['end']
                            ent_queue.append(t)
                            break
                    offset = end
                if ent_end > 0:
                    if len(ent_queue) == 1:
                        ent_queue[0]['ner'] = 'U-' + ent_label
                    else:
                        ent_queue[0]['ner'] = 'B-' + ent_label
                        for et in ent_queue[1:-1]:
                            et['ner'] = 'I-' + ent_label
                        ent_queue[-1]['ner'] = 'L-' + ent_label
                for t in tokens:
                    if 'ner' not in t:
                        t['ner'] = 'O'
                sentences.append({'tokens': tokens})
                if len(sentences) >= n_sents:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []
            # reset per-sentence state
            sentence_id = None
            sentence = ''
            tokens = []
            ents = []
            ent_start_char = None
            ent_label = None
            offset = 0
            state = 'sid'
        else:
            error_line(state, path, line_index, sentence_id, sentence, line)
            return []

    if state != 'sid':
        error_line(state, path, len(lines), sentence_id, sentence, '<END OF FILE>')
        return []
    if sentences:
        paragraphs.append({
            'raw': raw,
            'sentences': sentences,
        })
    return paragraphs
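
# A hypothetical driver for convert_lines() above; the file handling, the
# paragraph-id regex, and the spaCy-style JSON wrapper are illustrative
# assumptions, not part of this module:
import json

def convert_file_sketch(path, n_sents=10):
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    # paragraph id = the sent_id prefix before its trailing counter
    paragraphs = convert_lines(path, lines, None, r'^(.*)-\d+$', n_sents)
    return json.dumps([{'id': path, 'paragraphs': paragraphs}], ensure_ascii=False)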
def retokenize(gold_tokens, doc, debug=False):
    if debug:
        print(doc.text)
        print([g['orth'] + (' ' if g['whitespace'] else '') for g in gold_tokens])
        print([t.orth_ + t.whitespace_ for t in doc])
    # Align the gold tokens with the tokenizer output by walking both sides
    # over character offsets. align_from_t marks the start of a doc-token
    # range covering one gold token; align_from_g marks the start of a gold
    # range covering one doc token.
    length = len(doc.text)
    index_g = 0
    g_offset = 0
    index_t = 0
    t_offset = 0
    align_from_g = None
    align_from_t = None
    last_aligned_g = 0
    last_aligned_t = 0
    while g_offset < length and t_offset < length:
        g = gold_tokens[index_g]
        g_end = g_offset + len(g['orth'])
        if g['whitespace']:
            g_end += 1
        t = doc[index_t]
        t_end = t_offset + len(t.orth_)
        if t.whitespace_:
            t_end += 1
        if debug:
            print(index_g, g_offset, g_end, g['orth'], align_from_g,
                  index_t, t_offset, t_end, t.orth_, align_from_t)
        if g_end == t_end:
            # both sides close at the same character: the ranges align
            if align_from_t is not None:
                if debug:
                    _print('>', gold_tokens[index_g:index_g + 1], doc[align_from_t:index_t + 1])
                rewrite_with_tokens(gold_tokens, index_g, doc[align_from_t:index_t + 1])
                index_g += index_t - align_from_t
                align_from_t = None
            elif align_from_g is not None:
                if debug:
                    _print('<', gold_tokens[align_from_g:index_g + 1], doc[index_t:index_t + 1])
                if unify_range(gold_tokens, align_from_g, index_g + 1, doc[index_t]):
                    index_g = align_from_g
                align_from_g = None
            elif g_offset == t_offset:
                if debug:
                    tag = g['tag'] == t.tag_
                    _print(
                        '==' if tag else '=',
                        gold_tokens[index_g:index_g + 1],
                        doc[index_t:index_t + 1]
                    )
                rewrite_with_tokens(gold_tokens, index_g, doc[index_t:index_t + 1])
            else:
                if debug:
                    _print('!', gold_tokens[last_aligned_g:index_g + 1], doc[last_aligned_t:index_t + 1])
            index_g += 1
            g_offset = g_end
            last_aligned_g = index_g
            index_t += 1
            t_offset = t_end
            last_aligned_t = index_t
        elif g_end > t_end:
            # the gold token spans several doc tokens
            if g_offset == t_offset:
                align_from_t = index_t
                if align_from_g is not None:
                    align_from_g = None
            index_t += 1
            t_offset = t_end
        else:
            # the doc token spans several gold tokens
            if g_offset == t_offset:
                align_from_g = index_g
                if align_from_t is not None:
                    align_from_t = None
            index_g += 1
            g_offset = g_end
    if last_aligned_g != len(gold_tokens) or g_offset != length or t_offset != length:
        raise Exception(
            'Unexpected state: len(gold_tokens)={},last_aligned_g={},len(text)={},g_offset={},t_offset={}'.format(
                len(gold_tokens), last_aligned_g, length, g_offset, t_offset,
            )
        )
    for g in gold_tokens:
        if g['head'] != 0 and g['tag'].endswith('可能') and g['dep'].find('as_') == -1:
            g['dep'] = '{}_as_{}'.format(g['dep'], g['pos'])
    heads = [g['id'] + g['head'] for g in gold_tokens]
    if is_nonproj_tree(heads):
        print(list(enumerate(heads)), file=sys.stderr)
        for t in gold_tokens:
            print(t, file=sys.stderr)
        raise Exception('non-projective')
    elif contains_cycle(heads):
        print(list(enumerate(heads)), file=sys.stderr)
        for t in gold_tokens:
            print(t, file=sys.stderr)
        raise Exception('cyclic')
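
# A toy, self-contained sketch of the offset-based alignment idea retokenize()
# uses, with plain strings instead of gold dicts and spaCy tokens (the helper
# name and the example tokens are illustrative assumptions):
def align_spans_sketch(gold, system):
    """Pair up minimal blocks of gold/system tokens covering the same text."""
    blocks = []
    gi = si = g_off = s_off = g_start = s_start = 0
    while gi < len(gold) and si < len(system):
        g_end = g_off + len(gold[gi])
        s_end = s_off + len(system[si])
        if g_end == s_end:
            # both sides close at the same character: emit one aligned block
            blocks.append((gold[g_start:gi + 1], system[s_start:si + 1]))
            gi, si = gi + 1, si + 1
            g_off, s_off = g_end, s_end
            g_start, s_start = gi, si
        elif g_end > s_end:
            si, s_off = si + 1, s_end  # system token ends first: extend the block
        else:
            gi, g_off = gi + 1, g_end  # gold token ends first: extend the block
    return blocks

# align_spans_sketch(['東京都', 'の'], ['東京', '都', 'の'])
# -> [(['東京都'], ['東京', '都']), (['の'], ['の'])]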
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
    assert contains_cycle(tree) is None
    assert contains_cycle(cyclic_tree) == {3, 4, 5}
    assert contains_cycle(partial_tree) is None
    assert contains_cycle(multirooted_tree) is None
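
# The fixture-based variant above assumes pytest fixtures supplying the same
# head arrays used in test_contains_cycle(); a minimal conftest.py-style
# sketch (an assumption about how the fixtures are provided):
import pytest

@pytest.fixture
def tree():
    return [1, 2, 2, 4, 5, 2, 2]

@pytest.fixture
def cyclic_tree():
    return [1, 2, 2, 4, 5, 3, 2]

@pytest.fixture
def partial_tree():
    return [1, 2, 2, 4, 5, None, 2]

@pytest.fixture
def multirooted_tree():
    return [3, 2, 0, 3, 3, 7, 7, 3, 7, 10, 7, 10, 11, 12, 18, 16, 18, 17, 12, 3]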
def convert_lines(
        path,
        lines,
        tokenizer,
        paragraph_id_regex,
        n_sents,
        extend_dep_labels,
        ensure_end_period,
        luw_ent,
        _print_bunsetu_dep=False,
):
    paragraphs = []
    raw = ''
    sentences = []
    paragraph_id = None
    sentence_id = None
    sentence = ''
    tokens = []
    bunsetu_head_deps = {}
    bunsetu_all_deps = {}
    bunsetu_begin = None
    bunsetu_root = None
    bunsetu_head = None
    bunsetu_heads = None
    bunsetu_dep = None
    ent_target = False
    ents = []
    ent_start_char = None
    ent_end_char = None
    ent_label = None
    skip = False
    offset = 0
    state = 'sid'

    def error_line(_state, _path, _line_index, _sentence_id, _sentence, _line):
        print('Illegal format: state={}, file={} ({}), sent_id={}, {}'.format(
            _state, _path, _line_index + 1, _sentence_id, _sentence
        ), file=sys.stderr)
        print(_line, file=sys.stderr)
        raise ValueError

    for line_index, line in enumerate(lines):
        line = line.rstrip()
        if state == 'sid':
            m = SID_PATTERN.match(line)
            if m is None:
                m = NEW_DOC_ID_PATTERN.match(line)
                if m is not None:
                    continue
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            sentence_id = m.group(1)
            m = re.match(paragraph_id_regex, sentence_id)
            if m:
                new_paragraph_id = m.group(1)
            else:
                new_paragraph_id = ''
            # a change of paragraph id flushes the sentences collected so far
            if paragraph_id is None or paragraph_id != new_paragraph_id:
                paragraph_id = new_paragraph_id
                if sentences:
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []
            state = 'text'
        elif state == 'text':
            m = TEXT_PATTERN.match(line)
            if m is None:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            sentence = m.group(1)
            state = 'ios'
        elif state == 'ios' and line != '':
            m = TOKEN_PATTERN.match(line)
            if m is None:
                m = TEXT_EN_PATTERN.match(line)
                if m is not None:
                    continue
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            token_id = int(m.group(1)) - 1
            orth = m.group(2)
            lemma = m.group(3)
            pos = m.group(4)
            tag = m.group(5)
            head_id = int(m.group(7)) - 1
            if head_id < 0:  # the CoNLL-U root (head 0) points at itself
                head_id = token_id
            dep = m.group(8)
            options = m.group(10)
            whitespace = options.find('SpaceAfter=No') < 0
            tokens.append({
                'id': token_id,
                'orth': orth,
                'lemma': lemma,
                'pos': pos,
                'tag': tag,
                'dep': dep,
                'head': head_id - token_id,
                'whitespace': whitespace,
                'ner': 'O',
            })
            # track bunsetu (phrase) boundaries and the dependency label of
            # each bunsetu's root token
            m = BUNSETU_PATTERN.search(options)
            if m.group(1) == "B":
                if bunsetu_dep:
                    for h, d in bunsetu_heads:
                        assert bunsetu_begin <= h < token_id or h == bunsetu_head, str(bunsetu_heads) + line
                    if extend_dep_labels and bunsetu_dep.lower() != 'root':
                        tokens[bunsetu_root]['dep'] += '_bunsetu'
                    if bunsetu_dep not in bunsetu_head_deps:
                        bunsetu_head_deps[bunsetu_dep] = 0
                    bunsetu_head_deps[bunsetu_dep] += 1
                bunsetu_begin = token_id
                bunsetu_root = token_id
                bunsetu_head = head_id
                bunsetu_heads = []
                bunsetu_dep = dep
                bunsetu_heads.append((head_id, dep))
            elif head_id < bunsetu_begin or token_id <= bunsetu_head < head_id or dep.lower() == "root":
                bunsetu_root = token_id
                bunsetu_head = head_id
                bunsetu_dep = dep
                bunsetu_heads.append((head_id, dep))
            if bunsetu_dep not in bunsetu_all_deps:
                bunsetu_all_deps[bunsetu_dep] = 0
            bunsetu_all_deps[bunsetu_dep] += 1
            # entity labels come from LUW= or NE= annotations in the options
            if luw_ent:
                m = LUW_PATTERN.search(options)
            else:
                m = NE_PATTERN.search(options)
            if m:
                ent_target = True
                if luw_ent:
                    label = m.group(1) + "-" + m.group(2)
                else:
                    label = m.group(1)
                    # normalize BILUO to BIO before collecting the spans
                    if label[0] == "U":
                        label = "B" + label[1:]
                    elif label[0] == "L":
                        label = "I" + label[1:]
                if label.startswith('B'):
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': ent_end_char,
                            'label': ent_label,
                        })
                    ent_start_char = offset
                    ent_end_char = offset + len(orth)
                    ent_label = label[2:]
                elif label.startswith('I'):
                    if not ent_label or ent_label != label[2:]:
                        print('inconsistent ENT label: ' + str(ent_label) + ', ' + line, file=sys.stderr)
                        skip = True
                    else:
                        ent_end_char = offset + len(orth)
                elif not luw_ent and label == "O":
                    if ent_label:
                        ents.append({
                            'start': ent_start_char,
                            'end': ent_end_char,
                            'label': ent_label,
                        })
                    ent_start_char = None
                    ent_end_char = None
                    ent_label = None
                else:
                    print('bad ENT label: ' + line, file=sys.stderr)
                    skip = True
                    ent_start_char = None
                    ent_end_char = None
                    ent_label = None
            elif luw_ent:
                print('missing LUW label: ' + line, file=sys.stderr)
                skip = True
            elif ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': ent_end_char,
                    'label': ent_label,
                })
                ent_start_char = None
                ent_end_char = None
                ent_label = None
            offset += len(orth)
            if whitespace:
                offset += 1
        elif state == 'ios' and line == '':
            if len(tokens) == 0:
                error_line(state, path, line_index, sentence_id, sentence, line)
                return []
            if ent_label:
                ents.append({
                    'start': ent_start_char,
                    'end': ent_end_char,
                    'label': ent_label,
                })
            # close the last bunsetu of the sentence
            if bunsetu_dep:
                if extend_dep_labels and bunsetu_dep.lower() != 'root':
                    tokens[bunsetu_root]['dep'] += '_bunsetu'
                if bunsetu_dep not in bunsetu_head_deps:
                    bunsetu_head_deps[bunsetu_dep] = 0
                bunsetu_head_deps[bunsetu_dep] += 1
            heads = [t['id'] + t['head'] for t in tokens]
            if is_nonproj_tree(heads):
                print(file=sys.stderr)
                print('skip(non-projective):', path, sentence_id, file=sys.stderr)
            elif contains_cycle(heads):
                print(file=sys.stderr)
                print('skip(cyclic)', path, sentence_id, file=sys.stderr)
            elif skip:
                print(file=sys.stderr)
                print('skip(bad-luw-label)', path, sentence_id, file=sys.stderr)
            else:
                if tokenizer:
                    retokenize_gold(
                        tokens,
                        tokenizer(
                            ''.join([t['orth'] + (' ' if t['whitespace'] else '') for t in tokens])
                        ),
                    )
                if ent_target:
                    # assign BILUO ner tags by re-walking the character offsets
                    offset = 0
                    ent_label = None
                    ent_end = 0
                    ent_queue = []
                    for t in tokens:
                        end = offset + len(t['orth'])
                        if t['whitespace']:
                            end += 1
                        if ent_end > 0:
                            if offset < ent_end:
                                ent_queue.append(t)
                                offset = end
                                continue
                            if end >= ent_end:
                                if len(ent_queue) == 1:
                                    ent_queue[0]['ner'] = 'U-' + ent_label
                                else:
                                    ent_queue[0]['ner'] = 'B-' + ent_label
                                    for et in ent_queue[1:-1]:
                                        et['ner'] = 'I-' + ent_label
                                    ent_queue[-1]['ner'] = 'L-' + ent_label
                                ent_label = None
                                ent_end = 0
                                ent_queue.clear()
                        for ent in ents:
                            if ent['start'] < end and offset < ent['end']:
                                ent_label = ent['label']
                                ent_end = ent['end']
                                ent_queue.append(t)
                                break
                        offset = end
                    if ent_end > 0:
                        if len(ent_queue) == 1:
                            ent_queue[0]['ner'] = 'U-' + ent_label
                        else:
                            ent_queue[0]['ner'] = 'B-' + ent_label
                            for et in ent_queue[1:-1]:
                                et['ner'] = 'I-' + ent_label
                            ent_queue[-1]['ner'] = 'L-' + ent_label
                raw += sentence
                sentences.append({'tokens': tokens})
                if len(sentences) >= n_sents and (not ensure_end_period or tokens[-1]['orth'] == '。'):
                    paragraphs.append({
                        'raw': raw,
                        'sentences': sentences,
                    })
                    raw = ''
                    sentences = []
            # reset per-sentence state
            sentence_id = None
            sentence = ''
            tokens = []
            bunsetu_begin = None
            bunsetu_head = None
            bunsetu_dep = None
            ent_target = False
            ents = []
            ent_start_char = None
            ent_end_char = None
            ent_label = None
            skip = False
            offset = 0
            state = 'sid'
        else:
            error_line(state, path, line_index, sentence_id, sentence, line)
            return []

    if state != 'sid':
        error_line(state, path, len(lines), sentence_id, sentence, '<END OF FILE>')
        return []
    if sentences:
        if not ensure_end_period or sentences[-1]['tokens'][-1]['orth'] == '。':
            paragraphs.append({
                'raw': raw,
                'sentences': sentences,
            })
        else:
            # merge the trailing sentences into the last flushed paragraph
            paragraph = paragraphs[-1]
            paragraphs[-1] = {
                'raw': raw + paragraph['raw'],
                'sentences': sentences + paragraph['sentences'],
            }
    if _print_bunsetu_dep:
        for dep, count in sorted(bunsetu_head_deps.items()):
            print("bunsetu_dep:", dep, count, bunsetu_all_deps[dep], sep='\t')
    return paragraphs
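
# Hypothetical call into the extended converter above; the file name and every
# flag value are assumptions for illustration, not project defaults:
with open('ja_gsd-ud-train.conllu', encoding='utf-8') as f:
    paragraphs = convert_lines(
        path='ja_gsd-ud-train.conllu',
        lines=f.readlines(),
        tokenizer=None,                    # pass a tokenizer to run retokenize_gold()
        paragraph_id_regex=r'^(.*)-\d+$',  # paragraph id = sent_id minus its counter
        n_sents=10,                        # flush a paragraph every 10 sentences
        extend_dep_labels=True,            # suffix bunsetu-root deps with '_bunsetu'
        ensure_end_period=False,           # if True, only flush after a sentence ending in '。'
        luw_ent=False,                     # read NE= labels rather than LUW= labels
        _print_bunsetu_dep=True,           # dump bunsetu dependency counts at the end
    )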