# NOTE(review): these statements were recovered from source whose line breaks
# and indentation had been collapsed. The loop headers that enclose them are
# not visible here, so the nesting shown below (in particular where
# `space_idx += 1` sits) is reconstructed from the data flow -- confirm it
# against the enclosing loops before merging.

# Tail of the sentence-text rebuild: append a blank and the current part
# with its leading whitespace removed, then store the result in the metadata.
text += ' '
text += part.lstrip()
meta['text'] = text

# (insert position, range token) pairs, applied once the scan is finished.
multi_tokens = []
# Position in start_spaces; advanced once per TOKEN token of this sentence.
space_idx = 0
for tok_idx, tok in enumerate(sent):
    id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
    # TOKEN embedded in a longer form, or sitting on a range ("a-b") line,
    # is treated as a sign that the file has been processed before.
    if TOKEN in form and ('-' in id_ or form != TOKEN):
        raise ValueError('ERROR: Already edited?')
    if form == TOKEN:
        # Gluing TOKEN to its successor needs a successor: without the
        # upper-bound check a TOKEN that ends the sentence raised IndexError.
        # NOTE(review): the leading `tok_idx` test also skips a TOKEN that is
        # the very first token of the sentence -- confirm that is intended.
        has_next = tok_idx + 1 < len(sent)
        if tok_idx and has_next and not start_spaces[space_idx]:
            next_tok = sent[tok_idx + 1]
            misc['SpaceAfter'] = 'No'
            # Range token covering TOKEN plus the token that follows it.
            multi_token = Conllu.from_sentence(
                [form + next_tok['FORM']])[0]
            multi_token['ID'] = '{}-{}'.format(id_, next_tok['ID'])
            multi_token['MISC'] = deepcopy(next_tok['MISC'])
            multi_tokens.append((tok_idx, multi_token))
        space_idx += 1

# Insert back to front so the recorded positions are not shifted by
# earlier insertions.
for idx, tok in reversed(multi_tokens):
    sent.insert(idx, tok)

# Discard the flags used by this sentence (len(parts) - 1 of them;
# presumably one per TOKEN occurrence in the text -- verify where `parts`
# is built).
start_spaces = start_spaces[len(parts) - 1:]

# Mirror the input path from CONLL_DIR into EDITED_DIR.
path = Path(str(Path(fn).absolute()).replace(CONLL_DIR, EDITED_DIR))
# parents=True creates EDITED_DIR itself when it is missing; exist_ok=True
# replaces the separate exists() check and its check-then-create race.
path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): these statements were recovered from source whose line breaks
# and indentation had been collapsed. The loop headers that enclose them are
# not visible here, so the nesting shown below (in particular where
# `space_idx += 1` sits) is reconstructed from the data flow -- confirm it
# against the enclosing loops before merging.

# Tail of the sentence-text rebuild: append TOKEN, then the last part, and
# store the result in the metadata.
text += TOKEN
text += parts[-1]
meta['text'] = text

# (insert position, range token) pairs, applied once the scan is finished.
multi_tokens = []
# Position in end_spaces; advanced once per TOKEN token of this sentence.
space_idx = 0
for tok_idx, tok in enumerate(sent):
    id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
    # TOKEN embedded in a longer form, or sitting on a range ("a-b") line,
    # is treated as a sign that the file has been processed before.
    if TOKEN in form and ('-' in id_ or form != TOKEN):
        raise ValueError('ERROR: Already edited?')
    if form == TOKEN:
        # `tok_idx` must be non-zero: the first token has no predecessor
        # to be glued to.
        if tok_idx and not end_spaces[space_idx]:
            prev_tok = sent[tok_idx - 1]
            prev_tok['MISC']['SpaceAfter'] = 'No'
            # Range token covering the preceding token plus TOKEN.
            multi_token = Conllu.from_sentence(
                [prev_tok['FORM'] + form])[0]
            multi_token['ID'] = '{}-{}'.format(prev_tok['ID'], id_)
            multi_token['MISC'] = deepcopy(misc)
            # The range line goes in front of the preceding token.
            multi_tokens.append((tok_idx - 1, multi_token))
        space_idx += 1

# Insert back to front so the recorded positions are not shifted by
# earlier insertions.
for idx, tok in reversed(multi_tokens):
    sent.insert(idx, tok)

# Discard the flags used by this sentence (len(parts) - 1 of them;
# presumably one per TOKEN occurrence in the text -- verify where `parts`
# is built).
end_spaces = end_spaces[len(parts) - 1:]

# Mirror the input path from CONLL_DIR into EDITED_DIR.
path = Path(str(Path(fn).absolute()).replace(CONLL_DIR, EDITED_DIR))
# parents=True creates EDITED_DIR itself when it is missing; exist_ok=True
# replaces the separate exists() check and its check-then-create race.
path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the line below holds several statements whose line breaks and
# indentation were lost; they must be restored before the code can run.
# In order, it:
#   * closes an expression that, for every (x, y) in `tpl`, replaces
#     ENDING_TPL with the optional group (?:{re_end_})?, rewrites HYPHEN as
#     ${HYPHEN}^, appends '$', splits the result on HYPHEN and pairs the
#     pieces with y;
#   * builds re_cons_ / re_diss_ with get_re(conjoints) / get_re(disjoints);
#   * compiles them into `rex`, a list of tuples
#     (compiled x[0][0], compiled x[0][1], len(x[1]), flag) where flag is
#     False for re_cons_ entries and True for re_diss_ entries;
#   * creates hyphen_tok from Conllu.from_sentence(['-']) and sets its
#     MISC['SpaceAfter'] to 'Yes';
#   * starts a loop over the files matching CONLL_DIR + '/*/*.txt', loading
#     each into `corpus`, resetting end_spaces, and walking every token of
#     every sentence; the final `if '-' in id_:` has its body further on.
# NOTE(review): glob's recursive=True only affects patterns that contain
# '**'; this pattern has none -- confirm whether deeper nesting was intended.
f'{ENDING_TPL}{HYPHEN}', f'(?:{re_end_})?{HYPHEN}').replace( ENDING_TPL, f'(?:{re_end_})?').replace(HYPHEN, rf'${HYPHEN}^') + '$').split(HYPHEN), y) for x, y in tpl ] re_cons_ = get_re(conjoints) re_diss_ = get_re(disjoints) #print(re_cons_) #print(re_diss_) rex = [(re.compile(x[0][0]), re.compile(x[0][1]), len(x[1]), False) for x in re_cons_] \ + [(re.compile(x[0][0]), re.compile(x[0][1]), len(x[1]), True) for x in re_diss_] hyphen_tok = Conllu.from_sentence(['-'])[0] hyphen_tok['MISC']['SpaceAfter'] = 'Yes' for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True): print(fn) corpus = list(Conllu.load(fn)) end_spaces = [] for sentence in corpus: sent, meta = sentence sub_tokens = [] multi_end_id = None for tok_idx, tok in enumerate(sent): id_, form, misc = tok['ID'], tok['FORM'], tok['MISC'] if '-' in id_: