Пример #1
0
                    text += ' '
                text += part.lstrip()
            meta['text'] = text

        # Build range ("multi") tokens that glue a stand-alone TOKEN to the
        # token that follows it. New tokens are collected as
        # (insert position, token) pairs and inserted only after the scan,
        # so `sent` is not mutated while it is being enumerated.
        multi_tokens = []
        # Index into start_spaces: advanced once per TOKEN seen in this
        # sentence.
        space_idx = 0
        for tok_idx, tok in enumerate(sent):
            id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
            # TOKEN embedded in a longer form, or sitting in a token whose
            # ID already contains '-', is treated as a sign that the file
            # has been processed before.
            if TOKEN in form and ('-' in id_ or form != TOKEN):
                raise ValueError('ERROR: Already edited?')

            if form == TOKEN:
                # Only for a non-first token whose start_spaces flag is
                # falsy.
                # NOTE(review): the guard checks only that tok_idx is
                # non-zero; sent[tok_idx + 1] raises IndexError if TOKEN is
                # the last token of the sentence -- confirm this cannot
                # happen for the input data.
                if tok_idx and not start_spaces[space_idx]:
                    next_tok = sent[tok_idx + 1]
                    tok['MISC']['SpaceAfter'] = 'No'
                    # New token whose FORM is TOKEN + next form and whose ID
                    # is the range '<id>-<next id>'; MISC is copied from the
                    # following token.
                    multi_token = Conllu.from_sentence(
                        [form + next_tok['FORM']])[0]
                    multi_token['ID'] = '{}-{}'.format(id_, next_tok['ID'])
                    multi_token['MISC'] = deepcopy(next_tok['MISC'])
                    multi_tokens.append((tok_idx, multi_token))
                space_idx += 1

        # Insert from the back so that earlier recorded indices stay valid.
        for idx, tok in reversed(multi_tokens):
            sent.insert(idx, tok)

        # Drop the first len(parts) - 1 flags before the next sentence
        # (`parts` is defined above this excerpt; presumably that count is
        # the number of TOKEN occurrences in this sentence -- verify).
        start_spaces = start_spaces[len(parts) - 1:]

    # Map the absolute input path into the output tree by substituting
    # CONLL_DIR with EDITED_DIR, then make sure the target directory exists.
    path = Path(str(Path(fn).absolute()).replace(CONLL_DIR, EDITED_DIR))
    # parents=True: a bare mkdir() fails with FileNotFoundError when more
    # than one directory level is missing (e.g. EDITED_DIR itself).
    # exist_ok=True: replaces the separate exists() check without a race.
    path.parent.mkdir(parents=True, exist_ok=True)
Пример #2
0
                text += TOKEN
            text += parts[-1]
            meta['text'] = text

        # Build range ("multi") tokens that glue a stand-alone TOKEN to the
        # token that precedes it. New tokens are collected as
        # (insert position, token) pairs and inserted only after the scan,
        # so `sent` is not mutated while it is being enumerated.
        multi_tokens = []
        # Index into end_spaces: advanced once per TOKEN seen in this
        # sentence.
        space_idx = 0
        for tok_idx, tok in enumerate(sent):
            id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
            # TOKEN embedded in a longer form, or sitting in a token whose
            # ID already contains '-', is treated as a sign that the file
            # has been processed before.
            if TOKEN in form and ('-' in id_ or form != TOKEN):
                raise ValueError('ERROR: Already edited?')

            if form == TOKEN:
                # Only for a non-first token (so a previous token exists)
                # whose end_spaces flag is falsy.
                if tok_idx and not end_spaces[space_idx]:
                    prev_tok = sent[tok_idx - 1]
                    prev_tok['MISC']['SpaceAfter'] = 'No'
                    # New token whose FORM is previous form + TOKEN and
                    # whose ID is the range '<prev id>-<id>'; MISC is copied
                    # from the TOKEN token itself.
                    multi_token = Conllu.from_sentence(
                        [prev_tok['FORM'] + form])[0]
                    multi_token['ID'] = '{}-{}'.format(prev_tok['ID'], id_)
                    multi_token['MISC'] = deepcopy(misc)
                    # Recorded position is that of the previous token, so
                    # the range token is inserted in front of it.
                    multi_tokens.append((tok_idx - 1, multi_token))
                space_idx += 1

        # Insert from the back so that earlier recorded indices stay valid.
        for idx, tok in reversed(multi_tokens):
            sent.insert(idx, tok)

        # Drop the first len(parts) - 1 flags before the next sentence
        # (`parts` is defined above this excerpt; presumably that count is
        # the number of TOKEN occurrences in this sentence -- verify).
        end_spaces = end_spaces[len(parts) - 1:]

    # Map the absolute input path into the output tree by substituting
    # CONLL_DIR with EDITED_DIR, then make sure the target directory exists.
    path = Path(str(Path(fn).absolute()).replace(CONLL_DIR, EDITED_DIR))
    # parents=True: a bare mkdir() fails with FileNotFoundError when more
    # than one directory level is missing (e.g. EDITED_DIR itself).
    # exist_ok=True: replaces the separate exists() check without a race.
    path.parent.mkdir(parents=True, exist_ok=True)
Пример #3
0
            f'{ENDING_TPL}{HYPHEN}', f'(?:{re_end_})?{HYPHEN}').replace(
                ENDING_TPL, f'(?:{re_end_})?').replace(HYPHEN, rf'${HYPHEN}^')
          + '$').split(HYPHEN), y) for x, y in tpl
    ]


# Pattern sources for the two template groups.
re_cons_ = get_re(conjoints)
re_diss_ = get_re(disjoints)

# One flat table of (first compiled regex, second compiled regex,
# len() of the entry's second field, flag), conjoint entries first with
# flag False, then disjoint entries with flag True.
rex = [
    (re.compile(entry[0][0]), re.compile(entry[0][1]),
     len(entry[1]), is_disjoint)
    for group, is_disjoint in ((re_cons_, False), (re_diss_, True))
    for entry in group
]

# Pre-built '-' token with SpaceAfter explicitly set to 'Yes'.
hyphen_tok = Conllu.from_sentence(['-'])[0]
hyphen_tok['MISC']['SpaceAfter'] = 'Yes'

for fn in glob.glob(CONLL_DIR + '/*/*.txt', recursive=True):
    print(fn)
    corpus = list(Conllu.load(fn))

    end_spaces = []
    for sentence in corpus:
        sent, meta = sentence

        sub_tokens = []
        multi_end_id = None
        for tok_idx, tok in enumerate(sent):
            id_, form, misc = tok['ID'], tok['FORM'], tok['MISC']
            if '-' in id_: