Пример #1
0
def _count_error(args: Namespace, doc_path: str) -> Counter:
    """
    Count the differences between khaiii's analysis and the corpus annotation
    of a single document.

    Args:
        args:  program arguments (``lib_path`` and ``rsc_dir`` are read when
               the shared API object has to be created)
        doc_path:  document path
    Returns:
        counter of error patch candidates, keyed by
        (raw text, wrong analysis, correct analysis) triples
    Raises:
        AssertionError:  when an analyzed word and the corpus word at the same
                         position do not share the same raw text
    """
    global _KHAIII  # pylint: disable=global-statement
    if not _KHAIII:
        # create the analyzer on first use and keep it for later calls
        _KHAIII = KhaiiiApi(args.lib_path, args.rsc_dir)

    cnt = Counter()
    logging.info(doc_path)
    # the context manager closes the corpus file even if an assertion or a
    # parsing error interrupts the loop (the file used to be left open)
    with open(doc_path, 'r', encoding='UTF-8') as fin:
        for sent in sejong_corpus.sents(fin):
            raw_sent = sent.raw_str()
            # the three lists are built in lockstep: one entry per morpheme or
            # delimiter, each starting with the sentence delimiter
            result_morphs = [SENT_DELIM_STR]
            corpus_morphs = [SENT_DELIM_STR]
            alignments = [(0, 0)]
            for result_word, corpus_word in zip(_KHAIII.analyze(raw_sent, ''), sent.words):
                assert result_word.lex == corpus_word.raw, \
                    '{}: "{}" != "{}"'.format(os.path.basename(doc_path), result_word,
                                              corpus_word)
                if len(result_morphs) > 1:
                    # not the first word: insert a word delimiter whose span is
                    # one position wide, right after the previous entry
                    result_morphs.append(WORD_DELIM_STR)
                    corpus_morphs.append(WORD_DELIM_STR)
                    prev_end = alignments[-1][1]
                    alignments.append((prev_end, prev_end + 1))
                result_morphs.extend('{}/{}'.format(m.lex, m.tag) for m in result_word.morphs)
                corpus_morphs.extend(str(m) for m in corpus_word.morphs)
                # (begin, end) span of every analyzed morpheme
                alignments.extend((m.begin, m.begin + m.length) for m in result_word.morphs)
            result_morphs.append(SENT_DELIM_STR)
            corpus_morphs.append(SENT_DELIM_STR)
            # closing sentence delimiter gets an empty span at the last end position
            alignments.append((alignments[-1][1], alignments[-1][1]))
            if result_morphs != corpus_morphs:
                diff_sgmts = _get_diff_sgmts(result_morphs, corpus_morphs, raw_sent, alignments)
                for origin, left, right in diff_sgmts:
                    cnt[origin, ' + '.join(left), ' + '.join(right)] += 1
    return cnt
Пример #2
0
def _analyze_sent(khaiii_api: KhaiiiApi, raw_sent: str) -> List[int]:
    """
    Get the per-syllable tagging result of a raw sentence before any error
    patch is applied.

    Args:
        khaiii_api:  khaiii API object
        raw_sent:  raw sentence
    Returns:
        list of output tag numbers
    """
    output_tags = khaiii_api.analyze_bfr_errpatch(raw_sent, '')
    # leave a trace of the raw tagger output for debugging
    logging.debug(output_tags)
    return output_tags
Пример #3
0
def run(args: Namespace):
    """
    Validate error patch candidates read from standard input.

    Every non-empty line that does not start with '#' must hold three
    tab-separated fields: raw text, left (wrong analysis) and right (correct
    analysis). A patch is written back to standard output only if both sides
    can be aligned with the raw text, the alignments have equal length, and
    over the sentences from ``_sent_iter`` it has at least one positive hit and
    no negative hit. Rejected lines are reported through ``logging.info``.

    Args:
        args:  program arguments (``rsc_src``, ``lib_path`` and ``rsc_dir`` are
               read here; the whole object is handed to ``_sent_iter``)
    """
    aligner = Aligner(args.rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)
    # resource triple shared by every alignment call below
    rsc_src = (aligner, restore_dic, vocab_out)

    # patches must be checked against the analyzer output with patching disabled
    khaiii_api = KhaiiiApi(args.lib_path, args.rsc_dir, '{"errpatch": false}')

    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.rstrip('\r\n')
        if not line or line[0] == '#':
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # a malformed line is reported and skipped like any other invalid
            # patch instead of aborting the whole run with a ValueError
            logging.info('invalid %d-th line: field count: %s', line_num, line)
            continue
        raw, left, right = fields
        left_align = align_patch(rsc_src, raw, left)
        if not left_align:
            logging.info('invalid %d-th line: left align: %s', line_num, line)
            continue
        right_align = align_patch(rsc_src, raw, right)
        if not right_align:
            logging.info('invalid %d-th line: right align: %s', line_num, line)
            continue
        if len(left_align) != len(right_align):
            logging.info('invalid %d-th line: left/right diff: %s', line_num, line)
            continue
        pos_cnt = 0
        neg_cnt = 0
        for sent in _sent_iter(args):
            pos_cnt_sent, neg_cnt_sent = _cnt_pos_neg(khaiii_api, raw, (left_align, right_align),
                                                      rsc_src, sent)
            pos_cnt += pos_cnt_sent
            neg_cnt += neg_cnt_sent
            if neg_cnt > 0:
                # a single negative hit already rejects the patch
                break
        if neg_cnt > 0 or pos_cnt == 0:
            logging.info('invalid %d-th line: +%d, -%d: %s', line_num, pos_cnt, neg_cnt, line)
            continue
        print('{}\t{}\t{}'.format(raw, left, right))
Пример #4
0
def _cnt_pos_neg(khaiii_api: KhaiiiApi, patch_raw: str,
                 alignment: Tuple[list, list], rsc_src: Tuple[Aligner, dict,
                                                              Dict[str, str]],
                 sent: Sentence) -> Tuple[int, int]:
    """
    Count how often the patch turns a wrong analysis into the correct one, and
    how often it turns a wrong analysis into another wrong one.

    Args:
        khaiii_api:  khaiii API object
        patch_raw:  raw part of patch
        alignment:  (left, right) alignment pair
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        sent:  Sentence object
    Returns:
        number of wrong -> correct replacements
        number of wrong -> wrong replacements
    """
    raw_sent = sent.raw_str()
    if patch_raw not in raw_sent:
        # skip when the raw text of the patch does not occur in the sentence
        return 0, 0
    aligner, restore_dic, vocab_out = rsc_src
    sent_align = _align_sent((aligner, restore_dic, vocab_out), sent)
    if not sent_align:
        # skip when the corpus annotation cannot be aligned with the raw sentence
        return 0, 0
    left_align, right_align = alignment
    left_needle = mix_char_tag(patch_raw, left_align)
    sent_anal = khaiii_api.analyze_bfr_errpatch(raw_sent, '')
    sent_haystack = mix_char_tag(raw_sent, sent_anal)
    needle_len = len(left_needle)
    pos_cnt = 0
    neg_cnt = 0
    # number of leading haystack items already removed; `found` is relative to
    # the shrunken haystack while `sent_align` is never shortened, so the two
    # must be added to index the corpus alignment correctly
    consumed = 0
    found = _find_list(sent_haystack, left_needle)
    while found >= 0:
        # the wrong (left) sequence of the patch occurs in the analysis result:
        # compare the correct (right) sequence with the corpus at that position
        begin = consumed + found
        right_corpus = sent_align[begin:begin + needle_len]
        if right_align == right_corpus:
            pos_cnt += 1
        else:
            neg_cnt += 1
        # continue searching right after this match
        del sent_haystack[:found + needle_len]
        consumed = begin + needle_len
        found = _find_list(sent_haystack, left_needle)
    return pos_cnt, neg_cnt