Exemplo n.º 1
0
def _set_align(rsc_src: Tuple[Aligner, dict, Dict[str, int]],
               entries: List[Entry]):
    """
    Align each entry's raw text with its left and right morpheme analyses.

    Entries whose `is_sharp` flag is set are skipped. For every other entry
    the results of `align_patch` are stored in `entry.left_align` and
    `entry.right_align`. An entry that fails to align, or whose two alignments
    differ in length, gets an `err_msg` and is collected; the collected
    entries are passed to `print_errors` once all entries have been visited.

    Args:
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        entries:  list of entries; updated in place
    """
    bad_entries = []
    for entry in entries:
        if entry.is_sharp:
            continue
        entry.left_align = align_patch(rsc_src, entry.raw,
                                       Morph.to_str(entry.left))
        if not entry.left_align:
            entry.err_msg = 'fail to align left'
            bad_entries.append(entry)
            continue
        entry.right_align = align_patch(rsc_src, entry.raw,
                                        Morph.to_str(entry.right))
        if not entry.right_align:
            entry.err_msg = 'fail to align right'
            bad_entries.append(entry)
            continue
        # Checked explicitly rather than with `assert`, which is removed when
        # Python runs with -O and would let a mismatched pair pass unnoticed.
        # The entry is reported through the same channel as the other
        # alignment failures.
        if len(entry.left_align) != len(entry.right_align):
            entry.err_msg = 'left/right align length mismatch'
            bad_entries.append(entry)
    print_errors(bad_entries)
Exemplo n.º 2
0
def run(args: Namespace):
    """
    Filter patch lines read from standard input, echoing only the valid ones.

    Empty lines and lines starting with '#' are ignored. Every other line must
    hold exactly three tab-separated fields: raw text, left analysis and right
    analysis. A line is printed to standard output only if
      - both analyses can be aligned with the raw text by `align_patch`,
      - the two alignments have the same length, and
      - over the sentences yielded by `_sent_iter`, `_cnt_pos_neg` reports at
        least one positive and no negative count.
    Every rejected line is reported through `logging.info` and skipped.

    The process exits with status 1 if the restore dictionary loads as empty.

    Args:
        args:  program arguments; `rsc_src`, `lib_path` and `rsc_dir` are read
               here and the whole namespace is passed on to `_sent_iter`
    """
    aligner = Aligner(args.rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)
    # The resource triple is identical for every line, so build it once.
    rsc_src = (aligner, restore_dic, vocab_out)

    khaiii_api = KhaiiiApi(args.lib_path, args.rsc_dir, '{"errpatch": false}')

    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.rstrip('\r\n')
        if not line or line[0] == '#':
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # A malformed line is reported and skipped like any other invalid
            # line instead of aborting the run with an unpacking ValueError.
            logging.info('invalid %d-th line: field count: %s', line_num, line)
            continue
        raw, left, right = fields
        left_align = align_patch(rsc_src, raw, left)
        if not left_align:
            logging.info('invalid %d-th line: left align: %s', line_num, line)
            continue
        right_align = align_patch(rsc_src, raw, right)
        if not right_align:
            logging.info('invalid %d-th line: right align: %s', line_num, line)
            continue
        if len(left_align) != len(right_align):
            logging.info('invalid %d-th line: left/right diff: %s', line_num,
                         line)
            continue
        pos_cnt = 0
        neg_cnt = 0
        for sent in _sent_iter(args):
            pos_cnt_sent, neg_cnt_sent = _cnt_pos_neg(
                khaiii_api, raw, (left_align, right_align), rsc_src, sent)
            pos_cnt += pos_cnt_sent
            neg_cnt += neg_cnt_sent
            # A single negative count already rejects the line, so the
            # remaining sentences need not be examined.
            if neg_cnt > 0:
                break
        if neg_cnt > 0 or pos_cnt == 0:
            logging.info('invalid %d-th line: +%d, -%d: %s', line_num, pos_cnt,
                         neg_cnt, line)
            continue
        print('{}\t{}\t{}'.format(raw, left, right))