def _count_error(args: Namespace, doc_path: str) -> Counter:
    """
    Count the differences between the analyzer output and the corpus annotation of one document.
    Args:
        args:  program arguments (lib_path and rsc_dir are used to create the analyzer lazily)
        doc_path:  document path
    Returns:
        counter of error patch candidates, keyed by
        (raw text, wrong analysis, correct analysis) triples
    """
    global _KHAIII    # pylint: disable=global-statement
    if not _KHAIII:
        # the analyzer is created on first use and then shared through the module global
        _KHAIII = KhaiiiApi(args.lib_path, args.rsc_dir)

    cnt = Counter()
    logging.info(doc_path)
    # open the corpus inside a context manager so that the file handle is always released
    with open(doc_path, 'r', encoding='UTF-8') as fin:
        for sent in sejong_corpus.sents(fin):
            raw_sent = sent.raw_str()
            # the three lists below grow in lock step with the analyzer output:
            # every entry of result_morphs has a (begin, end) pair in alignments
            result_morphs = [SENT_DELIM_STR, ]
            corpus_morphs = [SENT_DELIM_STR, ]
            alignments = [(0, 0), ]
            for result_word, corpus_word in zip(_KHAIII.analyze(raw_sent, ''), sent.words):
                assert result_word.lex == corpus_word.raw, \
                    '{}: "{}" != "{}"'.format(os.path.basename(doc_path), result_word,
                                              corpus_word)
                if len(result_morphs) > 1:
                    # not the first word: insert a word delimiter one position wide
                    result_morphs.append(WORD_DELIM_STR)
                    corpus_morphs.append(WORD_DELIM_STR)
                    alignments.append((alignments[-1][1], alignments[-1][1] + 1))
                result_morphs.extend(
                    ['{}/{}'.format(m.lex, m.tag) for m in result_word.morphs])
                corpus_morphs.extend([str(m) for m in corpus_word.morphs])
                alignments.extend(
                    [(m.begin, m.begin + m.length) for m in result_word.morphs])
            result_morphs.append(SENT_DELIM_STR)
            corpus_morphs.append(SENT_DELIM_STR)
            # the closing sentence delimiter is zero-width, placed at the end of the last entry
            alignments.append((alignments[-1][1], alignments[-1][1]))
            if result_morphs != corpus_morphs:
                diff_sgmts = _get_diff_sgmts(result_morphs, corpus_morphs, raw_sent, alignments)
                for origin, left, right in diff_sgmts:
                    cnt[origin, ' + '.join(left), ' + '.join(right)] += 1
    return cnt
def _analyze_sent(khaiii_api: KhaiiiApi, raw_sent: str) -> List[int]:
    """
    Get the per-syllable tagging result of a raw sentence without applying error patches.
    Args:
        khaiii_api:  khaiii API object
        raw_sent:  raw sentence
    Returns:
        list of output tag numbers
    """
    output_tags = khaiii_api.analyze_bfr_errpatch(raw_sent, '')
    logging.debug(output_tags)
    return output_tags
def run(args: Namespace):
    """
    Read candidate error patches from stdin and print only those that pass validation.

    Every non-empty line that does not start with '#' must hold three tab-separated
    fields (raw, left, right). A patch is printed to stdout only when both sides can be
    aligned against the raw text with the same length, and scanning the sentences from
    _sent_iter(args) yields at least one positive and no negative count. Every rejected
    line is reported through logging.info and skipped. The process exits with status 1
    when the restore dictionary comes back empty.
    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)
    # resource triple shared by every alignment call below
    rsc_src = (aligner, restore_dic, vocab_out)
    khaiii_api = KhaiiiApi(args.lib_path, args.rsc_dir, '{"errpatch": false}')

    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.rstrip('\r\n')
        if not line or line[0] == '#':
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # a malformed line is skipped like any other invalid patch instead of
            # aborting the whole run with a ValueError from tuple unpacking
            logging.info('invalid %d-th line: field count: %s', line_num, line)
            continue
        raw, left, right = fields
        left_align = align_patch(rsc_src, raw, left)
        if not left_align:
            logging.info('invalid %d-th line: left align: %s', line_num, line)
            continue
        right_align = align_patch(rsc_src, raw, right)
        if not right_align:
            logging.info('invalid %d-th line: right align: %s', line_num, line)
            continue
        if len(left_align) != len(right_align):
            logging.info('invalid %d-th line: left/right diff: %s', line_num, line)
            continue

        pos_cnt = 0
        neg_cnt = 0
        for sent in _sent_iter(args):
            pos_cnt_sent, neg_cnt_sent = _cnt_pos_neg(
                khaiii_api, raw, (left_align, right_align), rsc_src, sent)
            pos_cnt += pos_cnt_sent
            neg_cnt += neg_cnt_sent
            if neg_cnt > 0:
                # a single negative case already rejects the patch; stop scanning
                break
        if neg_cnt > 0 or pos_cnt == 0:
            logging.info('invalid %d-th line: +%d, -%d: %s', line_num, pos_cnt, neg_cnt, line)
            continue
        print('{}\t{}\t{}'.format(raw, left, right))
def _cnt_pos_neg(khaiii_api: KhaiiiApi, patch_raw: str, alignment: Tuple[list, list],
                 rsc_src: Tuple[Aligner, dict, Dict[str, str]], sent: Sentence) -> Tuple[int, int]:
    """
    Count how often the patch turns a wrong analysis into the correct one,
    and how often it turns a wrong analysis into another wrong analysis.
    Args:
        khaiii_api:  khaiii API object
        patch_raw:  raw part of patch
        alignment:  (left, right) alignment pair
        rsc_src:  (Aligner, restore dic, vocab out) resource triple
        sent:  Sentence object
    Returns:
        number of wrong -> correct cases
        number of wrong -> wrong cases
    """
    raw_sent = sent.raw_str()
    if patch_raw not in raw_sent:
        # skip when the raw text of the patch does not occur in the sentence
        return 0, 0
    aligner, restore_dic, vocab_out = rsc_src
    sent_align = _align_sent((aligner, restore_dic, vocab_out), sent)
    if not sent_align:
        # skip when the corpus annotation could not be aligned with the raw text
        return 0, 0
    left_align, right_align = alignment
    left_needle = mix_char_tag(patch_raw, left_align)
    sent_anal = khaiii_api.analyze_bfr_errpatch(raw_sent, '')
    sent_haystack = mix_char_tag(raw_sent, sent_anal)
    needle_len = len(left_needle)
    pos_cnt = 0
    neg_cnt = 0
    # number of leading haystack entries already removed; _find_list reports positions
    # relative to the trimmed haystack while sent_align keeps its original indexing
    consumed = 0
    found = _find_list(sent_haystack, left_needle)
    while found >= 0:
        # the left (wrong) sequence of the patch appears in the analysis result:
        # compare the right (correct) sequence with the corpus at the same position
        begin = consumed + found
        end = begin + needle_len
        right_corpus = sent_align[begin:end]
        if right_align == right_corpus:
            pos_cnt += 1
        else:
            neg_cnt += 1
        del sent_haystack[:found + needle_len]
        consumed = end
        found = _find_list(sent_haystack, left_needle)
    return pos_cnt, neg_cnt