def run(args: Namespace):
    """
    Entry point: align every word of every input sentence and print the result.

    Sentences come from ``sent_iter(args)``. When ``args.sample`` lies strictly
    between 0.0 and 1.0, each sentence is kept only if a fresh
    ``random.random()`` draw is below ``args.sample``; any other value keeps
    every sentence. Words that fail to align get an empty mapping and, when
    ``args.unmapped`` is set, the alignment error is written to that file.

    Args:
        args: program arguments (reads ``rsc_src``, ``unmapped`` and ``sample``
            here; also forwarded to ``sent_iter`` and ``_print_restore_dic``)
    """
    aligner = Aligner(args.rsc_src)
    funmap = open(args.unmapped, 'w', encoding='UTF-8') if args.unmapped else None
    try:
        for sent in sent_iter(args):
            # Random down-sampling, only active for 0.0 < sample < 1.0.
            if 0.0 < args.sample < 1.0 and random.random() >= args.sample:
                continue
            word_per_maps = []
            for word in sent.words:
                try:
                    maps = aligner.align(word)
                except AlignError as algn_err:
                    if funmap:
                        algn_err.add_msg(str(word))
                        print(algn_err, file=funmap)
                    # A word that cannot be aligned still occupies its slot.
                    maps = []
                word_per_maps.append(maps)
            _print_sent(sent, word_per_maps)
    finally:
        # Close the unmapped-words file even if the loop raises, so that
        # buffered error lines are flushed and the handle is not leaked.
        if funmap:
            funmap.close()
    _print_restore_dic(args)
    aligner.print_middle_cnt()
def run(args: Namespace):
    """
    Entry point: validate patch lines read from stdin and print the valid ones.

    Each non-empty, non-``#`` line must consist of three tab-separated fields
    (raw, left, right). A line is echoed to stdout only when both sides can be
    aligned to the same length and, summed over the sentences yielded by
    ``_sent_iter(args)``, ``_cnt_pos_neg`` reports at least one positive and no
    negative count. Every rejected line is logged at INFO level with its line
    number and the reason.

    Args:
        args: program arguments (reads ``rsc_src``, ``lib_path`` and
            ``rsc_dir`` here; also forwarded to ``_sent_iter``)
    """
    aligner = Aligner(args.rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(args.rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(args.rsc_src)
    # The same resource triple is handed to every alignment/counting call.
    rsc = (aligner, restore_dic, vocab_out)
    # NOTE(review): the option string presumably turns the error-patch stage
    # off inside the analyzer while patches are being validated -- confirm.
    khaiii_api = KhaiiiApi(args.lib_path, args.rsc_dir, '{"errpatch": false}')

    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.rstrip('\r\n')
        if not line or line[0] == '#':
            continue
        fields = line.split('\t')
        if len(fields) != 3:
            # Malformed lines are reported and skipped like any other invalid
            # line instead of aborting the whole run with a ValueError.
            logging.info('invalid %d-th line: field count: %s', line_num, line)
            continue
        raw, left, right = fields
        left_align = align_patch(rsc, raw, left)
        if not left_align:
            logging.info('invalid %d-th line: left align: %s', line_num, line)
            continue
        right_align = align_patch(rsc, raw, right)
        if not right_align:
            logging.info('invalid %d-th line: right align: %s', line_num, line)
            continue
        if len(left_align) != len(right_align):
            logging.info('invalid %d-th line: left/right diff: %s', line_num, line)
            continue
        pos_cnt = 0
        neg_cnt = 0
        for sent in _sent_iter(args):
            pos_cnt_sent, neg_cnt_sent = _cnt_pos_neg(
                khaiii_api, raw, (left_align, right_align), rsc, sent)
            pos_cnt += pos_cnt_sent
            neg_cnt += neg_cnt_sent
            # One negative count already rejects the line; stop scanning.
            if neg_cnt > 0:
                break
        if neg_cnt > 0 or pos_cnt == 0:
            logging.info('invalid %d-th line: +%d, -%d: %s', line_num, pos_cnt, neg_cnt, line)
            continue
        print('{}\t{}\t{}'.format(raw, left, right))
def run(args: Namespace):
    """
    Entry point: load the entries, align them and save them as a trie.

    Exits with status 1 when the restore dictionary cannot be loaded and with
    status 2 when there is no entry to compile.

    Args:
        args: program arguments (reads ``rsc_src`` and ``rsc_dir`` here; also
            forwarded to ``_load_entries``)
    """
    rsc_src = args.rsc_src
    aligner = Aligner(rsc_src)
    restore_dic = load_restore_dic('{}/restore.dic'.format(rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(rsc_src)

    loaded = _load_entries(args)
    if not loaded:
        logging.error('no entry to compile')
        sys.exit(2)
    _check_dup(loaded)

    # Entries that were commented out are excluded from alignment and output.
    active = list(filter(lambda ent: not ent.is_sharp, loaded))
    resources = (aligner, restore_dic, vocab_out)
    _set_align(resources, active)
    _save_trie(args.rsc_dir, active)
def run(args: Namespace):
    """
    Entry point: load the entries, align and tag them, then save them as a trie.

    Exits with status 1 when the restore dictionary cannot be parsed.

    Args:
        args: program arguments (reads ``rsc_src`` and ``rsc_dir`` here; also
            forwarded to ``_load_entries``)
    """
    rsc_src = args.rsc_src
    aligner = Aligner(rsc_src)
    restore_dic = parse_restore_dic('{}/restore.dic'.format(rsc_src))
    if not restore_dic:
        sys.exit(1)
    vocab_out = load_vocab_out(rsc_src)

    # Containers handed to _set_tag_out and afterwards to append_new_entries;
    # presumably they collect entries missing from the existing resources.
    restore_new = defaultdict(dict)
    vocab_new = dict()

    loaded = _load_entries(args)
    _check_dup(loaded)
    # Entries that were commented out are excluded from every later step.
    active = list(filter(lambda ent: not ent.is_sharp, loaded))

    _set_align(aligner, sejong_corpus.Word, active)
    _set_tag_out(restore_dic, restore_new, vocab_out, vocab_new, active)
    append_new_entries(rsc_src, restore_new, vocab_new)
    _save_trie(args.rsc_dir, active)
def _set_align(aligner: Aligner, Word: type, entries: List[Entry]):    # pylint: disable=invalid-name
    """
    Align the syllables of each entry with its morphological analysis result.

    The alignment is stored on ``entry.align``. Entries that fail to align get
    ``err_msg`` set, the error is logged, and all such entries are finally
    passed to ``print_errors``. Entries flagged ``is_sharp`` are skipped.

    Args:
        aligner: Aligner object
        Word: Word type (its ``parse`` is used to build the word to align)
        entries: list of entries
    """
    failed = []
    for item in (ent for ent in entries if not ent.is_sharp):
        # Three tab-separated fields: an empty first field, the surface word
        # and the morphemes rendered as a string.
        fields = ['', item.word, Morph.to_str(item.morphs)]
        parsed = Word.parse('\t'.join(fields), '', 0)
        try:
            item.align = aligner.align(parsed)
        except AlignError as err:
            item.err_msg = 'fail to align'
            logging.error(err)
            failed.append(item)
    print_errors(failed)