def trans_symbol(symbol, prev_term, next_term):
    """Return the reading of ``symbol`` selected by its neighbouring terms.

    A member of ``count_symbols`` always maps to the ``count_sym_han`` entry
    at the same index, whatever the neighbours are. Any other symbol is
    resolved through its index in ``symbols``:

    * ``''`` when ``prev_term`` is in ``puncs``;
    * the ``sym_han`` entry when either neighbour is hangul or all digits;
    * the ``sym_pro`` entry when either neighbour satisfies ``real_latin``;
    * ``''`` otherwise.

    Raises:
        ValueError: propagated from ``symbols.index`` when a lookup is needed
            and ``symbol`` is not in ``symbols``.
    """
    if symbol in count_symbols:
        return count_sym_han[count_symbols.index(symbol)]

    # A symbol that directly follows punctuation is not read at all.
    if prev_term in puncs:
        return ''

    is_hangul = hgtk.checker.is_hangul
    if (is_hangul(prev_term) or is_hangul(next_term)
            or prev_term.isdigit() or next_term.isdigit()):
        readings = sym_han
    elif real_latin(prev_term) or real_latin(next_term):
        readings = sym_pro
    else:
        return ''

    # The index lookup happens only once a table has been chosen, so a symbol
    # missing from ``symbols`` still yields '' when no context matches.
    return readings[symbols.index(symbol)]
def trans_number(n, prev_term, next_term):
    """Pick a reading for the number ``n`` from the terms around it.

    Dispatch, in order:

    1. Both neighbours are hangul -> ``readNumberKor(n, next_term)``.
    2. Either neighbour satisfies ``real_latin`` ->
       ``readNumberKor(n, next_term)`` if ``next_term`` is hangul and
       ``n > 10``, otherwise ``readNumberEng(n)``.
    3. Anything else (original author's note: "maybe hanja") ->
       ``readOnlyNum(n)`` if either neighbour is in ``symbols``,
       ``readBigNum(n)`` if ``n > 99999``, otherwise ``readNumber(n)``.

    ``n`` must support comparison with integers; what the reader helpers
    return is not visible from this function.
    """
    is_hangul = hgtk.checker.is_hangul

    if is_hangul(prev_term) and is_hangul(next_term):
        return readNumberKor(n, next_term)

    if real_latin(prev_term) or real_latin(next_term):
        korean_follows = is_hangul(next_term) and n > 10
        return readNumberKor(n, next_term) if korean_follows else readNumberEng(n)

    # Neither pure-hangul nor latin surroundings ("maybe hanja").
    if prev_term in symbols or next_term in symbols:
        return readOnlyNum(n)
    return readBigNum(n) if n > 99999 else readNumber(n)
def leftword(chunks):
    """Clean up the remaining terms of ``chunks`` in place and return it.

    ``chunks`` is a sequence of mutable sequences of terms. For every term:

    * if ``real_latin(term)`` holds, it is replaced by ``read_acronym(term)``;
    * otherwise, if it is neither hangul nor a member of ``puncs``, it is
      replaced by the empty string;
    * hangul and punctuation terms are left untouched.

    The same ``chunks`` object that was passed in is returned.
    """
    for eojeol in chunks:
        for pos, term in enumerate(eojeol):
            if real_latin(term):
                eojeol[pos] = read_acronym(term)
            elif not (hgtk.checker.is_hangul(term) or term in puncs):
                # Anything that is not latin, hangul or punctuation is dropped.
                eojeol[pos] = ''
    return chunks
def trans_eojeol(chunks, chunks_4num, metadata, if_num=True, if_sym=True, if_han=True, if_eng=True, if_puncs=True, if_else=True):
    """Rewrite every term of ``chunks`` in place according to its kind.

    ``chunks`` is a sequence of eojeols, each a mutable sequence of string
    terms. Each term is classified by the first matching rule below and
    replaced as described; the same ``chunks`` object is returned.

    ==================================  ==========  ==========================
    term kind                           flag        replacement when flag set
    ==================================  ==========  ==========================
    decimal number                      if_num      ``trans_number`` (context)
    symbol, not at position (0, 0)      if_sym      ``trans_symbol`` (context)
    hanja                               if_han      ``trans_hanja``
    latin (``real_latin``)              if_eng      ``trans_latin``
    punctuation (in ``puncs``)          if_puncs    kept as is
    hangul                              (none)      always kept as is
    anything else                       if_else     kept as is
    ==================================  ==========  ==========================

    When the flag is false the term is kept unchanged, except for punctuation
    and "anything else", which are replaced by ``''``.

    Args:
        chunks: terms to rewrite; mutated in place.
        chunks_4num: forwarded unchanged to ``decide_context`` together with
            the current eojeol and the ``(i, j)`` position.
        metadata: accepted for interface compatibility; not used here.
        if_num, if_sym, if_han, if_eng, if_puncs, if_else: per-kind switches,
            see the table above.

    Returns:
        ``chunks`` itself, after modification.
    """
    for i, eojeol in enumerate(chunks):
        for j, term in enumerate(eojeol):
            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as superscript or circled digits that int() rejects, which
            # made int(term) raise ValueError. Such terms now fall through to
            # the later branches instead.
            if term.isdecimal():
                if if_num:
                    number = int(term)
                    x, y = decide_context(number, chunks_4num, eojeol, i, j)
                    eojeol[j] = trans_number(number, x, y)  # reflects context
            elif i + j > 0 and (term in symbols or term in count_symbols):
                # Symbols are only read when they are not sentence-first.
                if if_sym:
                    x, y = decide_context(term, chunks_4num, eojeol, i, j)
                    eojeol[j] = trans_symbol(term, x, y)
            elif hgtk.checker.is_hanja(term):
                if if_han:
                    eojeol[j] = trans_hanja(term)
            elif real_latin(term):
                if if_eng:
                    eojeol[j] = trans_latin(term)  # transliteration (or bypass)
            elif term in puncs:
                # Punctuation is bypassed by default, removed when disabled.
                if not if_puncs:
                    eojeol[j] = ''
            elif hgtk.checker.is_hangul(term):
                pass  # hangul is always kept unchanged
            elif not if_else:
                # Unclassified terms are bypassed by default, removable on request.
                eojeol[j] = ''
    return chunks