def j2h(text):
    """Merge jamo sequences found in ``text`` into composed characters.

    The input is scanned left to right. At each position a
    lead + vowel + tail triple is preferred, then a lead + vowel pair;
    either one is handed to ``jamo.j2h`` and the result is collected.
    Any character that does not start such a sequence is copied through
    unchanged. The collected pieces are joined into one string.
    """
    pieces = []
    total = len(text)
    pos = 0
    while pos < total:
        remaining = total - pos
        # Length guards come first so membership is only tested on
        # positions that actually exist.
        if (remaining >= 2
                and text[pos] in LEADS
                and text[pos + 1] in VOWELS):
            if remaining >= 3 and text[pos + 2] in TAILS:
                width = 3
            else:
                width = 2
            pieces.append(jamo.j2h(*text[pos:pos + width]))
        else:
            width = 1
            pieces.append(text[pos])
        pos += width
    return "".join(pieces)
def _get_text_from_candidates(candidates): if len(candidates) == 0: return "" elif len(candidates) == 1: return _jamo_char_to_hcj(candidates[0]) else: return j2h(**dict(zip(["lead", "vowel", "tail"], candidates)))
def test_j2h(self):
    """Check ``jamo.j2h`` against hardcoded expectations.

    Two calls are exercised: one with lead, vowel and tail arguments and
    one with only lead and vowel. Each must return the expected
    one-character Hangul string.
    """
    failure_hint = "j2h doesn't work. Hint: it's the same as jamo_to_hangul."
    expectations = (
        (('ㅎ', 'ㅏ', 'ㄴ'), "한"),
        (('ㅎ', 'ㅏ'), "하"),
    )
    for jamo_args, syllable in expectations:
        assert jamo.j2h(*jamo_args) == syllable, failure_hint
def encode(self, text):
    """Encode the words of ``text`` that ``is_maori`` accepts as Hangul.

    The text is first passed through ``self.preprocess`` and split with
    ``TreebankWordTokenizer``. Words rejected by ``is_maori`` are kept
    unchanged. Accepted words are split by ``self.tokenize``; each
    resulting syllable is handled as follows:

    * a syllable containing any character outside ``alphabet`` is kept
      unchanged;
    * a syllable that is itself in ``vowels`` is prefixed with ``'x'``;
    * every character is mapped through ``self.encoder_dict`` and the
      joined result must unpack into exactly a consonant and a vowel,
      which are combined with ``jamo.j2h``.

    Args:
        text: The input string to encode.

    Returns:
        The processed words re-joined by ``TreebankWordDetokenizer``.

    Raises:
        KeyError: A character of a syllable is missing from
            ``self.encoder_dict``. The error is logged and the original
            exception (carrying the offending key) is re-raised.
    """
    # Function-scope import, done once instead of once per word.
    from reo_toolkit import is_maori

    text = self.preprocess(text, vowel_type=self.vowel_type)
    words = []
    for word in TreebankWordTokenizer().tokenize(text):
        if not is_maori(word):
            words.append(word)
            continue

        encoded_text = []
        for syllable in self.tokenize(word):
            if not all(ch in alphabet for ch in syllable):
                encoded_text.append(syllable)
                continue

            # A bare vowel gets an 'x' prefix so it still yields two
            # encoded parts (presumably 'x' maps to a silent consonant
            # in encoder_dict -- TODO confirm).
            phoneme = 'x' + syllable if syllable in vowels else syllable

            try:
                consonant, vowel = ''.join(
                    [self.encoder_dict[ch] for ch in phoneme])
            except KeyError:
                logging.error(
                    "KeyError: phoneme {} not in encoder_dict".format(
                        phoneme))
                # Bare raise keeps the original KeyError and its key.
                raise

            try:
                encoded = jamo.j2h(consonant, vowel)
            except jamo.InvalidJamoError:
                logging.error(
                    'InvalidJamoError - Consonant={} Vowel={} Syllable={}'.
                    format(consonant, vowel, phoneme))
                # Fall back to the untouched syllable. Previously
                # ``encoded`` was either unbound here or still held the
                # previous syllable's value.
                encoded = syllable
            encoded_text.append(encoded)

        words.append(''.join(encoded_text))
    return TreebankWordDetokenizer().detokenize(words)
def get_text_from_candidates(candidates):
    """Build text from a sequence of candidate jamo.

    With two or more candidates, ``j2h`` is called with them bound in
    order to the keywords ``chosung``, ``jungsung`` and ``jongsung``.
    A single candidate is converted with ``_jamo_char_to_hcj``. An empty
    sequence gives the empty string.
    """
    if len(candidates) > 1:
        names = ["chosung", "jungsung", "jongsung"]
        keyword_args = dict(zip(names, candidates))
        return j2h(**keyword_args)
    if len(candidates) == 1:
        only = candidates[0]
        return _jamo_char_to_hcj(only)
    return ""
def j2syl(string):
    """Replace jamo clusters in ``string`` with the result of ``j2h``.

    Two passes are made over the string: first every
    choseong + jungseong + jongseong cluster, then every remaining
    choseong + jungseong cluster. Each cluster found is unpacked into
    separate characters, passed to ``j2h``, and all of its occurrences
    are replaced by the returned value. Everything else is left as is.
    """
    lead = "[\u1100-\u1112]"
    medial = "[\u1161-\u1175]"
    final = "[\u11A8-\u11C2]"

    # Three-jamo clusters must be handled before two-jamo ones,
    # otherwise the CV pass would split them.
    for pattern in (f"{lead}{medial}{final}", f"{lead}{medial}"):
        for cluster in re.findall(pattern, string):
            string = string.replace(cluster, j2h(*cluster))
    return string
def compose(letters):
    """Assemble a string of jamo into composed characters via ``j2h``.

    Steps:

    1. Every vowel (U+1161-U+1175) that is not directly preceded by a
       lead consonant (U+1100-U+1112) gets the placeholder ``ᄋ``
       inserted in front of it.
    2. Each distinct lead + vowel + tail cluster is replaced by
       ``j2h(lead, vowel, tail)``.
    3. Each distinct remaining lead + vowel cluster is replaced by
       ``j2h(lead, vowel)``.

    Args:
        letters: The string to assemble.

    Returns:
        The string with the clusters replaced; all other characters are
        left untouched.
    """
    # A lookbehind is used instead of capturing the preceding character:
    # a consuming pattern cannot use one vowel both as its own match and
    # as the "preceding non-lead" of the next vowel, so in a run of bare
    # vowels every second one would be left without a placeholder.
    string = re.sub("(?<![\u1100-\u1112])([\u1161-\u1175])", r"ᄋ\1", letters)

    # Three-jamo clusters first, so the two-jamo pass cannot split them.
    patterns = (
        "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]",
        "[\u1100-\u1112][\u1161-\u1175]",
    )
    for pattern in patterns:
        # set(): each distinct cluster needs only one replace() call.
        for syl in set(re.findall(pattern, string)):
            string = string.replace(syl, j2h(*syl))
    return string
def compose(self, string):
    """Contract ``string`` and replace its jamo clusters using ``j2h``.

    The input is first passed through ``self.contract``. Then every
    choseong + jungseong + jongseong cluster, and afterwards every
    remaining choseong + jungseong cluster, is unpacked into separate
    characters, given to ``j2h``, and replaced by the returned value.
    """
    initial = "[\u1100-\u1112]"
    medial = "[\u1161-\u1175]"
    final = "[\u11A8-\u11C2]"

    def merge(pattern, text):
        # The matches are collected up front; each one is then replaced
        # throughout the text.
        for found in re.findall(pattern, text):
            text = text.replace(found, j2h(*found))
        return text

    result = self.contract(string)
    # CVC before CV so that tails are not stranded.
    result = merge(f"{initial}{medial}{final}", result)
    result = merge(f"{initial}{medial}", result)
    return result