Exemplo n.º 1
0
def make_topwords(lang, letters):
    """Return one representative word per letter for ``lang``, cached as CSV.

    The result is loaded from ``../corpora/wikis/{lang}-topwords.csv`` when
    that file exists.  Otherwise it is built from ``make_wordlist(lang)`` and
    written to that path before being returned.

    Building works as follows:

    1. The word list is divided by its sum, so the values add up to 1.
    2. Each word is keyed by the first character of ``partial_strip(word,
       letters)``.  Vietnamese words are first passed through
       ``fold_vietnamese`` and Korean words through ``hangul_jamo.decompose``.
    3. For every letter that has a group, the first entry of that group is
       kept.
       NOTE(review): this is the *top* word only if ``make_wordlist`` returns
       its entries sorted by descending frequency -- confirm.

    Args:
        lang: Language code; used in the cache file name and to select the
            ``"vi"`` / ``"ko"`` special cases.
        letters: Iterable of single-character keys to look up; also passed to
            ``partial_strip``.

    Returns:
        A ``pandas.DataFrame`` indexed by letter with columns ``"word"`` and
        ``"pc"`` (the normalized value of that word).  For ``lang == "ko"``,
        rows whose word starts with a jamo character are removed from the
        returned frame (the cached file keeps them).
    """
    cache_path = f"../corpora/wikis/{lang}-topwords.csv"
    try:
        tw = pd.read_csv(cache_path, index_col=0)
    except FileNotFoundError:
        logger.info("Generating %s top words", lang)
        pc = make_wordlist(lang)
        pc = pc / pc.sum()

        def first_letter(word):
            # Normalize language-specific forms before stripping so the
            # grouping key is comparable with the entries of ``letters``.
            if lang == "vi":
                word = fold_vietnamese(word)
            elif lang == "ko":
                word = hangul_jamo.decompose(word)
            # Slicing (rather than indexing) yields "" instead of raising
            # when nothing is left after stripping.
            return partial_strip(word, letters)[0:1]

        gb = pc.groupby(pc.index.map(first_letter))
        available = gb.groups

        d = {}
        for c in tqdm(letters):
            if c not in available:
                continue
            # Keys listed in ``.groups`` always hold at least one row.
            matches = gb.get_group(c)
            # ``.iloc[0]`` instead of ``matches[0]``: the Series is indexed by
            # word, and integer keys on a non-integer index only work through
            # a deprecated positional fallback.
            d[c] = [matches.index[0], matches.iloc[0]]

        tw = pd.DataFrame.from_dict(d, orient="index", columns=["word", "pc"])
        tw.to_csv(cache_path)
    if lang == "ko":
        # Drop entries whose word begins with a bare jamo character.
        tw = tw[~tw.word.map(lambda s: hangul_jamo.is_jamo_character(s[0]))]
    return tw
Exemplo n.º 2
0
def test_is_not_jamo_character():
    """A digit and upper/lower-case Latin letters must not count as jamo."""
    for sample in ('0', 'A', 'a'):
        assert not is_jamo_character(sample)
Exemplo n.º 3
0
from hangul_jamo import is_syllable, is_jamo_character, compose_jamo_characters, decompose_syllable, compose, decompose

# 1. CHECKING HANGUL SYLLABLES
# Expected output:
#   is_syllable("가") == True
#   is_syllable("갛") == True
#   is_syllable("ㄱ") == False
for sample in ('가', '갛', 'ㄱ'):
    print(f'is_syllable("{sample}") ==', is_syllable(sample))

# 2. CHECKING HANGUL JAMO CHARACTERS
# Expected output:
#   is_jamo_character("ㄱ") == True
#   is_jamo_character("ㅏ") == True
#   is_jamo_character("ㄳ") == True
#   is_jamo_character("갃") == False
for sample in ('ㄱ', 'ㅏ', 'ㄳ', '갃'):
    print(f'is_jamo_character("{sample}") ==', is_jamo_character(sample))

# 3. COMPOSING HANGUL JAMO CHARACTERS
# Expected output:
#   compose_jamo_characters("ㄱ", "ㅏ", None) == 가
#   compose_jamo_characters("ㄱ", "ㅏ") == 가
#   compose_jamo_characters("ㄱ", "ㅏ", "ㅎ") == 갛
for jamo in (('ㄱ', 'ㅏ', None), ('ㄱ', 'ㅏ'), ('ㄱ', 'ㅏ', 'ㅎ')):
    # Render the arguments the way they would be typed: strings in double
    # quotes, a missing tail as the bare word None.
    shown = ', '.join('None' if part is None else f'"{part}"' for part in jamo)
    print(f'compose_jamo_characters({shown}) ==', compose_jamo_characters(*jamo))
Exemplo n.º 4
0
def test_is_jamo_character():
    """Consonant, vowel and compound-consonant jamo are all recognized."""
    # None is asserted as well; presumably it stands for "no tail consonant",
    # matching its use as the third argument of compose_jamo_characters --
    # confirm against the library.
    for sample in ('ㄱ', 'ㅏ', 'ㄳ', None):
        assert is_jamo_character(sample)