예제 #1
0
파일: ngrams_grid.py 프로젝트: Udzu/pudzu
def make_topwords(lang, letters):
    """Return, for each letter, the first word of the word list that starts with it.

    The table is cached in ``../corpora/wikis/{lang}-topwords.csv``. When that
    file does not exist it is rebuilt from ``make_wordlist(lang)`` and written
    back to the same path.

    Args:
        lang: Language code; used in the cache file name and passed to
            ``make_wordlist``. ``"vi"`` and ``"ko"`` get extra normalisation
            before the leading letter is extracted.
        letters: Iterable of letters to look up; also handed to
            ``partial_strip``.

    Returns:
        A DataFrame indexed by letter with columns ``"word"`` and ``"pc"``
        (the word's value divided by the total of the word list). For
        ``"ko"``, rows whose word starts with a bare jamo character are
        dropped.

    Note:
        Taking the first entry of each group yields the *top* word only if
        ``make_wordlist`` returns words sorted by descending frequency
        (presumably it does -- confirm against ``make_wordlist``).
    """
    cache_path = f"../corpora/wikis/{lang}-topwords.csv"
    try:
        tw = pd.read_csv(cache_path, index_col=0)
    except FileNotFoundError:
        logger.info(f"Generating {lang} top words")
        pc = make_wordlist(lang)
        pc = pc / pc.sum()

        def leading_letter(word):
            # Language-specific folding before stripping to the known letters.
            if lang == "vi":
                word = fold_vietnamese(word)
            elif lang == "ko":
                word = hangul_jamo.decompose(word)
            return partial_strip(word, letters)[0:1]

        gb = pc.groupby(pc.index.map(leading_letter))
        known_groups = gb.groups
        d = {}
        for c in tqdm(letters):
            if c not in known_groups:
                continue
            matches = gb.get_group(c)
            if len(matches) > 0:
                # .iloc makes the positional access explicit: the index holds
                # words, so plain matches[0] relies on the deprecated
                # integer-key-as-position fallback.
                d[c] = [matches.index[0], matches.iloc[0]]
        tw = pd.DataFrame.from_dict(d, orient="index", columns=["word", "pc"])
        tw.to_csv(cache_path)
    if lang == "ko":
        tw = tw[~tw.word.map(lambda s: hangul_jamo.is_jamo_character(s[0]))]
    return tw
예제 #2
0
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
# NOTE: `list` rebinds the builtin name; it is filled by the loop at the
# bottom of this section. `list2` and `printlist` are not used in the
# visible part of the script.
list = []
list2 = []
printlist = ""
# Prompt (Korean): "Enter your date of birth (ex 1993 10 08)".
# Expects three whitespace-separated integers.
year, month, day = map(int, input("생년월일을 입력하세요(ex 1993 10 08) : ").split())

# Difference between the hard-coded year 2018 and the birth year.
this_year = 2018 - year

# Prompt (Korean): "Enter your name".
name = input("이름을 입력하세요 : ")

# Prompt (Korean): "Enter the year you want to know about".
calendar_year = int(input("알고 싶은 해를 입력하세요 : "))
# Split the name into its first character and the remainder, then run
# hangul_jamo.decompose on the whole name and on each part.
name2 = name[1:]
name1 = name[0]
name_dec = hangul_jamo.decompose(name)
name_dec1 = hangul_jamo.decompose(name1)
name_dec2 = hangul_jamo.decompose(name2)
# NOTE: rebinds the builtin name `sum` to the total of the date components.
sum = year + month + day
jaum_sum = 0
moum_sum = 0

# For each offset 0..79, append the decimal digit sum of (sum + offset).
for i in range(0, 80):
    sumx = str(sum + i)
    birthsum = 0
    # The inner loop reuses the name `i`. The outer range iterator is not
    # affected, but after this loop `i` no longer holds the outer offset.
    for i in range(len(sumx)):
        birthsum += int(sumx[i])
    # Digit sums above 22 are reduced once more by adding their first two digits.
    if birthsum > 22:
        birthsum = str(birthsum)
        birthsum = int(birthsum[0]) + int(birthsum[1])
    list.append(birthsum)
예제 #3
0
def split_jamos(word):
    """Return ``word`` after passing it through ``hangul_jamo.decompose``."""
    decomposed = hangul_jamo.decompose(word)
    return decomposed
예제 #4
0
def isGoodToken(token):
    """Filter predicate: keep tokens that have a letter at either end.

    Args:
        token: The token string to check.

    Returns:
        ``False`` when the token is empty, or when the category of both its
        first and its last character does not start with ``"L"``. Otherwise
        the token itself is returned, so the result is truthy for every
        accepted token.

    Note:
        ``ucategory`` is presumably ``unicodedata.category`` (whose letter
        categories all start with "L") -- confirm against the file's imports.
    """
    # An empty token has no first/last character to inspect; reject it
    # instead of raising IndexError.
    if not token:
        return False
    starts_with_letter = ucategory(token[0])[0] == 'L'
    ends_with_letter = ucategory(token[-1])[0] == 'L'
    if not (starts_with_letter or ends_with_letter):
        return False
    return token


# Command-line entry: read text from a file (or stdin when the argument is
# "-"), clean and tokenize it, decompose Hangul into jamo, shuffle the tokens
# and write them back space-separated (to the same file, or to stdout).
if (len(sys.argv) != 2
        or (not os.path.isfile(sys.argv[1]) and sys.argv[1] != r'-')):
    sys.stderr.write("Usage: '" + sys.argv[0] + "' TEXT_FILE|-\n")
    # sys.exit is always available; the bare exit() helper is only installed
    # by the site module.
    sys.exit(1)
elif sys.argv[1] == r'-':
    bulk = sys.stdin.buffer.read().decode(
        errors='ignore').translate(junkToSpace)
    bulk = hangul_jamo.decompose(normalize('NFC', bulk.casefold()))
    bulk = [t for t in tokenize(re.sub(r'\s+', r' ', bulk)) if isGoodToken(t)]
    out = sys.__stdout__
else:
    # Close the input handle before the same path is reopened for writing.
    with open(sys.argv[1], 'rb') as source_file:
        raw_text = source_file.read().decode(errors='ignore')
    bulk = re.sub(r'\s+', r' ', raw_text)
    bulk = [
        t for t in tokenize(bulk.translate(junkToSpace).casefold())
        if isGoodToken(t)
    ]
    bulk = [hangul_jamo.decompose(t) for t in bulk]
    # The input was decoded as UTF-8 (the bytes.decode default), so write it
    # back as UTF-8 rather than in the locale-dependent default encoding.
    out = open(sys.argv[1], 'w', encoding='utf-8')

try:
    random.shuffle(bulk)
    sys.stderr.write(str(len(bulk)) + "\n")
    out.write(re.sub(r'\s+', r' ', (r' ' + r' '.join(bulk) + r' ')))
finally:
    # Flush and release the output file; stdout is left open for the
    # interpreter to manage.
    if out is not sys.__stdout__:
        out.close()
예제 #5
0
      compose_jamo_characters('ㄱ', 'ㅏ', None))
# compose_jamo_characters("ㄱ", "ㅏ", None) == 가
syllable_without_final = compose_jamo_characters('ㄱ', 'ㅏ')
print('compose_jamo_characters("ㄱ", "ㅏ") ==', syllable_without_final)
# compose_jamo_characters("ㄱ", "ㅏ") == 가
syllable_with_final = compose_jamo_characters('ㄱ', 'ㅏ', 'ㅎ')
print('compose_jamo_characters("ㄱ", "ㅏ", "ㅎ") ==', syllable_with_final)
# compose_jamo_characters("ㄱ", "ㅏ", "ㅎ") == 갛

# 4. DECOMPOSING HANGUL SYLLABLES
jamo_of_ga = decompose_syllable('가')
print('decompose_syllable("가") ==', jamo_of_ga)
# decompose_syllable("가") == ('ㄱ', 'ㅏ', None)
jamo_of_gah = decompose_syllable('갛')
print('decompose_syllable("갛") ==', jamo_of_gah)
# decompose_syllable("갛") == ('ㄱ', 'ㅏ', 'ㅎ')

# 4.1. USING UNPACKING ARGUMENTS OPERATOR *
# Decomposing a syllable and feeding the parts straight back into
# compose_jamo_characters round-trips to the original syllable.
round_trip_ga = compose_jamo_characters(*decompose_syllable('가'))
print('compose_jamo_characters(*decompose_syllable("가")) ==', round_trip_ga)
# compose_jamo_characters(*decompose_syllable("가")) == 가
round_trip_gah = compose_jamo_characters(*decompose_syllable('갛'))
print('compose_jamo_characters(*decompose_syllable("갛")) ==', round_trip_gah)
# compose_jamo_characters(*decompose_syllable("갛")) == 갛

# 5. COMPOSING TEXT
composed_text = compose('ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!')
print('compose("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!") ==', composed_text)
# compose("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!") == 안녕하세요! Hello!

# 6. DECOMPOSING TEXT
decomposed_text = decompose('안녕하세요! Hello!')
print('decompose("안녕하세요! Hello!") ==', decomposed_text)
# decompose("안녕하세요! Hello!") == ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ! Hello!
예제 #6
0
def test_decompose():
    """Check decompose() on a Korean sentence and on an English sentence.

    The Korean sentence must come back as its jamo sequence (spacing and
    punctuation preserved); the English sentence must come back unchanged.
    """
    korean_sentence = '대한민국은 민주공화국이다.'
    expected_jamo = 'ㄷㅐㅎㅏㄴㅁㅣㄴㄱㅜㄱㅇㅡㄴ ㅁㅣㄴㅈㅜㄱㅗㅇㅎㅘㄱㅜㄱㅇㅣㄷㅏ.'
    assert decompose(korean_sentence) == expected_jamo

    english_sentence = 'Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof'
    assert decompose(english_sentence) == english_sentence
예제 #7
0
import argparse
import hangul_jamo
import re

from tqdm import tqdm

# Rewrite a tab-separated "grapheme<TAB>phoneme" file: strip a fixed set of
# modifier/diacritic characters from the phoneme column and, for Korean
# input, decompose the grapheme column into jamo.
parser = argparse.ArgumentParser()
parser.add_argument('--input-file', type=str, required=True)
parser.add_argument('--output-file', type=str, required=True)
args = parser.parse_args()

# Jamo decomposition is switched on purely by the input path containing
# "kor" (case-insensitive).
do_jamo = 'kor' in args.input_file.lower()
# Characters removed from the phoneme column. (An earlier draft of this
# character class contained stray spaces between the marks.)
re_diacritics = re.compile(r"[ːᵝ͈̟̠̥̞̆̊̃ˀ˕̹]")

# The data is IPA/Hangul text, so read and write it as UTF-8 explicitly
# instead of depending on the platform's locale encoding.
with open(args.input_file, encoding='utf-8') as inf, \
        open(args.output_file, 'w', encoding='utf-8') as ouf:
    for line in tqdm(inf):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them rather than failing the two-field unpack.
            continue
        grapheme, phoneme = line.split('\t')
        phoneme = re_diacritics.sub('', phoneme).replace('w͍', 'w')
        if do_jamo:
            grapheme = hangul_jamo.decompose(grapheme)

        ouf.write(f"{grapheme}\t{phoneme}\n")