예제 #1
0
# -*- coding: utf-8 -*-
"""Export word-frequency tables for two Cantonese corpora as CSV files.

Writes one two-column (word, count) CSV for the HKCanCor corpus and one
for the Sun Yat-sen 1924 speech transcript.
"""

import pycantonese as pc
import csv


def _write_frequency_csv(freq, path):
    """Write the mapping *freq* (word -> count) to *path* as a two-column CSV.

    Rows are emitted in the mapping's iteration order; the file is opened
    with UTF-8 encoding and newline='' as the csv module requires.
    """
    with open(path, encoding='utf-8', mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for word, count in freq.items():
            writer.writerow([word, count])


corpus = pc.hkcancor()
hkcancorFreq = corpus.word_frequency()
_write_frequency_csv(hkcancorFreq, 'hkcancorFrequency.csv')

# NOTE(review): the original computed sun.word_frequency() twice and
# discarded the first result; the redundant call has been removed.
sun = pc.read_chat('../00-source/sun_1924_tagged.cha')
sunFreq = sun.word_frequency()
_write_frequency_csv(sunFreq, 'sunFrequency.csv')
예제 #2
0
import sys

import pytest

import pycantonese

# Load the HKCanCor corpus once at import time so all tests share one copy.
HKCANCOR = pycantonese.hkcancor()


def almost_equal(x, y, tolerance):
    """Return True if x and y differ by no more than tolerance."""
    # Avoid pulling in numpy's assert_almost_equal just for this check.
    difference = x - y
    if difference < 0:
        difference = -difference
    return difference <= tolerance


def test_hkcancor_word_count():
    """HKCanCor should contain about 149,781 word tokens."""
    n_words = len(HKCANCOR.words())
    assert almost_equal(n_words, 149781, tolerance=3)


@pytest.mark.skipif(sys.version_info[0] == 2,
                    reason='character/unicode parsing not yet fixed '
                    'for python 2.7')
def test_hkcancor_character_count():
    """HKCanCor should contain about 186,888 characters (Python 3 only)."""
    n_characters = len(HKCANCOR.characters())
    assert almost_equal(n_characters, 186888, tolerance=3)
예제 #3
0
import sys
from collections import Counter

import json
from memoize import memoize
import pinyin
import jyutping
import pycantonese

# Load the HKCanCor corpus at import time.
# NOTE(review): `corpus` is never used in the visible portion of this
# example — confirm against the full file whether it is dead code.
corpus = pycantonese.hkcancor()
#from hanziconv import HanziConv
from opencc import OpenCC

# Simplified-to-Hong-Kong-traditional Chinese converter function.
s2hk = OpenCC('s2hk').convert

from mkdict import pinyin_to_zhuyin_real as pinyin_to_zhuyin
from mkdict import get_all_yue, get_merged_entries


def get_contents_in_dictionary(dictfile):
    """Return the lines of *dictfile* that follow the first '...' marker line.

    Lines before (and including) the first line whose stripped content is
    '...' are discarded; later lines are kept verbatim, trailing newlines
    included.  Any further '...' lines are also skipped, matching the
    original control flow.

    NOTE(review): the original built ``output`` but never returned it and
    left the file handle to the garbage collector; both are fixed here.
    The file is opened with the platform-default encoding — confirm
    whether UTF-8 should be forced.
    """
    output = []
    is_started = False
    # Context manager closes the handle deterministically.
    with open(dictfile) as f:
        for line in f:
            if line.strip() == '...':
                is_started = True
                continue
            if not is_started:
                continue
            output.append(line)
    return output
예제 #4
0
def main():
    """Train a POS tagger on the HKCanCor corpus and pickle it to disk."""
    hkcancor_corpus = hkcancor()
    pos_tagger = POSTagger(**_TAGGER_PARAMETERS)
    # Train on the corpus's tagged sentences and persist the model.
    pos_tagger.train(_get_tagged_sents(hkcancor_corpus), save=_PICKLE_PATH)
예제 #5
0
# Export HKCanCor tokens matching tones 1-6 to an Excel workbook.
#
# NOTE(review): this looks like a truncated script.  In the visible
# portion: the 'occured' and 'ratio' header columns are never filled,
# `row` is never incremented inside the loop (so every token would
# overwrite row 1), `word_sum` is computed but unused, and the workbook
# is never closed.  Confirm against the full file before relying on it.
# Also note the 'occured' spelling in the header (should be 'occurred').

#loading packages
import pycantonese as pc
import xlsxwriter
import collections

print("Please read the code and change directories accordingly (see line 9).")

#setting up workbook
workbook = xlsxwriter.Workbook('hkcancor_word.xlsx')
worksheet = workbook.add_worksheet('words')

#loading files
c = pc.hkcancor()

#getting words: tokens whose tone is 1-6
tokens = c.search(tone='[1-6]')

#word frequency: count of each distinct search result
word_count = collections.Counter(tokens)

#writing data: header row
row = 1
worksheet.write(0, 0, 'word')
worksheet.write(0, 1, 'jyutping')
worksheet.write(0, 2, 'occured')
worksheet.write(0, 3, 'ratio')

# Total number of token occurrences (presumably for the 'ratio' column).
word_sum = sum(word_count.values())
for i in word_count.keys():
    # i[0]/i[2] indexing suggests each result is a tuple, presumably
    # (word, ..., jyutping) — TODO confirm against pycantonese's search API.
    worksheet.write(row, 0, i[0])
    worksheet.write(row, 1, i[2])
예제 #6
0
def _get_tagged_sents():
    """Return HKCanCor utterances as lists of (word, POS) pairs.

    Each utterance's tags are remapped through _FIX_HKCANCOR_TAGS,
    falling back to the token's original POS tag when no fix exists.
    """
    tagged_sents = []
    for utterance in hkcancor().tokens(by_utterances=True):
        pairs = [
            (tok.word, _FIX_HKCANCOR_TAGS.get(tok.pos, tok.pos))
            for tok in utterance
        ]
        tagged_sents.append(pairs)
    return tagged_sents