예제 #1
0
# -*- coding: utf-8 -*-
"""Export word-frequency tables for two Cantonese corpora as CSV files.

Writes one two-column (word, count) CSV for the HKCanCor corpus and one
for the Sun Yat-sen 1924 speech transcript.
"""

import pycantonese as pc
import csv


def _write_frequency_csv(freq, path):
    """Write the mapping *freq* (word -> count) to *path* as a two-column CSV.

    Rows are emitted in the mapping's iteration order; the file is opened
    with UTF-8 encoding and newline='' as the csv module requires.
    """
    with open(path, encoding='utf-8', mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for word, count in freq.items():
            writer.writerow([word, count])


corpus = pc.hkcancor()
hkcancorFreq = corpus.word_frequency()
_write_frequency_csv(hkcancorFreq, 'hkcancorFrequency.csv')

# NOTE(review): the original computed sun.word_frequency() twice and
# discarded the first result; the redundant call has been removed.
sun = pc.read_chat('../00-source/sun_1924_tagged.cha')
sunFreq = sun.word_frequency()
_write_frequency_csv(sunFreq, 'sunFrequency.csv')
예제 #2
0
import sys

import pytest

import pycantonese

# Load the HKCanCor corpus once at import time so all tests share one copy.
HKCANCOR = pycantonese.hkcancor()


def almost_equal(x, y, tolerance):
    """Return True if x and y differ by no more than tolerance."""
    # Avoid pulling in numpy's assert_almost_equal just for this check.
    difference = x - y
    if difference < 0:
        difference = -difference
    return difference <= tolerance


def test_hkcancor_word_count():
    """HKCanCor should contain about 149,781 word tokens."""
    n_words = len(HKCANCOR.words())
    assert almost_equal(n_words, 149781, tolerance=3)


@pytest.mark.skipif(sys.version_info[0] == 2,
                    reason='character/unicode parsing not yet fixed '
                    'for python 2.7')
def test_hkcancor_character_count():
    """HKCanCor should contain about 186,888 characters (Python 3 only)."""
    n_characters = len(HKCANCOR.characters())
    assert almost_equal(n_characters, 186888, tolerance=3)
예제 #3
0
import sys
from collections import Counter

import json
from memoize import memoize
import pinyin
import jyutping
import pycantonese

# Load the HKCanCor corpus at import time.
# NOTE(review): `corpus` is never used in the visible portion of this
# example — confirm against the full file whether it is dead code.
corpus = pycantonese.hkcancor()
#from hanziconv import HanziConv
from opencc import OpenCC

# Simplified-to-Hong-Kong-traditional Chinese converter function.
s2hk = OpenCC('s2hk').convert

from mkdict import pinyin_to_zhuyin_real as pinyin_to_zhuyin
from mkdict import get_all_yue, get_merged_entries


def get_contents_in_dictionary(dictfile):
    """Return the lines of *dictfile* that follow the first '...' marker line.

    Lines before (and including) the first line whose stripped content is
    '...' are discarded; later lines are kept verbatim, trailing newlines
    included.  Any further '...' lines are also skipped, matching the
    original control flow.

    NOTE(review): the original built ``output`` but never returned it and
    left the file handle to the garbage collector; both are fixed here.
    The file is opened with the platform-default encoding — confirm
    whether UTF-8 should be forced.
    """
    output = []
    is_started = False
    # Context manager closes the handle deterministically.
    with open(dictfile) as f:
        for line in f:
            if line.strip() == '...':
                is_started = True
                continue
            if not is_started:
                continue
            output.append(line)
    return output
예제 #4
0
def main():
    """Train a POS tagger on the HKCanCor corpus and pickle it to disk."""
    hkcancor_corpus = hkcancor()
    pos_tagger = POSTagger(**_TAGGER_PARAMETERS)
    # Train on the corpus's tagged sentences and persist the model.
    pos_tagger.train(_get_tagged_sents(hkcancor_corpus), save=_PICKLE_PATH)
예제 #5
0
# Export HKCanCor tokens matching tones 1-6 to an Excel workbook.
#
# NOTE(review): this looks like a truncated script.  In the visible
# portion: the 'occured' and 'ratio' header columns are never filled,
# `row` is never incremented inside the loop (so every token would
# overwrite row 1), `word_sum` is computed but unused, and the workbook
# is never closed.  Confirm against the full file before relying on it.
# Also note the 'occured' spelling in the header (should be 'occurred').

#loading packages
import pycantonese as pc
import xlsxwriter
import collections

print("Please read the code and change directories accordingly (see line 9).")

#setting up workbook
workbook = xlsxwriter.Workbook('hkcancor_word.xlsx')
worksheet = workbook.add_worksheet('words')

#loading files
c = pc.hkcancor()

#getting words: tokens whose tone is 1-6
tokens = c.search(tone='[1-6]')

#word frequency: count of each distinct search result
word_count = collections.Counter(tokens)

#writing data: header row
row = 1
worksheet.write(0, 0, 'word')
worksheet.write(0, 1, 'jyutping')
worksheet.write(0, 2, 'occured')
worksheet.write(0, 3, 'ratio')

# Total number of token occurrences (presumably for the 'ratio' column).
word_sum = sum(word_count.values())
for i in word_count.keys():
    # i[0]/i[2] indexing suggests each result is a tuple, presumably
    # (word, ..., jyutping) — TODO confirm against pycantonese's search API.
    worksheet.write(row, 0, i[0])
    worksheet.write(row, 1, i[2])
예제 #6
0
def _get_tagged_sents():
    """Return HKCanCor utterances as lists of (word, POS) pairs.

    Each utterance's tags are remapped through _FIX_HKCANCOR_TAGS,
    falling back to the token's original POS tag when no fix exists.
    """
    tagged_sents = []
    for utterance in hkcancor().tokens(by_utterances=True):
        pairs = [
            (tok.word, _FIX_HKCANCOR_TAGS.get(tok.pos, tok.pos))
            for tok in utterance
        ]
        tagged_sents.append(pairs)
    return tagged_sents