Example #1
#### This script generates a stop word dictionary for a Cantonese corpus ####
import os

import pycantonese as pc

# Load the HKCanCor corpus
corpus = pc.hkcancor()

# Default Cantonese stop words provided by PyCantonese
stop = pc.stop_words()


## This function generates the stop word dictionary from the stop words provided by PyCantonese,
## writing one word per line to file_path
def save_stopwords(file_path, words):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file_path, 'w', encoding='utf8') as f:
        for word in words:
            f.write(word + '\n')


save_stopwords('data/stopwordCT.txt', stop)
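For context, a minimal sketch (not part of the original script) of how the saved dictionary could be read back and used to filter a segmented sentence. The load_stopwords helper and the sample sentence are illustrative assumptions; pycantonese.segment is used for word segmentation.

import pycantonese as pc

# Hypothetical helper (not in the original script): read the stop word file
# written by save_stopwords above into a set.
def load_stopwords(file_path):
    with open(file_path, encoding='utf8') as f:
        return {line.strip() for line in f if line.strip()}

stopwords = load_stopwords('data/stopwordCT.txt')

# Segment an illustrative sentence and drop any stop words.
tokens = pc.segment('你食咗飯未呀')
print([t for t in tokens if t not in stopwords])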
Example #2
def test_stop_words():
    _stop_words = stop_words()
    assert "唔" in _stop_words
Example #3
if __name__ == '__main__':
    corpus = pc.hkcancor()
    freq = corpus.word_frequency()
    save('cantonese-corpus/data/dict.txt', 'cantonese-corpus/data/init_dict.txt', corpus.tagged_words())
    sourceFile = 'test_text.txt'
    targetFile = 'text_cut.txt'
    prepareData(sourceFile, targetFile)
    # sourceFile = 'neg_train.txt'
    # targetFile = 'neg_cut.txt'
    # prepareData(sourceFile, targetFile)
    #
    #
    # sourceFile = 'pos_train.txt'
    # targetFile = 'pos_cut.txt'
    # prepareData(sourceFile, targetFile)

    stop_words = pc.stop_words()
    # stopkey = [w.strip() for w in
    #            codecs.open('/Users/gm/Xiu/5014B /酒店评论/data/stopWord.txt', 'r', encoding='utf-8').readlines()]
    sourceFile = 'text_cut.txt'
    targetFile = 'text_cut_stw.txt'
    stopWord(sourceFile, targetFile, stop_words)
    # sourceFile = 'neg_cut.txt'
    # targetFile = 'neg_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
    #
    # sourceFile = 'pos_cut.txt'
    # targetFile = 'pos_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
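The driver above relies on save, prepareData, and stopWord helpers defined elsewhere in the original script. As a rough sketch under that assumption, the stop word removal step could read a whitespace-segmented file line by line, drop tokens that appear in the stop word set, and write the rest out. The function body below is illustrative, not the original implementation.

# Hypothetical stand-in for the stopWord helper called above
def stopWord(sourceFile, targetFile, stop_words):
    with open(sourceFile, encoding='utf-8') as src, \
            open(targetFile, 'w', encoding='utf-8') as tgt:
        for line in src:
            kept = [tok for tok in line.split() if tok not in stop_words]
            tgt.write(' '.join(kept) + '\n')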

Example #4
def test_stop_words_remove_multiple_words():
    _stop_words = stop_words(remove=["唔", "乜嘢", "其他"])
    assert not {"唔", "乜嘢", "其他"}.issubset(_stop_words)
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 3
Example #5
def test_stop_words_add_multiple_words():
    _stop_words = stop_words(add=["foo", "bar", "baz"])
    assert {"foo", "bar", "baz"}.issubset(_stop_words)
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 3
Example #6
def test_stop_words_remove_one_word():
    _stop_words = stop_words(remove="唔")
    assert "唔" not in _stop_words
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 1
Example #7
def test_stop_words_add_one_word():
    _stop_words = stop_words(add="foobar")
    assert "foobar" in _stop_words
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 1
Example #8
from pycantonese import stop_words

_DEFAULT_STOP_WORDS = stop_words()


def test_stop_words():
    _stop_words = stop_words()
    assert "唔" in _stop_words


def test_stop_words_add_one_word():
    _stop_words = stop_words(add="foobar")
    assert "foobar" in _stop_words
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 1


def test_stop_words_remove_one_word():
    _stop_words = stop_words(remove="唔")
    assert "唔" not in _stop_words
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 1


def test_stop_words_add_multiple_words():
    _stop_words = stop_words(add=["foo", "bar", "baz"])
    assert {"foo", "bar", "baz"}.issubset(_stop_words)
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 3


def test_stop_words_remove_multiple_words():
    _stop_words = stop_words(remove=["唔", "乜嘢", "其他"])
    assert not {"唔", "乜嘢", "其他"}.issubset(_stop_words)
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 3
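The tests above exercise stop_words() with add= and remove= separately. A small combined usage sketch, assuming both keyword arguments can be passed in the same call:

from pycantonese import stop_words

# Drop one default stop word and add a custom token in one call
# (add= and remove= are only used separately in the tests above).
custom = stop_words(add="foobar", remove="唔")
assert "foobar" in custom
assert "唔" not in custom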