#### This script generates a stop-word dictionary for the Cantonese Corpus ####
import os


def save_stopwords(file_path, tagged_words):
    """Write one stop word per line to *file_path* (UTF-8).

    Creates the parent directory if it does not exist yet.

    Parameters
    ----------
    file_path : str
        Destination path for the stop-word list.
    tagged_words : iterable of str
        Stop words to write, one per output line.
    """
    directory = os.path.dirname(file_path)
    # Guard against '' (bare filename): os.makedirs('') raises FileNotFoundError.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    with open(file_path, 'w', encoding='utf8') as f:
        # Batch the writes instead of one f.write per word.
        f.writelines(word + '\n' for word in tagged_words)


if __name__ == '__main__':
    # Third-party import kept inside the entry point so save_stopwords can be
    # imported without pycantonese installed.
    import pycantonese as pc

    # BUG FIX: the original module-level call passed THREE arguments
    # ('data/stopwordCT.txt', 'data/init_dict.txt', stop) to the two-parameter
    # function above, raising TypeError on every run. The stray second path was
    # dropped. The unused `corpus = pc.hkcancor()` load was removed as well.
    save_stopwords('data/stopwordCT.txt', pc.stop_words())
def test_stop_words():
    """The default stop-word set contains the common Cantonese negation marker."""
    words = stop_words()
    assert "唔" in words
# Driver script: build a dictionary from the HKCanCor corpus, segment a text
# file, then strip stop words from the segmented output.
# NOTE(review): `save`, `prepareData`, and `stopWord` are not defined in this
# file — presumably imported/defined elsewhere; confirm before running.
if __name__ == '__main__':
    corpus = pc.hkcancor()
    # NOTE(review): `freq` is computed but never used below.
    freq = corpus.word_frequency()
    # Persist the tagged corpus vocabulary as a segmentation dictionary.
    save('cantonese-corpus/data/dict.txt', 'cantonese-corpus/data/init_dict.txt', corpus.tagged_words())
    # Segment the raw input text into `text_cut.txt`.
    sourceFile = 'test_text.txt'
    targetFile = 'text_cut.txt'
    prepareData(sourceFile, targetFile)
    # sourceFile = 'neg_train.txt'
    # targetFile = 'neg_cut.txt'
    # prepareData(sourceFile, targetFile)
    #
    #
    # sourceFile = 'pos_train.txt'
    # targetFile = 'pos_cut.txt'
    # prepareData(sourceFile, targetFile)
    stop_words = pc.stop_words()
    # stopkey = [w.strip() for w in
    #            codecs.open('/Users/gm/Xiu/5014B /酒店评论/data/stopWord.txt', 'r', encoding='utf-8').readlines()]
    # Remove stop words from the segmented text.
    sourceFile = 'text_cut.txt'
    targetFile = 'text_cut_stw.txt'
    stopWord(sourceFile, targetFile, stop_words)
    # sourceFile = 'neg_cut.txt'
    # targetFile = 'neg_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
    #
    # sourceFile = 'pos_cut.txt'
    # targetFile = 'pos_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
def test_stop_words_remove_multiple_words():
    """Removing a list of words drops each of them and shrinks the set by 3."""
    removed = ["唔", "乜嘢", "其他"]
    result = stop_words(remove=removed)
    assert not set(removed).issubset(result)
    assert len(_DEFAULT_STOP_WORDS) - len(result) == 3
def test_stop_words_add_multiple_words():
    """Adding a list of words includes each of them and grows the set by 3."""
    added = ["foo", "bar", "baz"]
    result = stop_words(add=added)
    assert set(added).issubset(result)
    assert len(result) - len(_DEFAULT_STOP_WORDS) == 3
def test_stop_words_remove_one_word():
    """Removing a single word drops it and shrinks the set by exactly one."""
    result = stop_words(remove="唔")
    assert "唔" not in result
    assert len(_DEFAULT_STOP_WORDS) - len(result) == 1
def test_stop_words_add_one_word():
    """Adding a single word includes it and grows the set by exactly one."""
    result = stop_words(add="foobar")
    assert "foobar" in result
    assert len(result) - len(_DEFAULT_STOP_WORDS) == 1
"""Tests for pycantonese.stop_words(): defaults plus add/remove behavior."""
from pycantonese import stop_words

# Baseline set captured once so size deltas can be asserted against it.
_DEFAULT_STOP_WORDS = stop_words()


def test_stop_words():
    """The default set contains the common negation marker."""
    words = stop_words()
    assert "唔" in words


def test_stop_words_add_one_word():
    """Adding a single word includes it and grows the set by one."""
    result = stop_words(add="foobar")
    assert "foobar" in result
    assert len(result) - len(_DEFAULT_STOP_WORDS) == 1


def test_stop_words_remove_one_word():
    """Removing a single word drops it and shrinks the set by one."""
    result = stop_words(remove="唔")
    assert "唔" not in result
    assert len(_DEFAULT_STOP_WORDS) - len(result) == 1


def test_stop_words_add_multiple_words():
    """Adding a list of words includes each of them and grows the set by 3."""
    added = ["foo", "bar", "baz"]
    result = stop_words(add=added)
    assert set(added).issubset(result)
    assert len(result) - len(_DEFAULT_STOP_WORDS) == 3


def test_stop_words_remove_multiple_words():
    """Removing a list of words drops each of them from the set."""
    removed = ["唔", "乜嘢", "其他"]
    result = stop_words(remove=removed)
    assert not set(removed).issubset(result)