Example #1
#### This script generates a stop word dictionary for a Cantonese corpus ####
import os

import pycantonese as pc

# Load the HKCanCor corpus
corpus = pc.hkcancor()

# Default Cantonese stop words provided by PyCantonese
stop = pc.stop_words()


## This function generates the stop word dictionary from the stop words provided by PyCantonese,
## writing one word per line to file_path
def save_stopwords(file_path, words):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(file_path, 'w', encoding='utf8') as f:
        for word in words:
            f.write(word + '\n')


save_stopwords('data/stopwordCT.txt', stop)
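For context, a minimal sketch (not part of the original script) of how the saved dictionary could be read back and used to filter a segmented sentence. The load_stopwords helper and the sample sentence are illustrative assumptions; pycantonese.segment is used for word segmentation.

import pycantonese as pc

# Hypothetical helper (not in the original script): read the stop word file
# written by save_stopwords above into a set.
def load_stopwords(file_path):
    with open(file_path, encoding='utf8') as f:
        return {line.strip() for line in f if line.strip()}

stopwords = load_stopwords('data/stopwordCT.txt')

# Segment an illustrative sentence and drop any stop words.
tokens = pc.segment('你食咗飯未呀')
print([t for t in tokens if t not in stopwords])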
Example #2
def test_stop_words():
    _stop_words = stop_words()
    assert "唔" in _stop_words
Example #3
if __name__ == '__main__':
    corpus = pc.hkcancor()
    freq = corpus.word_frequency()
    save('cantonese-corpus/data/dict.txt', 'cantonese-corpus/data/init_dict.txt', corpus.tagged_words())
    sourceFile = 'test_text.txt'
    targetFile = 'text_cut.txt'
    prepareData(sourceFile, targetFile)
    # sourceFile = 'neg_train.txt'
    # targetFile = 'neg_cut.txt'
    # prepareData(sourceFile, targetFile)
    #
    #
    # sourceFile = 'pos_train.txt'
    # targetFile = 'pos_cut.txt'
    # prepareData(sourceFile, targetFile)

    stop_words = pc.stop_words()
    # stopkey = [w.strip() for w in
    #            codecs.open('/Users/gm/Xiu/5014B /酒店评论/data/stopWord.txt', 'r', encoding='utf-8').readlines()]
    sourceFile = 'text_cut.txt'
    targetFile = 'text_cut_stw.txt'
    stopWord(sourceFile, targetFile, stop_words)
    # sourceFile = 'neg_cut.txt'
    # targetFile = 'neg_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
    #
    # sourceFile = 'pos_cut.txt'
    # targetFile = 'pos_cut_stw.txt'
    # stopWord(sourceFile, targetFile, stop_words)
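The driver above relies on save, prepareData, and stopWord helpers defined elsewhere in the original script. As a rough sketch under that assumption, the stop word removal step could read a whitespace-segmented file line by line, drop tokens that appear in the stop word set, and write the rest out. The function body below is illustrative, not the original implementation.

# Hypothetical stand-in for the stopWord helper called above
def stopWord(sourceFile, targetFile, stop_words):
    with open(sourceFile, encoding='utf-8') as src, \
            open(targetFile, 'w', encoding='utf-8') as tgt:
        for line in src:
            kept = [tok for tok in line.split() if tok not in stop_words]
            tgt.write(' '.join(kept) + '\n')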

Example #4
def test_stop_words_remove_multiple_words():
    _stop_words = stop_words(remove=["唔", "乜嘢", "其他"])
    assert not {"唔", "乜嘢", "其他"}.issubset(_stop_words)
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 3
Example #5
def test_stop_words_add_multiple_words():
    _stop_words = stop_words(add=["foo", "bar", "baz"])
    assert {"foo", "bar", "baz"}.issubset(_stop_words)
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 3
Example #6
def test_stop_words_remove_one_word():
    _stop_words = stop_words(remove="唔")
    assert "唔" not in _stop_words
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 1
Example #7
def test_stop_words_add_one_word():
    _stop_words = stop_words(add="foobar")
    assert "foobar" in _stop_words
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 1
Example #8
from pycantonese import stop_words

_DEFAULT_STOP_WORDS = stop_words()


def test_stop_words():
    _stop_words = stop_words()
    assert "唔" in _stop_words


def test_stop_words_add_one_word():
    _stop_words = stop_words(add="foobar")
    assert "foobar" in _stop_words
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 1


def test_stop_words_remove_one_word():
    _stop_words = stop_words(remove="唔")
    assert "唔" not in _stop_words
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 1


def test_stop_words_add_multiple_words():
    _stop_words = stop_words(add=["foo", "bar", "baz"])
    assert {"foo", "bar", "baz"}.issubset(_stop_words)
    assert len(_stop_words) - len(_DEFAULT_STOP_WORDS) == 3


def test_stop_words_remove_multiple_words():
    _stop_words = stop_words(remove=["唔", "乜嘢", "其他"])
    assert not {"唔", "乜嘢", "其他"}.issubset(_stop_words)
    assert len(_DEFAULT_STOP_WORDS) - len(_stop_words) == 3
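The tests above exercise stop_words() with add= and remove= separately. A small combined usage sketch, assuming both keyword arguments can be passed in the same call:

from pycantonese import stop_words

# Drop one default stop word and add a custom token in one call
# (add= and remove= are only used separately in the tests above).
custom = stop_words(add="foobar", remove="唔")
assert "foobar" in custom
assert "唔" not in custom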