def stemmize(text, language=None, stop_words=None):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str): A string of text to stemmize.
        language: Default language for stemmer and stop words.
        stop_words: A list of stop words. Set to False if you don't want to
            include the default list of stop words for the given language.
    """
    stemmer = get_stemmer(language)
    words = split_words(text)
    words = stemmer.stemWords(words)

    if stop_words is False:
        return words
    if stop_words is None:
        stop_words = get_stop_words(language)
    else:
        stop_words = get_stop_words(stop_words)

    stop_stems = set(stemmer.stemWords(stop_words))
    return [word for word in words if word not in stop_stems]
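
# A minimal sketch of the two helpers stemmize() relies on, assuming the
# PyStemmer (Snowball) package; the real get_stemmer()/split_words() in the
# library may differ, so treat this only as an illustration of the contract.
import re

import Stemmer


def get_stemmer(language=None):
    # Fall back to English when no language is given (assumption).
    return Stemmer.Stemmer(language or 'english')


def split_words(text):
    # Lowercase and keep only word-like tokens (assumption about tokenization).
    return re.findall(r'\w+', text.casefold())


# With these helpers, stemmize('The cats are running', 'english') would return
# something like ['cat', 'run'], since 'the' and 'are' are English stop words.
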
def stemmize(text, language=None, stop_words=None, ngrams=1):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str): A string of text to stemmize.
        language: Default language for stemmer and stop words.
        stop_words (list): List of stop tokens.
        ngrams (int): If given, uses n-grams instead of tokens.
    """
    stemmer = get_stemmer(language)
    if stop_words is None:
        stop_words = get_stop_words(language)
    stop_stems = set(stemmer.stemWords(stop_words))

    words = text.casefold().split()
    words = stemmer.stemWords([strip_punctuation(word) for word in words])
    data = [w for w in words if w and w not in stop_stems]

    if ngrams == 1:
        return data

    result = []
    for i in range(len(data) - ngrams + 1):
        result.append(' '.join(data[i:i + ngrams]))
    return result
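
# The n-gram branch above is a plain sliding window over the stem list. The
# standalone sketch below shows the same windowing on ordinary strings, so the
# expected output can be checked without a stemmer installed.
def ngram_join(tokens, n):
    """Join every run of n consecutive tokens into a space-separated string."""
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


print(ngram_join(['quick', 'brown', 'fox'], 2))
# ['quick brown', 'brown fox']
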
def __init__(self, data=None, cls=TextDocument, stop_words=None):
    self.cls = cls
    self.data = OrderedDict(data or {})
    for name, document in self.data.items():
        self.data[name] = cls(document)

    # Register common tokens and the corresponding comment tokens.
    self.stop_words = get_stop_words(stop_words)
    self.comment_words = ['comment:' + w for w in self.stop_words]
    self.bad_metrics = self.stop_words + self.comment_words
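
# Sketch of what the registration above produces, assuming get_stop_words()
# simply returns a list of tokens (the two-word list here is made up).
stop_words = ['the', 'a']
comment_words = ['comment:' + w for w in stop_words]
bad_metrics = stop_words + comment_words
print(bad_metrics)  # ['the', 'a', 'comment:the', 'comment:a']
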
def test_stop_words():
    assert 'the' in get_stop_words()
    assert 'em' in get_stop_words('portuguese')
    assert 'def' not in get_stop_words('python')
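
# One possible backing for get_stop_words(), assuming the NLTK stopwords
# corpus is available; the real library likely ships its own word lists, so
# this is only an illustration of the behaviour the test above expects.
from nltk.corpus import stopwords


def get_stop_words(language=None):
    try:
        return stopwords.words(language or 'english')
    except (OSError, LookupError):
        # Unsupported "languages" such as 'python' fall back to an empty list,
        # which keeps the last assertion above true.
        return []
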
from textblob import TextBlob
from plagiarism import bag_of_words, stopwords, tokenizers
from unidecode import unidecode

STOPWORDS = stopwords.get_stop_words('portuguese')
STOPWORDS += [
    'tribuna', 'orador', 'sr', 'falar', 'pronunciamento', 'v.exa',
    'presidente', 'obrigado', 'é', 'deputado', 'srs', 'agradeço',
    'agradecimento', 'sras', 'revisão', 'boa', 'tarde', 'v', 'exa',
]


def speech_to_sentences(text):
    blob = TextBlob(text)
    return list(map(lambda x: str(x).casefold(), blob.sentences))


def normalize(text):
    return unidecode(text.lower().strip('.,:?!- '))


def extract(text):
    tokens = tokenizers.stemmize(
        text,
        language='portuguese',
        stop_words=STOPWORDS,
    )
    features = bag_of_words(tokens, 'boolean')
    if len(tokens) > 1:
        features[' '.join(tokens)] = 1
    return dict(features)
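
# Hedged usage sketch: running a short, made-up Portuguese sentence through
# extract() yields a boolean bag-of-words over the surviving stems, plus one
# feature for the full joined stem sequence (the `len(tokens) > 1` branch).
# The exact keys depend on the Snowball Portuguese stemmer, so they are not
# reproduced here.
if __name__ == '__main__':
    sample = 'O projeto de lei garante recursos para a educação básica.'
    features = extract(sample)
    # One truthy entry per stem that survives the stop-word filter, and one
    # extra entry whose key is all surviving stems joined with spaces.
    print(features)
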