Example #1
def stemmize(text, language=None, stop_words=None):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str):
            A string of text to stemmize.
        language:
            Default language for stemmer and stop words.
        stop_words:
            A list of stop words. Set to False if you don't want to include the
            default list of stop words for the given language.
    """

    stemmer = get_stemmer(language)
    words = split_words(text)
    words = stemmer.stemWords(words)
    if stop_words is False:
        return words
    if stop_words is None:
        stop_words = get_stop_words(language)
    else:
        stop_words = get_stop_words(stop_words)
    stop_stems = set(stemmer.stemWords(stop_words))
    return [word for word in words if word not in stop_stems]
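The helpers get_stemmer, split_words, and get_stop_words are not shown above. The stand-ins below are only a hypothetical sketch, assuming PyStemmer (the Stemmer package) for stemming and a tiny hard-coded stop list, so the snippet can be run in isolation.

import re
import Stemmer  # PyStemmer, which provides the stemWords() API used above

def get_stemmer(language=None):
    # Assumption: fall back to English when no language is given.
    return Stemmer.Stemmer(language or 'english')

def split_words(text):
    # Lowercase and split on runs of non-word characters.
    return [word for word in re.split(r'\W+', text.casefold()) if word]

def get_stop_words(language_or_list=None):
    # Accept either a language name or an explicit list, mirroring the call sites above.
    if isinstance(language_or_list, (list, tuple, set)):
        return list(language_or_list)
    return ['the', 'a', 'an', 'of', 'and', 'to']  # tiny illustrative list

# Example call; the exact stems depend on the Snowball stemmer:
# stemmize('The foxes were running', language='english')
# -> something like ['fox', 'were', 'run']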
Example #2
def stemmize(text, language=None, stop_words=None, ngrams=1):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str):
            A string of text to stemmize.
        language:
            Default language for stemmer and stop words.
        stop_words (list):
            List of stop tokens.
        ngrams (int):
            If greater than 1, return space-joined n-grams of stems instead of single stems.
    """
    stemmer = get_stemmer(language)

    if stop_words is None:
        stop_words = get_stop_words(language)
    stop_stems = set(stemmer.stemWords(stop_words))
    words = text.casefold().split()
    words = stemmer.stemWords([strip_punctuation(word) for word in words])
    data = [w for w in words if w and w not in stop_stems]
    if ngrams == 1:
        return data
    else:
        result = []
        for i in range(len(data) - ngrams + 1):
            words = data[i:i + ngrams]
            result.append(' '.join(words))
        return result
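The n-gram branch above simply joins consecutive stems with spaces; a standalone illustration with made-up sample stems:

# Sample data is hypothetical; with ngrams=2 the loop yields overlapping bigrams.
data = ['polit', 'reform', 'vote', 'congress']
ngrams = 2
result = [' '.join(data[i:i + ngrams]) for i in range(len(data) - ngrams + 1)]
# result == ['polit reform', 'reform vote', 'vote congress']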
Example #3
    def __init__(self, data=None, cls=TextDocument, stop_words=None):
        self.cls = cls
        self.data = OrderedDict(data or {})
        # Wrap each raw document in the given document class.
        for name, document in self.data.items():
            self.data[name] = cls(document)

        # Register the stop words and the corresponding 'comment:'-prefixed tokens.
        self.stop_words = get_stop_words(stop_words)
        self.comment_words = ['comment:' + w for w in self.stop_words]
        self.bad_metrics = self.stop_words + self.comment_words
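To illustrate the last three lines, with a hypothetical two-word stop list:

# Hypothetical stop list standing in for get_stop_words(stop_words).
stop_words = ['the', 'of']
comment_words = ['comment:' + w for w in stop_words]
bad_metrics = stop_words + comment_words
# bad_metrics == ['the', 'of', 'comment:the', 'comment:of']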
Example #4
def test_stop_words():
    assert 'the' in get_stop_words()
    assert 'em' in get_stop_words('portuguese')
    assert 'def' not in get_stop_words('python')
Example #5
from textblob import TextBlob
from plagiarism import bag_of_words, stopwords, tokenizers
from unidecode import unidecode

STOPWORDS = stopwords.get_stop_words('portuguese')
STOPWORDS += [
    'tribuna', 'orador', 'sr', 'falar', 'pronunciamento', 'v.exa',
    'presidente', 'obrigado', 'é', 'deputado', 'srs', 'agradeço',
    'agradecimento', 'sras', 'revisão', 'boa', 'tarde', 'v', 'exa'
]


def speech_to_sentences(text):
    # Split the speech into sentences and casefold each one.
    blob = TextBlob(text)
    return [str(sentence).casefold() for sentence in blob.sentences]


def normalize(text):
    # Lowercase, strip leading/trailing punctuation, and drop accents.
    return unidecode(text.lower().strip('.,:?!- '))


def extract(text):
    # Stem the Portuguese tokens, dropping the domain-specific stop words above.
    tokens = tokenizers.stemmize(
        text,
        language='portuguese',
        stop_words=STOPWORDS,
    )
    features = bag_of_words(tokens, 'boolean')
    # Also register the whole token sequence as a single combined feature.
    if len(tokens) > 1:
        features[' '.join(tokens)] = 1
    return dict(features)
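A hedged usage sketch; the exact keys depend on the plagiarism library's Portuguese stemmer and on its bag_of_words implementation, so the output described is only indicative.

# Hypothetical call; the resulting stems are indicative, not verified output.
features = extract('A reforma política precisa ser votada.')
# Expected shape: a boolean bag of stems, roughly {'reform': 1, 'polit': 1, ...},
# plus one extra key joining all surviving stems when more than one token remains.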