def stemmize(text, language=None, stop_words=None):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str): A string of text to stemmize.
        language: Default language for stemmer and stop words.
        stop_words: A list of stop words. Set to False if you don't want to
            include the default list of stop words for the given language.
    """
    stemmer = get_stemmer(language)
    words = split_words(text)
    words = stemmer.stemWords(words)

    if stop_words is False:
        return words
    if stop_words is None:
        stop_words = get_stop_words(language)
    else:
        stop_words = get_stop_words(stop_words)

    stop_stems = set(stemmer.stemWords(stop_words))
    return [word for word in words if word not in stop_stems]
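
# A minimal sketch of the two helpers stemmize() relies on, assuming the
# PyStemmer (Snowball) package; the real get_stemmer()/split_words() in the
# library may differ, so treat this only as an illustration of the contract.
import re

import Stemmer


def get_stemmer(language=None):
    # Fall back to English when no language is given (assumption).
    return Stemmer.Stemmer(language or 'english')


def split_words(text):
    # Lowercase and keep only word-like tokens (assumption about tokenization).
    return re.findall(r'\w+', text.casefold())


# With these helpers, stemmize('The cats are running', 'english') would return
# something like ['cat', 'run'], since 'the' and 'are' are English stop words.
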
def stemmize(text, language=None, stop_words=None, ngrams=1):
    """
    Receive a string of text and return a list of stems.

    Args:
        text (str): A string of text to stemmize.
        language: Default language for stemmer and stop words.
        stop_words (list): List of stop tokens.
        ngrams (int): If given, uses n-grams instead of tokens.
    """
    stemmer = get_stemmer(language)
    if stop_words is None:
        stop_words = get_stop_words(language)
    stop_stems = set(stemmer.stemWords(stop_words))

    words = text.casefold().split()
    words = stemmer.stemWords([strip_punctuation(word) for word in words])
    data = [w for w in words if w and w not in stop_stems]

    if ngrams == 1:
        return data

    result = []
    for i in range(len(data) - ngrams + 1):
        result.append(' '.join(data[i:i + ngrams]))
    return result
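
# The n-gram branch above is a plain sliding window over the stem list. The
# standalone sketch below shows the same windowing on ordinary strings, so the
# expected output can be checked without a stemmer installed.
def ngram_join(tokens, n):
    """Join every run of n consecutive tokens into a space-separated string."""
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


print(ngram_join(['quick', 'brown', 'fox'], 2))
# ['quick brown', 'brown fox']
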
def __init__(self, data=None, cls=TextDocument, stop_words=None):
    self.cls = cls
    self.data = OrderedDict(data or {})
    for name, document in self.data.items():
        self.data[name] = cls(document)

    # Register common tokens and the corresponding comment tokens.
    self.stop_words = get_stop_words(stop_words)
    self.comment_words = ['comment:' + w for w in self.stop_words]
    self.bad_metrics = self.stop_words + self.comment_words
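
# Sketch of what the registration above produces, assuming get_stop_words()
# simply returns a list of tokens (the two-word list here is made up).
stop_words = ['the', 'a']
comment_words = ['comment:' + w for w in stop_words]
bad_metrics = stop_words + comment_words
print(bad_metrics)  # ['the', 'a', 'comment:the', 'comment:a']
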
def test_stop_words():
    assert 'the' in get_stop_words()
    assert 'em' in get_stop_words('portuguese')
    assert 'def' not in get_stop_words('python')
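
# One possible backing for get_stop_words(), assuming the NLTK stopwords
# corpus is available; the real library likely ships its own word lists, so
# this is only an illustration of the behaviour the test above expects.
from nltk.corpus import stopwords


def get_stop_words(language=None):
    try:
        return stopwords.words(language or 'english')
    except (OSError, LookupError):
        # Unsupported "languages" such as 'python' fall back to an empty list,
        # which keeps the last assertion above true.
        return []
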
from textblob import TextBlob
from plagiarism import bag_of_words, stopwords, tokenizers
from unidecode import unidecode

STOPWORDS = stopwords.get_stop_words('portuguese')
STOPWORDS += [
    'tribuna', 'orador', 'sr', 'falar', 'pronunciamento', 'v.exa',
    'presidente', 'obrigado', 'é', 'deputado', 'srs', 'agradeço',
    'agradecimento', 'sras', 'revisão', 'boa', 'tarde', 'v', 'exa',
]


def speech_to_sentences(text):
    blob = TextBlob(text)
    return list(map(lambda x: str(x).casefold(), blob.sentences))


def normalize(text):
    return unidecode(text.lower().strip('.,:?!- '))


def extract(text):
    tokens = tokenizers.stemmize(
        text,
        language='portuguese',
        stop_words=STOPWORDS,
    )
    features = bag_of_words(tokens, 'boolean')
    if len(tokens) > 1:
        features[' '.join(tokens)] = 1
    return dict(features)
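
# Hedged usage sketch: running a short, made-up Portuguese sentence through
# extract() yields a boolean bag-of-words over the surviving stems, plus one
# feature for the full joined stem sequence (the `len(tokens) > 1` branch).
# The exact keys depend on the Snowball Portuguese stemmer, so they are not
# reproduced here.
if __name__ == '__main__':
    sample = 'O projeto de lei garante recursos para a educação básica.'
    features = extract(sample)
    # One truthy entry per stem that survives the stop-word filter, and one
    # extra entry whose key is all surviving stems joined with spaces.
    print(features)
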