def stem_words(string, sep=" "):
    """Stem words in a string.

    The function first tries to stem each word in Italian; if the word is
    unchanged, it falls back to the English stemmer.

    Parameters
    ----------
    string : string
        a string containing words separated by some separator
    sep : string
        a separator (default ' ')

    Returns
    -------
    A stemmed string
    """
    stemmerIta = stem.SnowballStemmer("italian")
    stemmerEng = stem.SnowballStemmer("english")
    string = string.split(sep)
    string = [
        stemmerIta.stem(i) if stemmerIta.stem(i) != i else stemmerEng.stem(i)
        for i in string
    ]
    return sep.join(string)
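
# A minimal usage sketch for stem_words above (not part of the original snippet).
# The import mirrors what the function assumes (`from nltk import stem`); the exact
# stems depend on the installed NLTK Snowball data, so they are printed, not asserted.
if __name__ == "__main__":
    from nltk import stem  # needed by stem_words, which references stem.SnowballStemmer

    # Words the Italian stemmer changes keep the Italian stem; the rest fall back to English.
    print(stem_words("parlando di programmazione"))
    print(stem_words("talking about programming"))
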
def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # text = clean_tweet(text)
    print(text, '\n')
    text = text.lower()  # lowercase text
    print(text, '\n')
    # replace symbols matched by REPLACE_BY_SPACE_RE with a space
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    print(text, '\n')
    # replace symbols matched by BAD_SYMBOLS_RE with a space
    text = BAD_SYMBOLS_RE.sub(' ', text)
    print(text, '\n')
    text = text.replace('rt', ' ')
    print(text, '\n')
    text = re.sub(r'\d+', '', text)
    text = text.replace('&#', ' ')
    print(text, '\n')
    text = re.sub(r'\W+', ' ', text)
    # text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # remove stopwords from text
    text = text.split()
    lemm = Stem.SnowballStemmer("english")
    lemm_words = [lemm.stem(word) for word in text]
    text = " ".join(lemm_words)
    print(text, '\n')
    return text
def __init__(self, weight_gpop=0, stemm=True, stemmer='porter', tokenize=True,
             clean=True, synonyms=False, fuzzy=True, fuzz_thres=0,
             add_artists=False, add_albums=False, return_num_predictions=500):
    '''
    Constructor
    '''
    self.weight_gpop = weight_gpop
    self.return_num_predictions = return_num_predictions
    self.add_artists = add_artists
    self.add_albums = add_albums
    self.stemm = stemm
    self.tokenize = tokenize
    self.clean = clean
    self.fuzzy = fuzzy
    self.fuzz_thres = fuzz_thres

    if stemmer == 'wn':
        self.stemmer = stem.WordNetLemmatizer()
    elif stemmer == 'porter':
        self.stemmer = stem.PorterStemmer()
    elif stemmer == 'snowball':
        self.stemmer = stem.SnowballStemmer('english')

    self.stemmers = stemmer
    self.synonyms = synonyms
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (Porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles a StemmerI instance directly
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
def is_unique(self, word):
    stemmer = stem.SnowballStemmer("english")
    root = stemmer.stem(word)
    for card in self.cards:
        if root == stemmer.stem(card.word):
            return False
    return True
def reduz_ao_radical_stem(palavras):
    # Reduce a word (or each word in a list) to its stem with the English Snowball stemmer.
    snow = stem.SnowballStemmer('english')
    if type(palavras) is str:
        return snow.stem(palavras)
    if type(palavras) is list:
        for i in range(len(palavras)):
            palavras[i] = snow.stem(palavras[i])
        return palavras
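
# Usage sketch for reduz_ao_radical_stem above ("reduce to the stem"); the sample
# words are made up and the expected stems follow standard English Snowball behaviour.
print(reduz_ao_radical_stem("running"))            # expected: "run"
print(reduz_ao_radical_stem(["cats", "running"]))  # expected: ["cat", "run"]
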
def stemwords(words, stemmer=None):
    if not stemmer:
        stemmer = stem.SnowballStemmer('english')
    # Stem each word in place and return the same list.
    for i in range(len(words)):
        words[i] = stemmer.stem(words[i])
    return words
def stemming(x):
    stemmer = stem.SnowballStemmer("english")
    words = x.split()
    doc = []
    for word in words:
        word = stemmer.stem(word)
        doc.append(word)
    return " ".join(doc)
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (Porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles a StemmerI instance directly
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
def __init__(self, stemmer_type):
    if stemmer_type == 'porter':
        self.stemmer = stemming.PorterStemmer()
    elif stemmer_type == 'snowball':
        self.stemmer = stemming.SnowballStemmer('english')
    elif stemmer_type == 'lemmatize':
        self.stemmer = WordNetStemmer()
    else:
        raise NameError('\'%s\' not supported' % stemmer_type)
def stem_words(msg):
    from nltk import stem
    stemmer = stem.SnowballStemmer('english')
    msg = [word for word in msg.split()]
    msg = " ".join([stemmer.stem(word) for word in msg])
    return msg
def __init__(self):
    self.n_grams = [1, 2, 3, 4]
    self.stem_generator = stem.SnowballStemmer("english")
    self.total_num = 0
    self.generate_corpus = []
    self.reference_corpus = []
    self.gen_corpus_count = []
    self.ref_corpus_count = []
    self.gen_document_frequency = defaultdict(int)
    self.ref_document_frequency = defaultdict(int)
def english_token(sentence, tokenize_flag=1, is_filter_stopword=1, stem_flag=1, lemma_flag=1):
    # Two English tokenization options; option 2 works better
    if tokenize_flag == 1:
        source_tokens = word_tokenize(sentence)
    elif tokenize_flag == 2:
        tokenizer = tokenize.WordPunctTokenizer()
        source_tokens = tokenizer.tokenize(sentence)
    # print(source_tokens)

    # Remove punctuation tokens
    for token in source_tokens[::-1]:
        if len(token) == 1 and token[0].isalpha() == False:
            source_tokens.remove(token)

    # Filter out stop words
    if is_filter_stopword:
        list_stopWords = list(set(corpus.stopwords.words('english')))
        filtered_stop_words = [w for w in source_tokens if not w in list_stopWords]
    else:
        filtered_stop_words = source_tokens
    # print(filtered_stop_words)

    # Two stemming tools; option 2 works better
    stem_tokens = []
    if stem_flag == 1:
        porterStemmer = stem.PorterStemmer()
        for word in filtered_stop_words:
            stem_tokens.append(porterStemmer.stem(word))
    elif stem_flag == 2:
        snowballStemmer = stem.SnowballStemmer('english')
        for word in filtered_stop_words:
            stem_tokens.append(snowballStemmer.stem(word))

    # Lemmatize nouns and verbs; option 2 works better
    lemma_tokens = []
    if lemma_flag == 1:
        lemmatizer = stem.WordNetLemmatizer()
        for word in stem_tokens:
            # Reduce nouns to their singular form
            n_lemma = lemmatizer.lemmatize(word, pos='n')
            # Reduce verbs to their base form
            v_lemma = lemmatizer.lemmatize(n_lemma, pos='v')
            # print('%8s %8s %8s' % (word, n_lemma, v_lemma))
            lemma_tokens.append(v_lemma)
    elif lemma_flag == 2:
        lemmatizer = stem.wordnet.WordNetLemmatizer()
        tagged_corpus = pos_tag(stem_tokens)
        for token, tag in tagged_corpus:
            if tag[0].lower() in ['n', 'v']:
                lemma_tokens.append(lemmatizer.lemmatize(token, tag[0].lower()))
            else:
                lemma_tokens.append(token)

    return lemma_tokens
def preprocess(News: List[str]):
    """
    :param News: list of news documents
    :return: the stemmed news documents. Stemming requires tokenization, but the
        downstream tf-idf transform expects whole sentences, so the tokens are
        joined back into strings after stemming. Returns List[str].
    """
    print("Now stemming the tokens in each news document...")
    sb_stemmer = ns.SnowballStemmer("english")
    stem_news = []
    for doc in tqdm(News):
        words = tk.word_tokenize(doc)
        words = [sb_stemmer.stem(each) for each in words]
        stem_news.append(' '.join(words))
    return stem_news
def review_messages(texts):
    stoplist = stopwords.words('english')
    stoplist.append('Subject')
    stemmer = stem.SnowballStemmer('english')
    words = process_email(texts)
    msg = []
    for word in words:
        if word not in stoplist:
            msg.append(word)
    # using a stemmer
    msg = " ".join([stemmer.stem(word) for word in msg])
    return msg
def tokenize_comment(comment, voc, voc_index):
    tokenizer = tokenize.RegexpTokenizer(r'\w+')
    stemmer = stem.SnowballStemmer('russian')
    result = []
    for sent in tokenize.sent_tokenize(comment):
        filtered = [word for word in tokenizer.tokenize(sent)
                    if word not in corpus.stopwords.words('russian')]
        stemmed = [stemmer.stem(word) for word in filtered]
        for word in stemmed:
            if voc.get(word) == None:
                voc[word] = voc_index
                voc_index += 1
        result += stemmed
    return voc_index, result
def clean_text(text):
    text = text.lower()
    # Lancaster is the strictest and PorterStemmer is the least strict.
    # Snowball is in the middle, so we use Snowball here for stemming text.
    stemmer = stem.SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    not_punctuation = [
        word for word in word_tokens if word not in string.punctuation
    ]
    cleaned = [word for word in not_punctuation if word not in stop_words]
    cleaned_stem = [stemmer.stem(word) for word in cleaned]
    cleaned_join = " ".join(cleaned_stem)
    return cleaned_join
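
# A self-contained sketch (not from the original source) illustrating the comment
# above: Porter, Snowball and Lancaster differ in how aggressively they truncate.
# The sample words are arbitrary; outputs may vary slightly across NLTK versions,
# so they are printed side by side rather than asserted.
from nltk import stem as _stem

def _compare_stemmers(words):
    porter = _stem.PorterStemmer()
    snowball = _stem.SnowballStemmer('english')
    lancaster = _stem.LancasterStemmer()
    for w in words:
        print(w, '->', porter.stem(w), '|', snowball.stem(w), '|', lancaster.stem(w))

# _compare_stemmers(['generously', 'organization', 'maximum'])
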
def __call__(self, doc):
    tokens = tokenizer.word_tokenize(doc)
    tokens = [
        token for token in tokens
        if token.isalnum() and len(token) > 0 and not token.isspace()
    ]
    if self.eliminate_stopwords:
        stop_words = stopwords.words("english")
        tokens = [token for token in tokens if token not in stop_words]
    if self.apply_stemming:
        snowball_stemmer = stemmer.SnowballStemmer("english")
        tokens = [snowball_stemmer.stem(token) for token in tokens]
    return tokens
def vectorize(input, stop_words, max_words):
    """
    Create features based on the stemmed unigram and bigram TF-IDF values.

    input: list of text reviews
    stop_words: additional common words not used as features
    max_words: total number of features for the classifier, typically 10,000
    """
    token = TfidfVectorizer().build_tokenizer()
    stemmer = stem.SnowballStemmer("english", ignore_stopwords=True)
    # Use lists (not lazy map objects) so the vectorizer works under Python 3.
    stopW = [stemmer.stem(w) for w in stopwords.words('english') + stop_words]

    def tstem(text):
        return [stemmer.stem(t) for t in token(text)]

    tfidf = TfidfVectorizer(max_features=max_words, ngram_range=(1, 2),
                            stop_words=stopW, tokenizer=tstem)
    return tfidf.fit(input)
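
# A hypothetical usage sketch for vectorize() above; `sample_reviews` and the extra
# stop words are made-up placeholders. fit() returns the fitted vectorizer, so the
# result can be used directly to transform text into a TF-IDF feature matrix.
sample_reviews = ["The movie was great fun", "The movie was terribly boring"]
tfidf = vectorize(sample_reviews, stop_words=['movie', 'film'], max_words=10000)
X = tfidf.transform(sample_reviews)  # sparse matrix of stemmed unigram/bigram TF-IDF features
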
def preprocessing(email):
    # 1. Lowercase everything
    email = email.lower()
    # 2. Strip HTML tags
    email = re.sub('<[^<>]+>', ' ', email)
    # 3. Replace URLs with the token "httpaddr"
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    # 4. Replace email addresses with "emailaddr"
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    # 5. Replace all dollar signs ($) with "dollar"
    email = re.sub('[\$]+', 'dollar', email)
    # 6. Replace numbers with "number" ([0-9] matches a digit, + matches one or more)
    email = re.sub('[0-9]+', 'number', email)
    # 7. Tokenize and stem
    tokens = re.split(
        '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', email)
    tokenlist = []
    s = ns.SnowballStemmer('english')
    for token in tokens:
        # 8. Remove any remaining non-alphanumeric characters from the token
        token = re.sub('[^a-zA-Z0-9]', '', token)
        # 9. Skip empty tokens
        if not len(token):
            continue
        stemmed = s.stem(token)
        tokenlist.append(stemmed)
    return tokenlist
def scrub_txt(dirty_txt):
    '''Scrub text: lowercase, remove stop words, and stem.

    Argument(s):
        dirty_txt (str): string to clean
    Return:
        clean_txt (str): clean string
    '''
    # Potential improvements:
    #   - remove punctuation
    stemmer = stem.SnowballStemmer('english')
    eng_stopwords = set(stopwords.words('english'))

    # lower, stop, stem
    clean_txt = dirty_txt.lower()
    clean_txt = [
        word for word in clean_txt.split() if word not in eng_stopwords
    ]
    clean_txt = " ".join([stemmer.stem(word) for word in clean_txt])
    return clean_txt
def split_sentences2(text):
    return nltk.sent_tokenize(text)


if __name__ == '__main__':
    w = "w"
    a = "a"
    print(w, end="")
    print()
    print(a)

    stemmer = stem.PorterStemmer()
    stemmer2 = stem.SnowballStemmer(language="english")
    print(stemmer.stem("colonization"))
    print(stemmer2.stem("colonization"))
    print(len(stopwords.words("english")))

    lang = "tr"
    folderpath = "/home/dicle/Documents/data/tr/radikal_5class_newstexts/ekonomi"
    # instances = corpus_utils.read_n_files(folderpath, N=2)
    instances, labels = corpus_utils.get_20newsgroup()
    instances = instances[:2]

    for i, text in enumerate(instances):
        print(i, " Sentences:")
        print(split_sentences1(text))
        print("####")
        print(split_sentences2(text))
        print(
def language(self, value):
    self._language = value
    self.normalizer = stem.SnowballStemmer(self.language.lower())
def __init__(self, language='English'):
    self._language = language
    self.normalizer = stem.SnowballStemmer(self.language.lower())
def __init__(self):
    self.stemmer = stem.SnowballStemmer('english')
    self.dict = {}
def alignment_score(ppdbLines, word1, word2):
    stemmer = stem.SnowballStemmer("english")
    if stemmer.stem(word1) == stemmer.stem(word2):
        return MAX_ALIGNMENT_SCORE
    else:
        return _paraphrase_score(ppdbLines, word1, word2)
best_dist = d
best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

# stop_words='english' filters common function words such as "most", "a", "about"
# so they are not counted.
vectorizer = CountVectorizer(min_df=1, stop_words='english')
sorted(vectorizer.get_stop_words())[:50]
# Roughly how many are there? About 318.
len(vectorizer.get_stop_words())

# Collapsing words with the same meaning: the proper name for this is stemming.
from nltk import stem

english_stemmer = stem.SnowballStemmer('english')  # NLTK ships many stemmers; for English, Snowball works well
english_stemmer.stem('imaging')
english_stemmer.stem('image')
english_stemmer.stem('imagine')
english_stemmer.stem('buys')
english_stemmer.stem('buying')
english_stemmer.stem('bought')


class StemmedCountVectorizer(CountVectorizer):  # subclass CountVectorizer
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))  # quite an elegant trick
logger.debug("train size:" + str(train_data.shape) + " test size:" + str(eval_data.shape))

label = 'author'
eval_id = eval_data['id']

# CREATE TARGET VARIABLE
logger.debug("One hot encoding for label")
train_data["EAP"] = (train_data.author == "EAP") * 1
train_data["HPL"] = (train_data.author == "HPL") * 1
train_data["MWS"] = (train_data.author == "MWS") * 1
target_vars = ["EAP", "HPL", "MWS"]
Y_train = train_data[target_vars].values

# STEMMING WORDS
logger.debug("Stem text ..")
stemmer = stm.SnowballStemmer("english")
stem_text = train_data.text.apply(lambda x: (" ").join(
    [stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]", " ", x).split(" ")]))
eval_stem_text = eval_data.text.apply(lambda x: (" ").join(
    [stemmer.stem(z) for z in re.sub("[^a-zA-Z0-9]", " ", x).split(" ")]))
all_stem_text = pd.concat([stem_text, eval_stem_text])

logger.debug("Tokenizing text ..")
# prepare tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_stem_text)
vocab_size = len(tokenizer.word_index) + 1
logger.debug("vocab size:" + str(vocab_size))

# integer encode the documents
encoded_text = tokenizer.texts_to_sequences(stem_text)
# print(encoded_text[:3])
# In[125]:

# lala = csv_data[['author_id', 'name', 'EXPERTISE', 'ins_name\r', 'URL']]
# lala = lala.replace({'ins_name\r': {r'\r': ''}}, regex=True)
# lala.to_csv(r'C:\Users\admin\Desktop\professor_data.txt', header=None, index=None,
#             sep='\t', mode='a', encoding='utf-8')

# In[126]:

# In[127]:

STOP_WORDS_FILENAME = r'C:\Users\admin\Desktop\stop_words_topic.txt'

# In[128]:

eng_stemmer = stem.SnowballStemmer('english')


class Indexable(object):
    """Class representing an object that can be indexed.

    It is a general abstraction for indexable objects and can be used in
    different contexts.

    Args:
        iid (int): Identifier of indexable objects.
        metadata (str): Plain text with data to be indexed.
def stemming(words):
    stemmer = stem.SnowballStemmer('english')
    words = [stemmer.stem(word) for word in words]
    return words