from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

def _create_stemmer(stemmer_type):
    """Initialize a stemmer by name ('Porter', 'Snowball', or 'Lancaster')."""
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
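# A minimal usage sketch for _create_stemmer; the sample word is illustrative.
_demo_stemmer = _create_stemmer('Lancaster')
print(_demo_stemmer.stem('running'))  # Lancaster strips suffixes aggressively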
from nltk.stem import LancasterStemmer

def process(word_list):
    """Stem each word in word_list with the Lancaster stemmer."""
    lancaster = LancasterStemmer()
    new_list = []
    for word in word_list:
        new_list.append(lancaster.stem(word))
    return new_list
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

def words_stemmer(words, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem a list of words and return them joined into a single encoded string.

    Note: the original encoded each stem to bytes and then joined them with a
    str separator, which raises TypeError in Python 3; here the stems stay str
    and the joined result is encoded once at the end.
    """
    supported_stemmers = ["PorterStemmer", "LancasterStemmer", "SnowballStemmer"]
    if not type or type not in supported_stemmers:
        return words
    if type == "PorterStemmer":
        stemmer = PorterStemmer()
    elif type == "LancasterStemmer":
        stemmer = LancasterStemmer()
    else:
        stemmer = SnowballStemmer(lang)
    stem_words = [stemmer.stem(word) for word in words]
    return " ".join(stem_words).encode(encoding)
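# A quick usage sketch for words_stemmer; the tokens are illustrative and the
# result is a single encoded (bytes) string of stems.
print(words_stemmer(["running", "flies"], type="LancasterStemmer"))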
from nltk.stem import LancasterStemmer

def Stem(s):
    # isinstance(s, str) already rules out None
    if isinstance(s, str) and len(s) > 0:
        stemmer = LancasterStemmer()
        return " ".join(stemmer.stem(z) for z in s.split(" ")).lower()
    return ""
def stem_words(self, words):
    """Stem each word in a whitespace-delimited string; return the stems as one string."""
    stemmer = LancasterStemmer()
    stems = []
    for word in words.split(" "):
        stems.append(stemmer.stem(word))
    return " ".join(stems)
from nltk.stem import LancasterStemmer

def stem_words(words):
    """Stem each word in a list of tokenized words."""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
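# A minimal usage sketch for the list-based stem_words above; the tokens are
# illustrative.
print(stem_words(["maximum", "running", "presumably"]))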
from pprint import pprint
from nltk import (ne_chunk, pos_tag, trigrams, word_tokenize,
                  wordpunct_tokenize, LancasterStemmer, PorterStemmer,
                  SnowballStemmer, WordNetLemmatizer)

def main():
    save_data_from_webpage()
    text = get_data_from_file()
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)
    # creates a stemmed list using each of the stemmers
    pstem = PorterStemmer()
    pstem_list = [pstem.stem(word) for word in tt]
    pprint(pstem_list)
    lstem = LancasterStemmer()
    lstem_list = [lstem.stem(word) for word in tt]
    pprint(lstem_list)
    # SnowballStemmer requires a language argument
    sstem = SnowballStemmer('english')
    sstem_list = [sstem.stem(word) for word in tt]
    pprint(sstem_list)
    p = set(pstem_list)
    l = set(lstem_list)
    s = set(sstem_list)
    # displays the stems on which the stemmers differ
    pprint(s.difference(l.difference(p)))
    # POS tagging operates on the token list, not the raw string
    pos_list = pos_tag(tt)
    pprint(pos_list)
    # creates a list of the lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(word) for word in tt]
    # pprint(lem)
    # trigrams() returns a generator over the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    # ne_chunk finds non-overlapping named-entity groups;
    # pos_tag identifies how each token is used in speech
    named_entity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(named_entity)
def _normalize(self, item):
    key, value = item
    ls = LancasterStemmer()
    stop_words = set(stopwords.words('english'))
    text = word_tokenize(value[0])
    text = [word.lower() for word in text]
    text = [
        ls.stem(word).rstrip('s') for word in text
        if word not in stop_words and word.isalnum()
    ]
    return (key, (text, value[1]))
import pathlib
from nltk.stem import LancasterStemmer

def __stem_document(document_name: pathlib.Path) -> list:
    stemmer = LancasterStemmer()
    with document_name.open('r', encoding='utf-8') as document:
        lines = document.readlines()
    result = []
    for line in lines:
        words = line.strip().split(' ')
        words = [stemmer.stem(word) for word in words]
        result.append(' '.join(words))
    return result
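# A usage sketch for __stem_document; the file path is hypothetical.
for sentence in __stem_document(pathlib.Path('corpus/doc1.txt')):
    print(sentence)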
from nltk.stem import LancasterStemmer

def get_stems(tokens):
    """Stem POS-tagged tokens, leaving determiners, pronouns, and nouns unstemmed."""
    stemmer = LancasterStemmer()
    keep_tags = {'DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS'}
    stemmed_tokens = []
    for token in tokens:
        for word in token:
            if word[1] in keep_tags:
                stemmed_tokens.append(word[0])
            else:
                stemmed_tokens.append(stemmer.stem(word[0]))
    # get_lemma is defined elsewhere in the project
    return get_lemma(stemmed_tokens)
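# A sketch of the input get_stems expects: a list of sentences, each a list of
# (word, POS-tag) pairs as produced by nltk.pos_tag. The sample is illustrative.
tagged = [[('the', 'DT'), ('runner', 'NN'), ('sprinted', 'VBD')]]
# get_stems(tagged) keeps 'the' and 'runner' unstemmed and stems 'sprinted'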
from urllib.request import urlopen
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

def getStemsFromURL(page_url):
    '''Given the URL of a webpage (string), return a list of the stems of all
    the words in the webpage text.'''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")
    ls = LancasterStemmer()
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(soup.text)
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stop_words and w.isalpha()]
    return words
def clean_tweets(self, text):
    st = LancasterStemmer()
    # st = PorterStemmer()
    with open('newspaper3k/SmartStoplist.txt', 'r') as f:
        stopwords = [line.strip() for line in f]
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    result = []
    # the original rebound its loop variable on every pass, so only the last
    # line's words survived; here the cleaned words from every line are kept
    for line in text.split("\n"):
        line = re.sub(r"[^a-zA-Z0-9]+", ' ', line).lower()
        line = st.stem(line)
        line = ''.join([c for c in line if not c.isdigit()])
        result.extend(word for word in line.split() if word not in stopwords)
    return result
def checkstemmers():
    raw = customparse(
        "C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"
    )
    wordz = raw.split(" ")
    O = ["sweating", "tripping", "gunning", "going"]
    HH = [i[0:-1] for i in O]
    dic = enchant.Dict("en_US")
    from nltk import LancasterStemmer, PorterStemmer
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    resporter = [porter.stem(t).replace(" ", "") for t in wordz]
    reslan = [lancaster.stem(t).replace(" ", "") for t in wordz]
    resall = [[wordz[i], resporter[i], reslan[i]] for i in range(len(wordz))]
    # filtres keeps only the words where the stemmers disagree (currently unused)
    filtres = [
        entry for entry in resall
        if not (entry[0] == entry[2] == entry[1])
    ]
    return resall
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer

def getMostUsedWordsTxt(file, wordnum):
    '''Given a text file name (string) and the number of most-used words to
    find (int), return a list of the wordnum most common stems and their
    counts, from the most common to the least:
    [('1st_most_common_word', count1), ('2nd_most_common_word', count2), ...,
     ('wordnumth_most_common_word', countwordnum)]
    '''
    with open(file, "r") as f:
        words = f.read().split()
    ls = LancasterStemmer()
    stop_words = set(stopwords.words("english"))
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stop_words and w.isalpha()]
    freqs = Counter(words)
    return freqs.most_common(wordnum)
def tokenize(self, description):
    filtered = []
    # don't process NaN or null values
    if pd.isnull(description):
        return filtered, filtered
    terms = description.lower().split()
    # terms = word_tokenize(description.lower().decode('utf-8'))
    filtered_stopwords = [word for word in terms
                          if word not in stopwords.words('english')]
    # # Snowball stemming
    # stemmer = SnowballStemmer('english')
    # for stem in filtered_stopwords:
    #     filtered.append(stemmer.stem(stem.decode('utf-8')))
    # # Porter stemming
    # stemmer = PorterStemmer()
    # for stem in filtered_stopwords:
    #     filtered.append(stemmer.stem(stem.decode('utf-8')))
    # WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    for lemmatized in filtered_stopwords:
        filtered.append(lemmatizer.lemmatize(lemmatized))
    filtered_final = []
    # Lancaster stemming
    stemmer = LancasterStemmer()
    for stem in filtered:
        # filtered_final.append(stemmer.stem(stem.decode('utf-8')))
        filtered_final.append(stemmer.stem(stem))
    # # TextBlob lemmatizer
    # for lemmatized in filtered_stopwords:
    #     w = Word(lemmatized.decode('utf-8'))
    #     filtered.append(w.lemmatize)
    return filtered_final
import re
from nltk.stem import LancasterStemmer

def get_words_from_string(string):
    string = string.lower()
    word_pattern = r'[A-Za-z]+'
    # link_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    # email_pattern = r"\S+@\S+"
    # ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    result = []
    # for x in re.findall(link_pattern, string):
    #     try:
    #         url = "{0.scheme}://{0.netloc}/".format(urlsplit(x))
    #     except:
    #         url = x
    #     result.append(url)
    # string = re.sub(link_pattern, "", string)
    # result.extend(re.findall(email_pattern, string))
    # string = re.sub(email_pattern, "", string)
    # result.extend(re.findall(ip_pattern, string))
    # string = re.sub(ip_pattern, "", string)
    # stemmer = PorterStemmer()
    stemmer = LancasterStemmer()
    result.extend(
        stemmer.stem(word) for word in re.findall(word_pattern, string))
    # result.extend(re.findall(word_pattern, string))
    return result
from flask import Flask
import mysql.connector
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import math  # for more advanced math operations

app = Flask(__name__)

# ---------- DOCUMENT DATABASE CONFIGURATION ----------
db = mysql.connector.connect(host="localhost", user="******", passwd="",
                             database="stki")
cursor = db.cursor()

# variable holding the stopword set
stop_words = set(stopwords.words('english'))
lancaster = LancasterStemmer()  # Lancaster/Paice-Husk stemmer
eliminasi = [
    '.', '?', '!', ' ', ',', ':', ';', '(', ')', '\'', '"', '%', '&', '*',
    '-', '_', '+', '=', '{', '}', '[', ']', '\\', '|', '"', '<', '>', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '�'
]

def preProcessDoc(docs):
    docs_token = word_tokenize(docs)
    arr = []
    for i in range(len(docs_token)):
        docs_token[i] = docs_token[i].lower()
        if docs_token[i] not in stop_words:
            skip = 0
            for j in range(len(docs_token[i])):
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:50
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : stemmers.py
# @Software   : PyCharm
# @Description: Stemming

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
      "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And " \
      "I will have my vengeance, in this life or the next. "

tokens = word_tokenize(raw)  # tokenize into words

porter = PorterStemmer()  # removes relatively fewer suffixes
pStems = [porter.stem(t) for t in tokens]  # strips suffixes (s, es, e, ed, al)
print(pStems)

lancaster = LancasterStemmer()  # more aggressive
lStems = [lancaster.stem(t) for t in tokens]  # lowercases words and strips suffixes
print(lStems)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 7.
"""
# Stemming words - test your tools
from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('nation'))
print(LancasterStemmer().stem('nationality'))
print(LancasterStemmer().stem('nationally'))
print(LancasterStemmer().stem('natural'))
print(LancasterStemmer().stem('naturally'))
print(LancasterStemmer().stem('nature'))
print()

print('PorterStemmer')
print(PorterStemmer().stem('nation'))
print(PorterStemmer().stem('nationality'))
print(PorterStemmer().stem('nationally'))
print(PorterStemmer().stem('natural'))
print(PorterStemmer().stem('naturally'))
print(PorterStemmer().stem('nature'))
    'page': TITLE,
    'format': "json"
}
R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# get the text
wiki_page_text = DATA["parse"]["text"]["*"]
h = html2text.HTML2Text()
h.ignore_links = True
page_text = h.handle(wiki_page_text)

# create a new stemmer
ls = LancasterStemmer()

# tokenize the text
words = nltk.word_tokenize(page_text)
words = [w.lower() for w in words]

# eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words
         if w not in stopwords.words("english") and w.isalnum()]
freqs = Counter(words)

print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:")
for word, count in freqs.most_common(10):
    print(word, count)
import re
import logging

from nltk import WordNetLemmatizer, LancasterStemmer
from django.core.urlresolvers import reverse

logger = logging.getLogger(__name__)
wordnet_lemmatizer = WordNetLemmatizer()
lancaster_stemmer = LancasterStemmer()


def extract_keywords(title):
    original_keywords = [keyword.lower() for keyword in re.split(r'\W+', title)]
    try:
        # map() returns an iterator in Python 3; materialize it so the list
        # concatenation below works
        lemmatized_keywords = list(map(wordnet_lemmatizer.lemmatize, original_keywords))
    except LookupError:
        logging.error('Please install corpora/wordnet dictionary')
        return []
    stemmed_keywords = list(map(lancaster_stemmer.stem, original_keywords))
    return list(set(original_keywords + lemmatized_keywords + stemmed_keywords))


def reverse_tastypie_url(resource_name, pk=None):
    """
    Returns tastypie url
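# A minimal usage sketch for extract_keywords; the title is illustrative.
print(extract_keywords("Running Nationally Televised Debates"))
# prints a deduplicated mix of the lowercased originals, their lemmas, and
# their Lancaster stems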
from nltk.stem import LancasterStemmer

def stem(array, word):
    """Replace word in array with its Lancaster stem (mutates array in place)."""
    stemmed = LancasterStemmer().stem(word)
    array.remove(word)
    array.append(stemmed)
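# A short usage sketch; note that stem() mutates the list in place and moves
# the stemmed word to the end.
tokens = ['running', 'quickly']
stem(tokens, 'running')
print(tokens)  # ['quickly', 'run'] with the Lancaster stemmer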
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer
pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer
lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer
snl = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + snl.stem(words))

# lemmatizing
from nltk import WordNetLemmatizer
wordnet = WordNetLemmatizer()
for words in words_stem:
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
lemmetizer = WordNetLemmatizer()


def stem_each_word(tokens, lancaster_file, porter_file, snowball_file,
                   lemmetizer_file, trigrams_file):
    # open the output files via context managers so they are always closed
    with open(lancaster_file, "a+") as lancaster_file_out, \
            open(porter_file, "a+") as porter_file_out, \
            open(snowball_file, "a+") as snowball_file_out, \
            open(lemmetizer_file, "a+") as lemmetizer_file_out, \
            open(trigrams_file, "a+") as trigrams_file_out:
        for token in tokens:
            porter_file_out.write(str(pStemmer.stem(token)) + "\t")
            lancaster_file_out.write(str(lStemmer.stem(token)) + "\t")
            snowball_file_out.write(str(sStemmer.stem(token)) + "\t")
            lemmetizer_file_out.write(str(lemmetizer.lemmatize(token)) + "\t")
        # write the trigram list once per call, not once per token as before
        trigrams_file_out.write(str(list(ngrams(tokens, 3))))
        porter_file_out.write("\n")
        lancaster_file_out.write("\n")
        snowball_file_out.write("\n")
        lemmetizer_file_out.write("\n")
        trigrams_file_out.write("\n")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 8.
"""
# Stemming vs. lemmatization - compare the stems of 'lying' and 'lie'
from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('lying'))
print(LancasterStemmer().stem('lie'))
print()

print('PorterStemmer')
print(PorterStemmer().stem('lying'))
print(PorterStemmer().stem('lie'))