def _create_stemmer(stemmer_type):
    """ Initialize a stemmer """
    return {
        'Porter': PorterStemmer(),
        'Snowball': SnowballStemmer('english'),
        'Lancaster': LancasterStemmer(),
    }[stemmer_type]
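# Minimal usage sketch (assumes PorterStemmer, SnowballStemmer and
# LancasterStemmer are already imported from nltk.stem, which this helper requires):
stemmer = _create_stemmer('Porter')
print(stemmer.stem('running'))  # -> 'run'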
Example #2
def process(word_list):
    lancaster = LancasterStemmer()
    new_list = []
    for word in word_list:
        w = lancaster.stem(word)
        new_list.append(w)
    return new_list
Example #3
def words_stemmer(words,
                  type="PorterStemmer",
                  lang="english",
                  encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "LancasterStemmer", "SnowballStemmer"
    ]
    if type is False or type not in supported_stemmers:
        return words
    else:
        stem_words = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
        elif type == "LancasterStemmer":
            stemmer = LancasterStemmer()
        else:  # SnowballStemmer
            stemmer = SnowballStemmer(lang)
        for word in words:
            # stem() returns str in Python 3, so the stems are kept as text;
            # encoding them to bytes here would break the str.join() below
            stem_words.append(stemmer.stem(word))
        return " ".join(stem_words)
Example #4
def Stem(s):
    if isinstance(s, str) and len(s) > 0:
        stemmer = LancasterStemmer()
        s = " ".join([stemmer.stem(z) for z in s.split(" ")])
        s = s.lower()
        return s
    else:
        return ""
Example #5
    def stem_words(self, words):
        """Stem each word in a space-separated string, returning a string of stems"""
        stemmer = LancasterStemmer()
        stems = ""
        for word in words.split(" "):
            stem = stemmer.stem(word)
            stems = stems + " " + stem
        return stems
Example #6
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems
Example #7
def main():
    save_data_from_webpage()
    
    text = get_data_from_file()
  
    
    # creates a list of the tokenized words
    tt = word_tokenize(text)
    pprint(tt)

    # creates new lists of stemmed words using each of the stemmers
    psteam = PorterStemmer()
    psteam_list = []
    for word in tt:
        psteam_list.append(psteam.stem(word))
    pprint(psteam_list)

    lsteam = LancasterStemmer()
    lsteam_list = []
    for word in tt:
        lsteam_list.append(lsteam.stem(word))
    pprint(lsteam_list)

    ssteam = SnowballStemmer("english")  # SnowballStemmer requires a language argument
    ssteam_list = []
    for word in tt:
        ssteam_list.append(ssteam.stem(word))
    pprint(ssteam_list)

    p = set(psteam_list)
    l = set(lsteam_list)
    s = set(ssteam_list)
    # displays the differing stems
    pprint(s.difference(l.difference(p)))

    # pos tagging
    pos_list = pos_tag(tt)  # pos_tag expects a token list, not a raw string
    pprint(pos_list)

    # creates a new list of the lemmatized words
    lemmatizer = WordNetLemmatizer()
    lem = []
    for word in tt:
        lem.append(lemmatizer.lemmatize(word)) 
    #pprint(lem)
    
    # returns a generator of trigrams using the tokenized list tt
    trig = trigrams(tt)
    # displays the results
    print(list(trig))
    
    # ne_chunk finds non-overlapping named-entity groups
    # pos_tag identifies how each token is used in speech
    NamedEntity = ne_chunk(pos_tag(wordpunct_tokenize(text)))
    print(NamedEntity)
Example #8
    def _normalize(self, item):
        key, value = item
        ls = LancasterStemmer()
        text = word_tokenize(value[0])
        text = [word.lower() for word in text]
        text = [
            ls.stem(word).rstrip('s')
            for word in text
            if word not in stopwords.words('english') and word.isalnum()
        ]
        return (key, (text, value[1]))
Example #9
def __stem_document(document_name: pathlib.Path) -> list:
    stemmer = LancasterStemmer()
    with document_name.open('r', encoding='utf-8') as document:
        lines = document.readlines()
    result = []
    for line in lines:
        line = line.strip()
        words = [stemmer.stem(word) for word in line.split(' ')]
        sentence = ' '.join(words)
        result.append(sentence)
    return result
Example #10
def get_stems(tokens):
    stemmer = LancasterStemmer()
    stemmed_tokens = []
    for token in tokens:
        for word in token:
            if word[1] in ('DT', 'PRP', 'PRP$', 'NN', 'NNP', 'NNPS'):
                temp_tokens = word[0]
            else:
                temp_tokens = stemmer.stem(word[0])
            stemmed_tokens.append(temp_tokens)
    return get_lemma(stemmed_tokens)
Example #11
def getStemsFromURL(page_url):
    '''
    Given the link of a webpage (string), returns a list of 
    all the words' stems in the webpage text
    '''
    with urlopen(page_url) as infile:
        soup = BeautifulSoup(infile, features="lxml")

    ls = LancasterStemmer()
    words = word_tokenize(soup.text)
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalpha()]
    return words
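# Minimal usage sketch (the URL is purely illustrative; assumes urlopen,
# BeautifulSoup, word_tokenize and the NLTK stopwords corpus are available):
stems = getStemsFromURL("https://en.wikipedia.org/wiki/Stemming")
print(stems[:10])  # first ten stems found on the page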
Example #12
    def clean_tweets(self, text):
        st = LancasterStemmer()
        # st = PorterStemmer()
        with open('newspaper3k/SmartStoplist.txt', 'r') as f:
            stopwords = [line.strip() for line in f]

            # remove URLs
            text = re.sub(r'http\S+', '', text)
            cleaned = []
            for k in text.split("\n"):
                line = re.sub(r"[^a-zA-Z0-9]+", ' ', k).lower()
                cleaned.append(st.stem(line))
            # accumulate every processed line of the tweet text
            tweet_tmp = ' '.join(cleaned)
            tweet_tmp = ''.join([i for i in tweet_tmp if not i.isdigit()])
            tweet_tmp = tweet_tmp.split()
            result = [word for word in tweet_tmp if word not in stopwords]
            return result
Example #13
def checkstemmers():
    raw = customparse(
        "C://cygwin//home//nelson auner//Pontikes//FinalData.OctNewKeepAndAnonymous/capsavem/my_cape/outtoget.cap.txt"
    )
    wordz = raw.split(" ")
    O = ["sweating", "tripping", "gunning", "going"]
    HH = [i[0:-1] for i in O]
    dic = enchant.Dict("en_US")
    from nltk import LancasterStemmer, PorterStemmer
    lancaster = LancasterStemmer()
    porter = PorterStemmer()
    resporter = [porter.stem(t).replace(" ", "") for t in wordz]
    reslan = [lancaster.stem(t).replace(" ", "") for t in wordz]
    resall = [[wordz[i], resporter[i], reslan[i]] for i in range(len(wordz))]
    filtres = [
        resall[i] for i in range(len(resall))
        if not (resall[i][0] == resall[i][2] == resall[i][1])
    ]
    return resall
Example #14
def getMostUsedWordsTxt(file, wordnum):
    '''
    Given a text file name (string) and the number of most
    used words we want to find (int), returns a list of the wordnum
    most common elements and their counts from the most common
    to the least:
    [('1st_most_common_word', count1), 
    ('2nd_most_common_word', count2), 
    ...,
    ('wordnumth_most_common_word', countwordnum)]
    '''
    with open(file, "r") as f:
        words = f.read()
        words = words.split()

    ls = LancasterStemmer()
    words = [w.lower() for w in words]
    words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalpha()]
    freqs = Counter(words)
    return freqs.most_common(wordnum)
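# Minimal usage sketch (the file name is illustrative; assumes LancasterStemmer,
# stopwords and Counter are imported as the function requires):
top_words = getMostUsedWordsTxt("article.txt", 5)
print(top_words)  # list of (stem, count) pairs, most common first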
Example #15
    def tokenize(self, description):

        filtered = []
        # don't process NaN or Null values
        if pd.isnull(description):
            return filtered
        else:
            terms = description.lower().split()
            # terms = word_tokenize(description.lower().decode('utf-8'))
            filtered_stopwords = [word for word in terms if word not in stopwords.words('english')]

            # # Stemming Snowball
            # stemmer = SnowballStemmer('english')
            # for stem in filtered_stopwords:
            #     filtered.append(stemmer.stem(stem.decode('utf-8')))

            # # Stemming Porter
            # stemmer = PorterStemmer()
            # for stem in filtered_stopwords:
            #     filtered.append(stemmer.stem(stem.decode('utf-8')))

            # Lemmatizer Word Net Lemmatizer
            lemmatizer = WordNetLemmatizer()
            for lemmatized in filtered_stopwords:
                filtered.append(lemmatizer.lemmatize(lemmatized))

            filtered_final = []
            # Stemming Lancaster
            stemmer = LancasterStemmer()
            for stem in filtered:
                # filtered_final.append(stemmer.stem(stem.decode('utf-8')))
                filtered_final.append(stemmer.stem(stem))

            # # Lemmatizer TextBlob
            # for lemmatized in filtered_stopwords:
            #     w = Word(lemmatized.decode('utf-8'))
            #     filtered.append(w.lemmatize)

            return filtered_final
Example #16
def get_words_from_string(string):
    string = string.lower()
    word_pattern = r'[A-Za-z]+'
    # link_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})"
    # email_pattern = r"\S+@\S+"
    # ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    result = []
    # for x in re.findall(link_pattern, string):
    #     try:
    #         url = "{0.scheme}://{0.netloc}/".format(urlsplit(x))
    #     except:
    #         url = x
    #     result.append(url)
    # string = re.sub(link_pattern, "", string)
    # result.extend(re.findall(email_pattern, string))
    # string = re.sub(email_pattern, "", string)
    # result.extend(re.findall(ip_pattern, string))
    # string = re.sub(ip_pattern, "", string)
    # stemmer = PorterStemmer()
    stemmer = LancasterStemmer()
    result.extend(
        [stemmer.stem(word) for word in re.findall(word_pattern, string)])
    # result.extend(re.findall(word_pattern, string))
    return result
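# Minimal usage sketch: alphabetic tokens are pulled out with word_pattern and
# Lancaster-stemmed (assumes re and LancasterStemmer are imported):
print(get_words_from_string("Running runners were eating quickly."))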
Example #17
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import math  # for more advanced math operations

app = Flask(__name__)

# ----------DOCUMENT DATABASE CONFIGURATION----------
db = mysql.connector.connect(host="localhost",
                             user="******",
                             passwd="",
                             database="stki")
cursor = db.cursor()

# create a variable to hold the stopwords
stop_words = set(stopwords.words('english'))
lancaster = LancasterStemmer()  # Lancaster/Paice-Husk Stemmer
eliminasi = [
    '.', '?', '!', ' ', ',', ':', ';', '(', ')', '\'', '"', '%', '&', '*', '-',
    '_', '+', '=', '{', '}', '[', ']', '\\', '|', '"', '<', '>', '/', '0', '1',
    '2', '3', '4', '5', '6', '7', '8', '9', '�'
]


def preProcessDoc(docs):
    docs_token = word_tokenize(docs)
    arr = []
    for i in range(len(docs_token)):
        docs_token[i] = docs_token[i].lower()
        if docs_token[i] not in stop_words:
            skip = 0
            for j in range(len(docs_token[i])):
Example #18
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/7/11 17:50
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : stemmers.py
# @Software   : PyCharm
# @Description: stemming (stem extraction)

from nltk import PorterStemmer, LancasterStemmer, word_tokenize

raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and " \
      "loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And " \
      "I will have my vengeance, in this life or the next. "
tokens = word_tokenize(raw)  # tokenize the text into words
porter = PorterStemmer()  # removes relatively fewer suffixes
pStems = [porter.stem(t) for t in tokens]  # strips suffixes such as s, es, e, ed, al
print(pStems)

lancaster = LancasterStemmer()  # more aggressive than Porter
lStems = [lancaster.stem(t) for t in tokens]  # lowercases words and strips suffixes
print(lStems)
Example #19
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 7.
"""

# Stemming words - test your tools

from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('nation'))
print(LancasterStemmer().stem('nationality'))
print(LancasterStemmer().stem('nationally'))
print(LancasterStemmer().stem('natural'))
print(LancasterStemmer().stem('naturally'))
print(LancasterStemmer().stem('nature'))
print()
print('PorterStemmer')
print(PorterStemmer().stem('nation'))
print(PorterStemmer().stem('nationality'))
print(PorterStemmer().stem('nationally'))
print(PorterStemmer().stem('natural'))
print(PorterStemmer().stem('naturally'))
print(PorterStemmer().stem('nature'))
Example #20
    'page': TITLE,
    'format': "json"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# get the text
wiki_page_text = DATA["parse"]["text"]["*"]

h = html2text.HTML2Text()
h.ignore_links = True
page_text = h.handle(wiki_page_text)

# create a new stemmer
ls = LancasterStemmer()

# tokenize text
words = nltk.word_tokenize(page_text)

words = [w.lower() for w in words]

# eliminate stop words and stem the rest of the words
words = [ls.stem(w) for w in words if w not in stopwords.words("english") and w.isalnum()]

freqs = Counter(words)

print("The 10 most frequently used stems in the ''Data science'' Wikipedia page are:")
for word, count in freqs.most_common(10):
    print(word, count)
Example #21
import re
import logging

from nltk import WordNetLemmatizer, LancasterStemmer

from django.core.urlresolvers import reverse

logger = logging.getLogger(__name__)
wordnet_lemmatizer = WordNetLemmatizer()
lancaster_stemmer = LancasterStemmer()


def extract_keywords(title):
    original_keywords = [keyword.lower() for keyword in re.split(r'\W+', title)]

    try:
        # wrap in list() so the Python 3 map iterators can be concatenated below
        lemmatized_keywords = list(map(wordnet_lemmatizer.lemmatize,
                                       original_keywords))
    except LookupError:
        logger.error('Please install corpora/wordnet dictionary')
        return []

    stemmed_keywords = list(map(lancaster_stemmer.stem, original_keywords))

    return list(set(original_keywords + lemmatized_keywords +
                    stemmed_keywords))


def reverse_tastypie_url(resource_name, pk=None):
    """
    Returns tastypie url
Example #22
def stem(array, word):
    stemmed = LancasterStemmer().stem(word)
    array.remove(word)
    array.append(stemmed)
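# Minimal usage sketch: the helper mutates the list in place, removing the word
# and appending its Lancaster stem (the word must already be in the list):
tokens = ['running', 'fast']
stem(tokens, 'running')
print(tokens)  # 'running' is replaced by its stem, appended at the end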
Example #23
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer

pst = PorterStemmer()
pst.stem("having")
pst.stem("sudeep")

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer

snl = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + snl.stem(words))

# lemmatizing

from nltk import WordNetLemmatizer
wordnet = WordNetLemmatizer()

for words in words_stem:
Example #24
from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

pStemmer = PorterStemmer()
lStemmer = LancasterStemmer()
sStemmer = SnowballStemmer('english')
lemmetizer = WordNetLemmatizer()


def stem_each_word(tokens, lancaster_file, porter_file, snowball_file,
                   lemmetizer_file, trigrams_file):
    lancaster_file_out = open(lancaster_file, "a+")
    porter_file_out = open(porter_file, "a+")
    snowball_file_out = open(snowball_file, "a+")
    lemmetizer_file_out = open(lemmetizer_file, "a+")
    trigrams_file_out = open(trigrams_file, "a+")
    for token in tokens:
        porter_file_out.write(str(pStemmer.stem(token)) + "\t")
        lancaster_file_out.write(str(lStemmer.stem(token)) + "\t")
        snowball_file_out.write(str(sStemmer.stem(token)) + "\t")
        lemmetizer_file_out.write(str(lemmetizer.lemmatize(token)) + "\t")
    trigrams_file_out.write(str(list(ngrams(tokens, 3))))
    porter_file_out.write("\n")
    lancaster_file_out.write("\n")
    snowball_file_out.write("\n")
    lemmetizer_file_out.write("\n")
    trigrams_file_out.write("\n")
Example #25
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 8.
"""

# Lemmatize words

from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('lying'))
print(LancasterStemmer().stem('lie'))
print()
print('PorterStemmer')
print(PorterStemmer().stem('lying'))
print(PorterStemmer().stem('lie'))