Example #1
 def tokenise(self, stem: bool = False) -> List[str]:
     words = word_tokenize(self.content)
     if stem:
         stemmer = RegexpStemmer('ing$|s$|ed$|y$|er$|[^aeiou]{1}y$|e$',
                                 min=3)
         words = [stemmer.stem(word) for word in words]
     return words
Example #2
        def f(s):
            if s is not None:
                line = s.lower().replace('"', ']').replace('\'', ' ')			# converting words in lowercase
                tokenized_words = word_tokenize(line)					# tokenizing

                regexFile="regex.txt"
                Snowballstemmer=SnowballStemmer("english")

                RegexStemmer=[]                                #Stemmer for Regular expression
                with open(regexFile,'r') as regFile: 
                    while True:
                        line = regFile.readline()
                        print(line)
                        if not line:
                            break
                        RegexStemmer=RegexpStemmer(line,min=2)

                data =filter(lambda x: x not in stopwords, tokenized_words)		# data=[tokenized_words - nouse_words]
                lmtzr = WordNetLemmatizer()
                list_of_words=[]
                for item in data:
                    if len(item)>2:							# words with length <=2 are removed
                        #rlemma=lmtzr.lemmatize(item)				# lemmatizing				
                        # stemming
                        x=RegexStemmer.stem(item)
                        #x=Snowballstemmer.stem(regx)

                        if len(x)>2:
                            list_of_words.append(x)					# adding item to list_of_words

                t = ' '.join(str(item) for item in list_of_words)
                return t
Example #3
def my_stem(word):
    st = RegexpStemmer('ness$|ity$|ment', min=4)
    if word.endswith('acy'):
        stem = word[:-2]
        stem += 'te'
    elif word.endswith('cy'):
        stem = word[:-2]
        stem += 't'

    elif word.endswith('ility'):
        stem = word[:-5]
        stem += 'le'
        if stem not in model.vocab:
            stem = word[:-3]

    # elif word.endswith('ality'):
    #     stem = word[:-5]
    #     if stem not in model.vocab:
    #         stem = word[:-3]

    elif word.endswith('ce'):
        stem = word[:-2]
        stem += 't'

    else:
        stem = st.stem(word)
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'
    return stem
Example #4
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    # Portst = PorterStemmer()
    # Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
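    # (added note) without '$' anchors this pattern removes 'ing'/'ed' wherever
    # they occur in the word (e.g. 'singer' -> 'ser'), not only as suffixes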
    new = Regst.stem(word)
    return new
Example #5
def cleanText( raw_text ):
    text = raw_text

    # replace non-alpha characters
    text = re.sub( r'[^a-z\s]+','', text, flags=re.IGNORECASE )

    # replace multiple spaces with a single one
    text = re.sub(r'(\s+)',' ', text )

    # converting string to lower case
    text = text.lower()

    # regex to remove punctuation
    tokenizer =  RegexpTokenizer( r'\w+' )

    # initial tokenization
    tokenized_text = tokenizer.tokenize( text )
    # stemmer to remove plurals
    stemmer = RegexpStemmer( 's$|ies$' )

    # remove stop words
    stop_words = set(['whom', 'that', 'those', "needn't", 'where', 'has', 'same', 'had', 'we', 'my', 'hers', 'does', 'they', 'the', 'only', "doesn't", 'be', 'mightn', 'her', 'wasn', 'being', 'am', 'but', 'themselves', 'during', "don't", 'into', 'its', 'isn', 'of', 'won', 'few', 'as', 'own', 'more', "shouldn't", 'myself', "mightn't", 'after', 'below', "didn't", "you've", 'wouldn', 'any', 'his', 'in', 'hasn', "weren't", 'him', 'she', 'will', "won't", 'it', 'y', 'he', 'now', 'such', 'haven', 'most', 'who', 'an', 'shan', 'at', "she's", 'were', 'weren', 'do', 'did', 've', 'all', 'between', 'above', "you're", 'no', "you'll", 'which', 'i',
'been', 'doesn', "hasn't", 'each', 'some', 'don', "aren't", 'should', 'mustn', 'our', "wouldn't", 'their', 'your', 'yours', 'doing', 'why', "hadn't", 'down', 'so', 'for', 'while', 'this', "shan't", 'there', 'needn', 'up', 'shouldn', 'by', "mustn't", 'have', 'yourself', "you'd", 'd', "haven't", 'about', 'ain', 'or', 'ourselves', 'when', "couldn't", 'is', 'with', "that'll", 'these', 'further', "should've", 'if', 'than', 'just', "wasn't", 'other', "isn't", 'you',
'then', 'how', 'too', 'until', 'very', 'are', 'to', 'itself', 'aren', 't', 'a', 'before', 'm', 'can', 'out', 'and', 'under', 'here', 'o', 'on', 'theirs', 'ma', 'couldn', 'having', 'himself', 'against', 'again', 'll', 'nor', 'hadn', 'ours', 'through', 'both', 'because', 'what', 's', 'them', 'not', 'off', 'me', "it's", 'once', 'over', 'didn', 'was', 're', 'from', 'yourselves', 'herself'])
    clean_text = []
    for word in tokenized_text:
        if word not in stop_words:
            # make plurals singular
            token = stemmer.stem( word )
            clean_text.append( token )

    return clean_text
Example #6
def stem_words(text):
    words = word_tokenize(text)
    #Regex for Suffixes
    st = RegexpStemmer('ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$', min=4)
    stemmed = []
    for word in words:
        stemmed.append(st.stem(word))
    return ' '.join(stemmed)
Example #7
 def __init__(self, language):
     self.language = language
     if self.language == "eng":
         self.model = WordNetLemmatizer()
     elif self.language == "nso":
         self.model = RegexpStemmer('ng$', min=4)
     else:
         self.model = None
Example #8
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
        if new == word:
            new = Regst.stem(word)
    return new
Example #9
    def __init__(self, db):
        super().__init__(db)
        self.nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')

        # Remove affixes from a word: it's -> it, we'll -> we
        stemmer_pattern = r"’s$|n’t$|’ll$|’re$|’ve$|’d$|’m$|'s$"
        stemmer_pattern += r"|n't$|'ll$|'re$|'ve$|'d$|'m$"  # no trailing '|' (it would add an empty alternation branch)
        self.stemmer = RegexpStemmer(stemmer_pattern)

        # Part-of-speech tagger
        self.tagger = nltk.tag.pos_tag
        self.wordnetlemmatize = WordNetLemmatizer()

        self._stop_words = None
        self._junk_symbols = None
        self._proper_nouns = None
Example #10
def analyze(text, stop, stem, wstem):
    # Set utilities
    if stop:
        stopeng = set(stopwords.words('english'))
    if wstem:
        stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    if stem:
        stemmer = PorterStemmer()
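    # (added note) if both wstem and stem are passed, PorterStemmer wins because
    # it is assigned to 'stemmer' last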
    tok = RegexpTokenizer(r'\w+')
    # Remove weird characters
    text = stripSpecial(text)
    # Tokenize and lowercase
    text = tok.tokenize(text.lower())
    # Remove stopwords if flagged
    if stop:
        text = [w for w in text if w not in stopeng]
    # Stem if flagged
    if (stem or wstem):
        text = [stemmer.stem(w) for w in text]
    return ' '.join(text)
Example #11
class word_lemmatiser:
    def __init__(self, language):
        self.language = language
        if self.language == "eng":
            self.model = WordNetLemmatizer()
        elif self.language == "nso":
            self.model = RegexpStemmer('ng$', min=4)
        else:
            self.model = None

    def lemma(self, x):
        if self.language == "eng":
            return self.model.lemmatize(x[0])
        elif self.language == "nso":
            return self.model.stem(x[0].lower())
        elif self.language == "zul":
            return x[2]
        else:
            return x[0]

    def identity(self, word):
        return word
Example #12
    def word_refiner(*args):
        Portst = PorterStemmer()
        Landst = LancasterStemmer()
        Regst = RegexpStemmer('ing|ed|ly|lly')
        args = [i for i in args if isinstance(i, unicode)]

        for w in map(str, args):
            if w in dic1:
                yield w
            else:
                st1 = Portst.stem(w)
                if st1 in dic1:
                    yield st1
                else:
                    st2 = Landst.stem(w)
                    if st2 in dic1:
                        yield st2
                    else:
                        st3 = Regst.stem(w)
                        if st3 in dic1:
                            yield st3
                        else:
                            yield w
Example #13
def normalize(sentences, stem_type):
    G = nx.DiGraph()

    # Create stemmer object of the type specified by stem_type
    stemmers = {
        '-p': PorterStemmer(),
        '-l': LancasterStemmer(),
        '-s': SnowballStemmer('english'),
        '-w': WordNetLemmatizer(),
        '-r': RegexpStemmer('ing$|s$|e$|able$', min=4)
    }

    try:
        stemmer = stemmers[stem_type]
    except KeyError:
        print('\nInvalid stemmer type passed as argument.\n')
        return

    # Define collections to reference during
    # normalization and initialize stemmer
    punc = set(string.punctuation)
    stop = stopwords.words('english')
    # Iterate over sentences, normalizing and
    # creating vertices for our graph as we go
    i = 0
    for s in sentences:
        if len(s) > 1:
            l = (s.lower()).split(' ')
            # eliminate stop words
            norm = [w for w in l if w not in stop]
            # apply stemming to each word
            if stem_type == '-w':
                norm = [stemmer.lemmatize(w) for w in norm]
            else:
                norm = [stemmer.stem(w) for w in norm]
            # remove punctuation from each word
            temp = []
            for w in norm:
                w = ''.join([l for l in w if l not in punc])
                temp += w
                temp += ' '
                norm = ''.join(temp)
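                # (added note) 'temp += w' extends the list character by character,
                # so the final join yields the cleaned words separated by spaces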
            G.add_node(i, iden=i, raw=s, nrm=norm)
            i += 1
    return G
Example #14
def stemming(lines, algorithm=3):

    #selecting the algorithm to use
    #total 57370 in 12/11/2015
    if algorithm == 0:
        #results with this algorithm: 56700 features
        stemmer = PorterStemmer()
    elif algorithm == 1:
        #results with this algorithm: 57731 features
        stemmer = LancasterStemmer()
    elif algorithm == 2:
        #results with this algorithm: 58007 features
        stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)
    elif algorithm == 3:
        #results with this algorithm: 56282 features (stopwords removed after stemming)
        # 55230 if stopwords are removed first with method==2
        stemmer = SnowballStemmer("english")
    elif algorithm == 4:
        #results with this algorithm: 56795 features
        wnl = WordNetLemmatizer()
    else:
        raise ValueError('Algorithm value should be in [0-4]')

    stemmed_lines = []
    # run thru all lines
    for each_line in lines:
        a_line_stemmed = ''

        #tokenize each line
        tokens = each_line.split()

        # run thru all tokens
        for each_token in tokens:
            #do the stemming on each token and join the tokens back together
            if algorithm == 4:
                a_line_stemmed = a_line_stemmed + ' ' + wnl.lemmatize(
                    each_token)
            else:
                a_line_stemmed = a_line_stemmed + ' ' + stemmer.stem(
                    each_token)

        #recreate the list all over
        stemmed_lines.append(a_line_stemmed)
    return stemmed_lines
Example #15
def example3(word='Amevive'):
    '''stem algorithm (word-stem extraction algorithms)
    '''
    ''' nltk.stem.lancaster module '''  ## recommended stemming algorithm 1
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    print st.stem(word)
    ''' nltk.stem.porter module '''  ## recommended stemming algorithm 2
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    print stemmer.stem(word)
    ''' nltk.stem.regexp module '''  ## regex-based stemming algorithm
    from nltk.stem import RegexpStemmer
    st = RegexpStemmer('ing$|s$|e$', min=4)
    print st.stem(word)
    ''' nltk.stem.snowball module '''  ## multilingual stemming algorithm
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer('english')  # Choose a language
    print stemmer.stem(word)
Example #16
def stemming(tokens, Type='ps', rgxRule='ing$|s$|ed$', MIN=4):
    '''
    Code adapted from the text "Text Analytics with Python" by Dipanjan Sarkar.
    This function stems the tokens to get their root forms.
    Stemmers:
       - LancasterStemmer
       - RegexpStemmer   # user-defined rules
       - SnowballStemmer # can stem other languages
       - PorterStemmer
    '''
    stemmers = {
        'ps': PorterStemmer(),
        'ls': LancasterStemmer(),
        'sn': SnowballStemmer("english"),
        'rg': RegexpStemmer(rgxRule, MIN)
    }
    stemmer = stemmers[Type]
    stemmed_list = []
    for i in tokens:
        stemmed_list = stemmed_list + [stemmer.stem(i)]
    return stemmed_list
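
# Hypothetical usage (added for illustration; assumes the nltk.stem imports this snippet relies on):
# stemming(['jumping', 'jumps', 'jumped', 'strange'], Type='rg')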
Example #17
def remove_english(text, cooking_list):
    stemmer = RegexpStemmer("ed$|'s$")
    stemmer1 = RegexpStemmer("d$")
    text = treebank_tokenizer.tokenize(text)
    lemmatized_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    lemmatized_text = [w for w in lemmatized_text if w not in cooking_list]
    
    lemmatized_stemmed_text = []

    for w in lemmatized_text:
        w = stemmer.stem(w)
        w = stemmer1.stem(w)
        lemmatized_stemmed_text.append(w)
        
    tokenized_Italian_text = [w for w in lemmatized_stemmed_text if w not in words.words()]
    Italian_text = ' '.join(tokenized_Italian_text)
    
    Italian_text = re.sub('[^a-zA-ZÀ-ÿ.\s]', '', Italian_text) #removing all the numbers and special characters
    
    return Italian_text
Example #18
#!/usr/bin/env python
# coding: utf-8

# # Task-6

# ## A. TYPES OF STEMMERS

# ### I. REGEX STEMMER 

# In[1]:


import nltk
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing')
stemmerregexp.stem('running')


# ### II. SNOWBALL STEMMER

# In[7]:


import nltk 
from nltk.stem import SnowballStemmer
SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')


# ### III. LANCASTER STEMMER
Example #19
# <nbformat>2</nbformat>

# <markdowncell>

# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>. For example, the stem of <strong>
#     growing</strong> is <strong>grow</strong>. </p>
# <p>NLTK includes 4 stemming algorithms, 3 of which are demonstrated below. The fourth, <em>Snowball</em>, is for non-English languages
#     and is not covered here but is in the text. </p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)
Example #20
def OutputRelations(abstractFileName, seta, negSet, neutralSet, negationSet,
                    posSet, fullNames, threshold):
    #added threshold in input format
    #recent change: no longer using filename for abstract. instead, input the string of the abstract

    import nltk
    import copy
    import re
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem import RegexpStemmer

    sentencedb = dict()
    fullnamestore = dict()
    a = readf(fullNames)
    for i in a:
        i = i.split(";")
        if len(i) > 1:
            #storing the full names, using the short symbols as dict keys
            fullnamestore[i[0]] = i[1]
        else:
            fullnamestore[i[0]] = "none"
    #sentencedb indexes the sentences by a unique identifier (int)

    def isGene(x, t, sentence):

        #checks if gene 'x' in a list of tokens 't' is really a gene or a variable with the same name
        if len(t) > 1 and len(x) > 2:

            if t.index(x) == 0:
                if t[t.index(x) + 1] in [">", "<", "=", "score"]:
                    return False
            elif t.index(x) == len(t) - 1:
                if t[t.index(x) - 1] in [">", "<", "=", "score"]:
                    return False
            elif (t[t.index(x) + 1] in [
                    ">", "<", "=", "score"
            ]) or (t[t.index(x) - 1] in [">", "<", "=", "score"]):
                return False
            elif (t[t.index(x) + 1], t[t.index(x) - 1]) == (")", "("):
                if x in fullnamestore:
                    if fullnamestore[x] != "none":
                        fullLength = len(fullnamestore[x])
                        #full length is length of full name
                        if t.index(x) > len(fullnamestore[x]) + 2:
                            if sentence[(t.index(x) - 1 -
                                         fullLength):(t.index(x) -
                                                      1)] == fullnamestore[x]:
                                return True
                            else:
                                return False

            else:
                return True
            return True
        else:
            return False

    def countgenes(s, geneset):
        #counts the number of unique genes in a sentence  "s"
        ss = nltk.word_tokenize(s)
        numgenes = 0
        existingGenes = []
        for i in ss:
            if i in geneset and isGene(i, ss, s) and i not in existingGenes:
                numgenes += 1
                existingGenes.append(i)

        return numgenes

    def countWords(gene1, gene2, token):

        #counts the words between gene 1 and gene2
        count = 0
        for i in xrange(token.index(gene1) + 1, token.index(gene2) - 1):
            count += 1
        return count

    #abstracts = open(abstractFileName,"r")

    storage = dict()

    b = []
    #a=a.replace("\n"," ")
    #for i in a.split("\n\n"):
    #   i=i.replace("\n"," ")
    #  b.append(i)
    #print b[4]
    #print b[-1].split()[3]
    for x in abstractFileName.split("\n\n"):
        x = x.replace("\n", " ")
        b.append(x)
        #print x
        #x =x.split("\t")
        #print x
    parsedB = []
    for line in b:
        if len(line) > 0:
            parsedB.append(line)
    b = parsedB
    # print b
    sentencelist = re.split("\. (?=[A-Z])", b[-2])
    sentencelistcopy = copy.deepcopy(sentencelist)
    l = len(sentencelist)
    for i in xrange(l):

        if countgenes(sentencelistcopy[i], seta) < 2:
            sentencelist.remove(sentencelistcopy[i])
        # print b[-1]
        storage[b[-1].split()[1]] = sentencelist

    #abstracts.close()
    #print sentencelistcopy,sentencelist,storage

    num_genes = 0
    bw = 0
    gene_names = seta

    st = RegexpStemmer('ing$|s$|e$|ed$|es$', min=4)
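    # (added note) this regex stemmer strips common verb endings so that the
    # POS-tagged verbs below can be matched against the posSet/negSet cue words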

    def findsuf(string, x):
        a = ""
        for i in xrange(x):
            a += string[len(string) - 1 - (x - i - 1)]

        return a

    finalOutput = []

    for id in storage:
        countsentences = 0
        for sentence in storage[id]:

            rlist = [0, 0, 0]
            #sentence = storage[id]

            tokens = nltk.word_tokenize(sentence)
            tokenscopy = copy.deepcopy(tokens)
            tagged = nltk.pos_tag(tokens)

            for x in tagged:

                if x[1] in ['VBP', 'VBN', 'VBZ', 'VBG', 'VB']:
                    tokenscopy[tagged.index(x)] = st.stem(x[0])
            store = 0
            genes = []
            #print tokens,tokenscopy
            relation = 2
            currentlist = []
            direction = 0
            for x in tokens:

                if x in gene_names and x not in currentlist and isGene(
                        x, tokens, sentence):
                    genes.append(x)
                    num_genes += 1
                    currentlist.append(x)
                    #store = tokens.index(x)

            in1 = tokens.index(genes[0])
            in2 = tokens.index(genes[1])
            indexx = 0
            neg = 1
            if countWords(genes[0], genes[1], tokenscopy) <= threshold:

                for i in xrange(in1 + 1, in2):

                    if tokenscopy[i] in posSet:
                        relation = 1

                    elif tokenscopy[i] in negSet:
                        relation = -1
                    #elif tokenscopy[i] in neutralSet:
                    #relation = 0

                    if (tokenscopy[i] in negSet or tokenscopy[i] in posSet):
                        for y in xrange(in1 + 1,
                                        tokenscopy.index(tokenscopy[i])):
                            if tokenscopy[y] == "not":
                                relation = 0
                                #2 means neutral
                        if findsuf(tokens[i], 2) == "ed":
                            direction = 1

                        else:
                            direction = 0

                if direction == 0:
                    rlist = [genes[0], genes[1], relation]
                    #print genes[0],relation,genes[1]
                elif direction == 1:
                    rlist = [genes[1], genes[0], relation]
                    #print genes[1], relation, genes[0]
                # if relation!="none":
                if True:
                    #the above condition is so that it does not output sentences for which no relation
                    #has been found. This makes analysis easier. Must change this during final program.
                    sentencedb[countsentences] = sentence
                    #use this to have the sentences represented by a number
                    #change id to pmid
                    finalOutput.append(
                        [id, sentence, rlist[0], rlist[1], rlist[2]])
                    #use this to have the actual sentences in the output
                    #finalOutput.append([id,countsentences,rlist])

                    countsentences += 1

    return finalOutput
Example #21
import numpy as np  # needed for np.random.rand in test_train_data below
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RegexpStemmer
from nltk.stem.snowball import SnowballStemmer
from tabulate import tabulate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

stop = stopwords.words("english")
st = RegexpStemmer('ing$', min=8)
stemmer = SnowballStemmer("english")
encoding = "utf-8"


class LoadData():
    def __init__(self):
        pass

    def load_data_file(self, file):
        self.data = pd.read_csv(file)

    def test_train_data(self):
        msk = np.random.rand(len(self.data)) < 1.0
        train = self.data[msk]
        test = self.data[~msk]
Example #22
    print(ls.stem(w))
stem_word_list = [ls.stem(w) for w in words_list]
print(stem_word_list.count('jump'))
print(stem_word_list)
print(ls.stem("lying"))
print(ls.stem("strange"))
"""
There are several other stemmers, including RegexpStemmer , where you can build
your own stemmer based on user-defined rules , and SnowballStemmer , which supports
stemming in 13 different languages besides English.
"""

#Regex Based stemmer
from nltk.stem import RegexpStemmer

rs = RegexpStemmer("ing$|s$|ed$", min=4)

for w in words_list:
    print(rs.stem(w))

print(rs.stem("lying"))
print(rs.stem("strange"))

#Snowball stemmer
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("german")

print("supported languages are :", SnowballStemmer.languages)

german_cars = "autobahnen"
Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import csv
from nltk.stem import RegexpStemmer


if __name__ == '__main__':
    patterns = 'i$|t$'
    regexp_stemmer = RegexpStemmer(patterns, 3)

    result_list = list()
    for word in ['Péter', 'szereti', 'Enikőt', 'és', 'Marit']:
        stem = regexp_stemmer.stem(word)
        result_list.append([word, stem])

    with open('output/regexp.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'stem'])
        for i in result_list:
            writer.writerow(i)

    print('See the result in output/regexp.csv')




Example #24
# In[7]:


import nltk
from nltk.stem import LancasterStemmer
stemmerlanc=LancasterStemmer()
stemmerlanc.stem('darling')
#doesn't work here either


# In[8]:


from nltk.stem import RegexpStemmer
regexpStemmer=RegexpStemmer('ing')
regexpStemmer.stem('dancing')
#doesn't support 


# In[10]:


import nltk 
from nltk.stem import SnowballStemmer 
SnowballStemmer.languages
frenchstemmer=SnowballStemmer('french')
frenchstemmer.stem('manges')


# In[11]:
Example #25
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer, WordNetLemmatizer
from features.process_text.patterns import get_stemming_pattern
from nltk import pos_tag
from nltk.corpus import wordnet
import re
from features.process_text.tokenize import is_tokenized, merge_tokens, word_tokenize

_stemming_porter = PorterStemmer().stem

_stemming_lancaster = LancasterStemmer().stem

_stemming_regex = RegexpStemmer(get_stemming_pattern()).stem

_stemming_snowball = SnowballStemmer('english').stem

_STEMMING_DICT = {
    'porter': _stemming_porter,
    'lancaster': _stemming_lancaster,
    'regex': _stemming_regex,
    'snowball': _stemming_snowball
}


def convert_word_stem(string, stemming_id='porter'):
    """Converts words to word stems"""
    stemming = _STEMMING_DICT.get(stemming_id)
    return " ".join([stemming(word_token) for word_token in string.split()])


#correcting repeated characters
Example #26
import nltk
from nltk.stem import RegexpStemmer
st1 = RegexpStemmer('ing')
print("Learning - ", st1.stem('Learning'))
print("Singing - ", st1.stem('Singing'))

print()

st2 = RegexpStemmer('na')
print("Banana - ", st2.stem('Banana'))
Example #27
print ps.stem('strange')

# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

print ls.stem('lying')

print ls.stem('strange')


# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')

print rs.stem('lying')

print rs.stem('strange')


# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")

print 'Supported Languages:', SnowballStemmer.languages

# autobahnen -> highways (German)
Example #28
Created on Fri Apr  8 11:03:16 2016

@author: shen
"""

import time
start_time = time.time()

import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

from nltk.stem import RegexpStemmer
st = RegexpStemmer('s$', min=4)

import re, math
from collections import Counter

from sklearn.ensemble import RandomForestRegressor
from sklearn import pipeline, grid_search
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer
import random
random.seed(22)

strNum = {
Example #29
import nltk
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))

Example #30
ls = LancasterStemmer()

print(ls.stem("jumping"))
print(ls.stem("jumps"))
print(ls.stem("jumper"))
print(ls.stem("strange"))
print(ls.stem("stranger"))
print(ls.stem("lying"))

# REGEXP STEMMER
# Uses regular expressions to identify the morphological affixes in words and any part of the
# string matching the same is removed

# Note that this stemmer is case sensitive (won't work on capitalized affixes)
rs = RegexpStemmer(r"ing$|s$|ed$", min=4)

print(rs.stem("jumping"))
print(rs.stem("colored"))
print(rs.stem("lying"))

# SNOWBALL STEMMER
# Stems words in more than a dozen languages. http://snowballstem.org

ss = SnowballStemmer(language="german")

print("Supported languages: {}".format(SnowballStemmer.languages))

print(ss.stem("autobahnen"))
print(ss.stem("endlich"))
print(ss.stem("unglaublich"))
Example #31
 def regexStemmer(self, term):
     v_sufixos = ['ando', 'endo', 's', 'é']
     expr = 's$|es$'
     stemmer = RegexpStemmer(expr)
     return stemmer.stem(term)
Example #32
from nltk.tokenize import WhitespaceTokenizer

wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)

# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer

wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)

# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer

regex_stemmer = RegexpStemmer('ing$')

' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])

# Porter Stemmer
sentence7 = "Before eating, it would be nice to sanitize your hands with a sanitizer"
from nltk.stem.porter import PorterStemmer

ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sentence7.split()])

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
Example #33
print([pst.stem(w) for w in words])

result = []
for w in words:
    result.append(pst.stem(w))

# Lancaster Stemmer
words = ["sending", "cooking", "files", "lives", "crying", "dying"]
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()  # create the stemmer object
print([lst.stem(w) for w in words])

# Regular-expression stemming (Regexp Stemmer)
words = ["sending", "cooking", "files", "lives", "crying", "dying"]
from nltk.stem import RegexpStemmer
lst = RegexpStemmer('ing')
print([lst.stem(w) for w in words])

# Spanish stemming (Snowball Stemmer)
words2 = ['enviar', 'cocina', 'moscas', 'vidas', 'ilorar', 'morir']
from nltk.stem.snowball import SnowballStemmer
sbst = SnowballStemmer('spanish')
print([sbst.stem(w) for w in words2])

# Base-form restoration (WordNet Lemmatizer)
word3 = ['coocking', 'believes']
from nltk.stem.wordnet import WordNetLemmatizer
wl = WordNetLemmatizer()
print([wl.lemmatize(w) for w in word3])
print([wl.lemmatize(w, pos='v') for w in word3])
Example #35
def get_prescription(text):

    pstem = PorterStemmer()
    with open("symptoms.txt") as f:
        symptoms = f.readlines()
    finalsyns=[]
    for word in symptoms:
        syns = wordnet.synsets(word.strip())
        syns = [s.lemma_names() for s in syns ]
        merged = list(itertools.chain(*syns))
        if len(merged) == 0:
            finalsyns = finalsyns+[pstem.stem(word.strip())]
            ##print(finalsyns)
        else :
            finalsyns = finalsyns+merged
    finalsyns = [f.replace('\n','') for f in finalsyns]
    finalsyns = list(dict.fromkeys(finalsyns))
    #print(finalsyns)
    pstem = PorterStemmer()
    rstem = RegexpStemmer('\(s\)')
    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string)


    with open("Amount.txt") as f:
        Amount = f.readlines()

    Amount = [rstem.stem(x.strip()).split(' - ') for x in Amount]
    Amount = list(chain(*Amount))
    prescription_dataset = tuple(open("dataset2.txt", 'r'))

    with open("Frequency.txt") as f:
        frequency = f.readlines()
    frequency = [rstem.stem(x.strip()).split(' - ') for x in frequency]
    frequency = list(chain(*frequency))

    schedule={}
    with open("schedule.txt") as f:
        for line in f:
             #print(line.split(':'))
             s = line.split(':')
             schedule[s[0].strip().lower()] = s[1].strip()

    data = {}
    prescription = text.lower()
    prescription_tokenized = [word.replace(".","").replace("(","").replace(")","") for word in prescription.split()]
    prescription_tokenized_final = [pstem.stem(word) for word in prescription_tokenized ]
    print(prescription)
    data.update({'prescription': prescription})
    amount=""
    for word in Amount:
        if pstem.stem(word) in prescription_tokenized_final or rstem.stem(word) in prescription_tokenized:
            index = prescription_tokenized_final.index(pstem.stem(word))
            amount = prescription_tokenized[index-1]+" "+prescription_tokenized[index]

    if amount == "":
        print("Amount not mentioned!")
    else:
        print("Amount : " + amount)
    freq= ""
    timing = ""
    if "every" in prescription or "each" in prescription:
        if "every" in prescription:
            ei = prescription_tokenized.index("every")
            re="every"
        elif "each in prescription":
            ei = prescription_tokenized.index("each")
            re="each"
        st = ["minutes","minute","hours","hour","meal","day","days","morning","evening","afternoon"]
        for i in range(0,10):
            s=st[i]
            if s in prescription_tokenized:
                ti=prescription_tokenized.index(s)
                if ti-ei==2:
                    freq=re+" "+prescription_tokenized[ei+1]+" "+s
                elif ti-ei==1 and i>=7:
                    freq= re+" "+ s
    if schedule.get(freq.strip()) is not None:
        timing = schedule.get(freq.strip())

    for word in frequency:
        if word in prescription and word.strip() != "":
            freq=freq + " "+ word
            if schedule.get(word.strip()) is not None:
                timing = schedule.get(word.strip())
    if freq == "":
        print("No Frequency mentioned!")

    symptoms =""
    for s in finalsyns:
        if s in prescription:
            symptoms+= " "+s

    _check = ["", None]        
    if freq in _check:
        return {'error':"No prescription found!"}         
    data.update({"Amount":amount,"Symptoms" :symptoms,"Frequency":freq,"Timings":timing})

    return data
Example #36
more_stop_en = set(get_stop_words("english"))
more_stop_es = set(get_stop_words("spanish"))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))
adj = Adjectives()
tdm = textmining.TermDocumentMatrix()

room = 0
# for line in var:
for line in sys.stdin:
    room += 1
    line = line.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
    line = re.sub("[0-9,#,!,¡,&,.]", "", line)
    line = re.sub("[^a-zA-Z]", " ", line)
    words = line.lower().split()
    st = RegexpStemmer("ing$|s$|able$|thing$|ful$", min=4)
    words = [st.stem(w) for w in words]
    words = [w for w in words if not w in stop_en and not w in stop_es]
    words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
    words = [w for w in words if len(w) > 2]
    tdm.add_doc(" ".join(words))

    good_count = [words.count(ad) for ad in adj.good if ad in words]
    good_count = len(good_count)

    bad_count = [words.count(ad) for ad in adj.bad if ad in words]
    bad_count = len(bad_count)

    print "Comentario " + str(room) + ": Sentiment Score: " + str(good_count - bad_count) + "\n"

Example #37
lstemmer = LancasterStemmer()
lstemmer.stem('dancing')

"""
$$  LancasterStemmer - Most Aggressive.

    LancasterStemmer is mostly used for the cases where the 
    data or text is very huge, but your accuracy might falldown
    because of its most aggressive nature.

"""

######## RegexpStemmer ###########

rstemmer = RegexpStemmer('ing')
## removes every 'ing' substring from the given word

rstemmer.stem('cooking')
rstemmer.stem('dancing')

rstemmer.stem('king')

## note that 'king' comes back as just 'k',
## so be careful with overly broad patterns like this.

""" 
    That's the End of Stemming concept.
    If you have any questions or suggestions regarding the concept,
    feel free to contact me via [email protected]
    