Example #1
 def tokenise(self, stem: bool = False) -> List[str]:
     words = word_tokenize(self.content)
     if stem:
         stemmer = RegexpStemmer('ing$|s$|ed$|y$|er$|[^aeiou]{1}y$|e$',
                                 min=3)
         words = [stemmer.stem(word) for word in words]
     return words
def my_stem(word):
    st = RegexpStemmer('ness$|ity$|ment', min=4)
    if word.endswith('acy'):
        stem = word[:-2]
        stem += 'te'
    elif word.endswith('cy'):
        stem = word[:-2]
        stem += 't'

    elif word.endswith('ility'):
        stem = word[:-5]
        stem += 'le'
        if stem not in model.vocab:
            stem = word[:-3]

    # elif word.endswith('ality'):
    #     stem = word[:-5]
    #     if stem not in model.vocab:
    #         stem = word[:-3]

    elif word.endswith('ce'):
        stem = word[:-2]
        stem += 't'

    else:
        stem = st.stem(word)
        if stem.endswith('i'):
            stem = stem[:-1] + 'y'
    return stem
Example #3
def cleanText( raw_text ):
    text = raw_text

    # remove characters that are not letters or whitespace
    text = re.sub( r'[^a-z\s]+', '', text, flags=re.IGNORECASE )

    # collapse runs of whitespace into a single space
    text = re.sub( r'\s+', ' ', text )

    # converting string to lower case
    text = text.lower()

    # regex to remove punctuation
    tokenizer =  RegexpTokenizer( r'\w+' )

    # initial tokenization
    tokenized_text = tokenizer.tokenize( text )
    # stemmer to remove plurals
    stemmer = RegexpStemmer( 's$|ies$' )

    # remove stop words
    stop_words = set(['whom', 'that', 'those', "needn't", 'where', 'has', 'same', 'had', 'we', 'my', 'hers', 'does', 'they', 'the', 'only', "doesn't", 'be', 'mightn', 'her', 'wasn', 'being', 'am', 'but', 'themselves', 'during', "don't", 'into', 'its', 'isn', 'of', 'won', 'few', 'as', 'own', 'more', "shouldn't", 'myself', "mightn't", 'after', 'below', "didn't", "you've", 'wouldn', 'any', 'his', 'in', 'hasn', "weren't", 'him', 'she', 'will', "won't", 'it', 'y', 'he', 'now', 'such', 'haven', 'most', 'who', 'an', 'shan', 'at', "she's", 'were', 'weren', 'do', 'did', 've', 'all', 'between', 'above', "you're", 'no', "you'll", 'which', 'i',
'been', 'doesn', "hasn't", 'each', 'some', 'don', "aren't", 'should', 'mustn', 'our', "wouldn't", 'their', 'your', 'yours', 'doing', 'why', "hadn't", 'down', 'so', 'for', 'while', 'this', "shan't", 'there', 'needn', 'up', 'shouldn', 'by', "mustn't", 'have', 'yourself', "you'd", 'd', "haven't", 'about', 'ain', 'or', 'ourselves', 'when', "couldn't", 'is', 'with', "that'll", 'these', 'further', "should've", 'if', 'than', 'just', "wasn't", 'other', "isn't", 'you',
'then', 'how', 'too', 'until', 'very', 'are', 'to', 'itself', 'aren', 't', 'a', 'before', 'm', 'can', 'out', 'and', 'under', 'here', 'o', 'on', 'theirs', 'ma', 'couldn', 'having', 'himself', 'against', 'again', 'll', 'nor', 'hadn', 'ours', 'through', 'both', 'because', 'what', 's', 'them', 'not', 'off', 'me', "it's", 'once', 'over', 'didn', 'was', 're', 'from', 'yourselves', 'herself'])
    clean_text = []
    for word in tokenized_text:
        if word not in stop_words:
            # make plurals singular
            token = stemmer.stem( word )
            clean_text.append( token )

    return clean_text
def f(s):
    if s is not None:
        line = s.lower().replace('"', ']').replace('\'', ' ')          # converting words to lowercase
        tokenized_words = word_tokenize(line)                          # tokenizing

        regexFile = "regex.txt"
        Snowballstemmer = SnowballStemmer("english")

        # Build the regular-expression stemmer from regex.txt.
        # Each line holds one pattern; newlines are stripped so they do not end up
        # inside the regex, and the patterns are joined with '|' (the original loop
        # rebuilt the stemmer on every line, so only the last pattern took effect).
        with open(regexFile, 'r') as regFile:
            patterns = [l.strip() for l in regFile if l.strip()]
        RegexStemmer = RegexpStemmer('|'.join(patterns), min=2)

        data = filter(lambda x: x not in stopwords, tokenized_words)   # data = [tokenized_words - stop words]
        lmtzr = WordNetLemmatizer()
        list_of_words = []
        for item in data:
            if len(item) > 2:                                          # words with length <= 2 are removed
                #rlemma = lmtzr.lemmatize(item)                        # lemmatizing
                # stemming
                x = RegexStemmer.stem(item)
                #x = Snowballstemmer.stem(regx)

                if len(x) > 2:
                    list_of_words.append(x)                            # adding item to list_of_words

        t = ' '.join(str(item) for item in list_of_words)
        return t
Example #5
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    # Portst = PorterStemmer()
    # Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Regst.stem(word)
    return new
Example #6
def stem_words(text):
    words = word_tokenize(text)
    #Regex for Suffixes
    st = RegexpStemmer('ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$', min=4)
    stemmed = []
    for word in words:
        stemmed.append(st.stem(word))
    return ' '.join(stemmed)
Example #7
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
        if new == word:
            new = Regst.stem(word)
    return new
def remove_english(text, cooking_list):
    stemmer = RegexpStemmer("ed$|'s$")
    stemmer1 = RegexpStemmer("d$")
    text = treebank_tokenizer.tokenize(text)
    lemmatized_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    lemmatized_text = [w for w in lemmatized_text if w not in cooking_list]
    
    lemmatized_stemmed_text = []

    for w in lemmatized_text:
        w = stemmer.stem(w)
        w = stemmer1.stem(w)
        lemmatized_stemmed_text.append(w)
        
    english_vocab = set(words.words())  # build the English word set once; words.words() is a large list
    tokenized_Italian_text = [w for w in lemmatized_stemmed_text if w not in english_vocab]
    Italian_text = ' '.join(tokenized_Italian_text)

    Italian_text = re.sub(r'[^a-zA-ZÀ-ÿ.\s]', '', Italian_text) #removing all the numbers and special characters
    
    return Italian_text
Example #9
def analyze(text, stop, stem, wstem):
    # Set utilities
    if stop:
        stopeng = set(stopwords.words('english'))
    if wstem:
        stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    if stem:
        stemmer = PorterStemmer()
    tok = RegexpTokenizer(r'\w+')
    # Remove weird characters
    text = stripSpecial(text)
    # Tokenize and lowercase
    text = tok.tokenize(text.lower())
    # Remove stopwords if flagged
    if stop:
        text = [w for w in text if w not in stopeng]
    # Stem if flagged
    if (stem or wstem):
        text = [stemmer.stem(w) for w in text]
    return ' '.join(text)
Example #10
class word_lemmatiser:
    def __init__(self, language):
        self.language = language
        if self.language == "eng":
            self.model = WordNetLemmatizer()
        elif self.language == "nso":
            self.model = RegexpStemmer('ng$', min=4)
        else:
            self.model = None

    def lemma(self, x):
        if self.language == "eng":
            return self.model.lemmatize(x[0])
        elif self.language == "nso":
            return self.model.stem(x[0].lower())
        elif self.language == "zul":
            return x[2]
        else:
            return x[0]

    def identity(self, word):
        return word
Example #11
    def word_refiner(*args):
        Portst = PorterStemmer()
        Landst = LancasterStemmer()
        Regst = RegexpStemmer('ing|ed|ly|lly')
        args = [i for i in args if isinstance(i, unicode)]

        for w in map(str, args):
            if w in dic1:
                yield w
            else:
                st1 = Portst.stem(w)
                if st1 in dic1:
                    yield st1
                else:
                    st2 = Landst.stem(w)
                    if st2 in dic1:
                        yield st2
                    else:
                        st3 = Regst.stem(w)
                        if st3 in dic1:
                            yield st3
                        else:
                            yield w
Example #12
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import RegexpStemmer

postammer = PorterStemmer()
print(postammer.stem('dancing'))

from nltk.stem import WordNetLemmatizer

lzr = WordNetLemmatizer()

print(lzr.lemmatize('dancing'))

#but if we want it converted according to part of speech, we pass pos

print(lzr.lemmatize('dancing', pos='v'))
lstemmer = LancasterStemmer()
print(lstemmer.stem('cooking'))
#the RegexpStemmer just cuts off the part we give in the regular expression
Rexpress = RegexpStemmer('er')
print(Rexpress.stem('cooker'))
Example #13
import nltk
from nltk.stem import RegexpStemmer
st1 = RegexpStemmer('ing')
print("Learning - ", st1.stem('Learning'))
print("Singing - ", st1.stem('Singing'))

print()

st2 = RegexpStemmer('na')
print("Banana - ", st2.stem('Banana'))
Example #14
class MiningService(Base):
    def __init__(self, db):
        super().__init__(db)
        self.nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')

        # Remove affixes from a word: it's -> it, we'll -> we
        stemmer_pattern = r"’s$|n’t$|’ll$|’re$|’ve$|’d$|’m$|'s$"
        stemmer_pattern += r"|n't$|'ll$|'re$|'ve$|'d$|'m$"  # no trailing '|' (it would add an empty alternative)
        self.stemmer = RegexpStemmer(stemmer_pattern)

        # Part-of-speech tagger
        self.tagger = nltk.tag.pos_tag
        self.wordnetlemmatize = WordNetLemmatizer()

        self._stop_words = None
        self._junk_symbols = None
        self._proper_nouns = None

    def start(self):
        nltk.download('averaged_perceptron_tagger',
                      download_dir=self.nltk_data_path)
        nltk.download('wordnet', download_dir=self.nltk_data_path)
        nltk.data.path.append(self.nltk_data_path)
        super().start()

    async def stop_words(self):
        if self._stop_words is None:
            docs = await self.db.stop_words.find({}, {'_id': 0}).to_list(None)
            self._stop_words = [doc['word'] for doc in docs]
        return self._stop_words

    async def junk_symbols(self):
        if self._junk_symbols is None:
            docs = await self.db.junk_symbols.find({}, {
                '_id': 0
            }).to_list(None)
            self._junk_symbols = [doc['symbol'] for doc in docs]
        return self._junk_symbols

    async def proper_nouns(self):
        if self._proper_nouns is None:
            docs = await self.db.proper_nouns.find({}, {
                '_id': 0
            }).to_list(None)
            self._proper_nouns = [doc['word'] for doc in docs]
        return self._proper_nouns

    async def parse_sentence(self, uuid, tuid, suid, sentence):
        words = []
        for w in (w.rstrip('’') for w in word_tokenize(sentence.lower())):
            if w.strip():
                words.append(w)

        data = {'suid': suid, 'uuid': uuid, 'tuid': tuid, 'words': words}
        data['lemmas'] = await self.mine_lemma_data(data['words'])
        return {'status_code': 200, 'data': data}

    async def mine_lemma_data(self, words):
        data = []
        for word, treebank_tag in self.tagger(words):
            lemword, part_of_speech = self.normalize_word(
                word.lower(), treebank_tag)

            if part_of_speech == 'proper_noun':
                continue

            if ((await self._is_not_stop_word(lemword))
                    and (await self._is_not_junk_symbol(lemword))
                    and (await self._is_not_proper_noun(lemword))
                    and self._is_legible_word(lemword)):

                data.append({
                    'lemword': lemword,
                    'part_of_speech': part_of_speech
                })
        return data

    async def _is_not_stop_word(self, word):
        return word not in (await self.stop_words())

    async def _is_not_junk_symbol(self, word):
        return word not in (await self.junk_symbols())

    async def _is_not_proper_noun(self, word):
        return word not in (await self.proper_nouns())

    @staticmethod
    def _is_legible_word(word):
        return re.search(r'^[a-zA-Z].*', word) is not None

    def normalize_word(self, word, treebank_tag):
        """Lemmatizing and stemming words.
           stemming: to remove affixes from a word, e.g we'll -> we
           lemmatizing: bring word to a root, e.g ran -> run, looking -> look
        """
        wordnet_pos, part_of_speech = self.get_wordnet_pos(treebank_tag)

        if wordnet_pos == wordnet.NOUN and part_of_speech == 'proper':
            return word, 'proper_noun'

        lemword = self.wordnetlemmatize.lemmatize(word, wordnet_pos)
        return self.stemmer.stem(lemword), part_of_speech

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Treebank part-of-speech tagging correspondence to wordnet"""

        if treebank_tag == 'NNP':
            return wordnet.NOUN, 'proper'

        # JJ-adjective
        # JJR-adjective, comparative
        # JJS-adjective, superlative
        elif treebank_tag.startswith('J'):
            return wordnet.ADJ, 'adj'

        # VB-verb, base form
        # VBD-verb, past tense
        # VBG-verb, gerund or present participle; VBN-verb, past participle
        # VBP-verb, non-3rd person singular present
        # VBZ-verb, 3rd person singular present
        elif treebank_tag.startswith('V'):
            return wordnet.VERB, 'verb'

        # RB-adverb
        # RBR-adverb, comparative
        # RBS-adverb, superlative
        # RP-particle
        elif treebank_tag.startswith('R'):
            return wordnet.ADV, 'adv'

        # NN-noun
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN, 'noun'

        # default
        else:
            return wordnet.NOUN, ''
more_stop_en = set(get_stop_words("english"))
more_stop_es = set(get_stop_words("spanish"))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))
adj = Adjectives()
tdm = textmining.TermDocumentMatrix()

room = 0
# for line in var:
for line in sys.stdin:
    room += 1
    line = line.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
    line = re.sub("[0-9,#,!,¡,&,.]", "", line)
    line = re.sub("[^a-zA-Z]", " ", line)
    words = line.lower().split()
    st = RegexpStemmer("ing$|s$|able$|thing$|ful$", min=4)
    words = [st.stem(w) for w in words]
    words = [w for w in words if not w in stop_en and not w in stop_es]
    words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
    words = [w for w in words if len(w) > 2]
    tdm.add_doc(" ".join(words))

    good_count = [words.count(ad) for ad in adj.good if ad in words]
    good_count = len(good_count)

    bad_count = [words.count(ad) for ad in adj.bad if ad in words]
    bad_count = len(bad_count)

    print "Comentario " + str(room) + ": Sentiment Score: " + str(good_count - bad_count) + "\n"


for row in tdm.rows(cutoff=1):
Example #16
def OutputRelations(abstractFileName, seta, negSet, neutralSet, negationSet,
                    posSet, fullNames, threshold):
    #added threshold in input format
    #recent change: no longer using filename for abstract. instead, input the string of the abstract

    import nltk
    import copy
    import re
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem import RegexpStemmer

    sentencedb = dict()
    fullnamestore = dict()
    a = readf(fullNames)
    for i in a:
        i = i.split(";")
        if len(i) > 1:
            #storing the full names, using the short symbols as dict keys
            fullnamestore[i[0]] = i[1]
        else:
            fullnamestore[i[0]] = "none"
    #sentencedb indexes the sentences by a unique identifier (int)

    def isGene(x, t, sentence):

        #checks if gene 'x' in a list of tokens 't' is really a gene or a variable with the same name
        if len(t) > 1 and len(x) > 2:

            if t.index(x) == 0:
                if t[t.index(x) + 1] in [">", "<", "=", "score"]:
                    return False
            elif t.index(x) == len(t) - 1:
                if t[t.index(x) - 1] in [">", "<", "=", "score"]:
                    return False
            elif (t[t.index(x) + 1] in [
                    ">", "<", "=", "score"
            ]) or (t[t.index(x) - 1] in [">", "<", "=", "score"]):
                return False
            elif (t[t.index(x) + 1], t[t.index(x) - 1]) == (")", "("):
                if x in fullnamestore:
                    if fullnamestore[x] != "none":
                        fullLength = len(fullnamestore[x])
                        #full length is length of full name
                        if t.index(x) > len(fullnamestore[x]) + 2:
                            if sentence[(t.index(x) - 1 -
                                         fullLength):(t.index(x) -
                                                      1)] == fullnamestore[x]:
                                return True
                            else:
                                return False

            else:
                return True
            return True
        else:
            return False

    def countgenes(s, geneset):
        #counts the number of unique genes in a sentence  "s"
        ss = nltk.word_tokenize(s)
        numgenes = 0
        existingGenes = []
        for i in ss:
            if i in geneset and isGene(i, ss, s) and i not in existingGenes:
                numgenes += 1
                existingGenes.append(i)

        return numgenes

    def countWords(gene1, gene2, token):

        #counts the words between gene 1 and gene2
        count = 0
        for i in xrange(token.index(gene1) + 1, token.index(gene2) - 1):
            count += 1
        return count

    #abstracts = open(abstractFileName,"r")

    storage = dict()

    b = []
    #a=a.replace("\n"," ")
    #for i in a.split("\n\n"):
    #   i=i.replace("\n"," ")
    #  b.append(i)
    #print b[4]
    #print b[-1].split()[3]
    for x in abstractFileName.split("\n\n"):
        x = x.replace("\n", " ")
        b.append(x)
        #print x
        #x =x.split("\t")
        #print x
    parsedB = []
    for line in b:
        if len(line) > 0:
            parsedB.append(line)
    b = parsedB
    # print b
    sentencelist = re.split("\. (?=[A-Z])", b[-2])
    sentencelistcopy = copy.deepcopy(sentencelist)
    l = len(sentencelist)
    for i in xrange(l):

        if countgenes(sentencelistcopy[i], seta) < 2:
            sentencelist.remove(sentencelistcopy[i])
        # print b[-1]
        storage[b[-1].split()[1]] = sentencelist

    #abstracts.close()
    #print sentencelistcopy,sentencelist,storage

    num_genes = 0
    bw = 0
    gene_names = seta

    st = RegexpStemmer('ing$|s$|e$|ed$|es$', min=4)

    def findsuf(string, x):
        a = ""
        for i in xrange(x):
            a += string[len(string) - 1 - (x - i - 1)]

        return a

    finalOutput = []

    for id in storage:
        countsentences = 0
        for sentence in storage[id]:

            rlist = [0, 0, 0]
            #sentence = storage[id]

            tokens = nltk.word_tokenize(sentence)
            tokenscopy = copy.deepcopy(tokens)
            tagged = nltk.pos_tag(tokens)

            for x in tagged:

                if x[1] in ['VBP', 'VBN', 'VBZ', 'VBG', 'VB']:
                    tokenscopy[tagged.index(x)] = st.stem(x[0])
            store = 0
            genes = []
            #print tokens,tokenscopy
            relation = 2
            currentlist = []
            direction = 0
            for x in tokens:

                if x in gene_names and x not in currentlist and isGene(
                        x, tokens, sentence):
                    genes.append(x)
                    num_genes += 1
                    currentlist.append(x)
                    #store = tokens.index(x)

            in1 = tokens.index(genes[0])
            in2 = tokens.index(genes[1])
            indexx = 0
            neg = 1
            if countWords(genes[0], genes[1], tokenscopy) <= threshold:

                for i in xrange(in1 + 1, in2):

                    if tokenscopy[i] in posSet:
                        relation = 1

                    elif tokenscopy[i] in negSet:
                        relation = -1
                    #elif tokenscopy[i] in neutralSet:
                    #relation = 0

                    if (tokenscopy[i] in negSet or tokenscopy[i] in posSet):
                        for y in xrange(in1 + 1,
                                        tokenscopy.index(tokenscopy[i])):
                            if tokenscopy[y] == "not":
                                relation = 0
                                #2 means neutral
                        if findsuf(tokens[i], 2) == "ed":
                            direction = 1

                        else:
                            direction = 0

                if direction == 0:
                    rlist = [genes[0], genes[1], relation]
                    #print genes[0],relation,genes[1]
                elif direction == 1:
                    rlist = [genes[1], genes[0], relation]
                    #print genes[1], relation, genes[0]
                # if relation!="none":
                if True:
                    #the above condition is so that it does not output sentences for which no relation
                    #has been found. This makes analysis easier. Must change this during final program.
                    sentencedb[countsentences] = sentence
                    #use this to have the sentences represented by a number
                    #change id to pmid
                    finalOutput.append(
                        [id, sentence, rlist[0], rlist[1], rlist[2]])
                    #use this to have the actual sentences in the output
                    #finalOutput.append([id,countsentences,rlist])

                    countsentences += 1

    return finalOutput
Example #17
#!/usr/bin/env python
# coding: utf-8

# # Task-6

# ## A. TYPES OF STEMMERS

# ### I. REGEX STEMMER 

# In[1]:


import nltk
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing')
stemmerregexp.stem('running')


# ### II. SNOWBALL STEMMER

# In[7]:


import nltk 
from nltk.stem import SnowballStemmer
SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')


# ### III. LANCASTER STEMMER
Example #18
#The functions of the LancasterStemmer class are just like the functions of the
#PorterStemmer class, but can produce slightly different results. It is known to
#be slightly more aggressive than the PorterStemmer functions:

print()
print("method 2")
from nltk.stem import LancasterStemmer
s = LancasterStemmer()
print(s.stem('cooking'))
print(s.stem('cookery'))

#The RegexpStemmer class

#You can also construct your own stemmer using the RegexpStemmer class. It takes a
#single regular expression and removes any prefix or suffix that matches the expression:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ing')
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('ingleside'))

#SNOWBALLSTEMMER. The SnowballStemmer class supports 13 non-English languages. It also provides two
#English stemmers: the original Porter algorithm as well as the new English stemming algorithm.

from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
print(len(SnowballStemmer.languages))
st = SnowballStemmer('english')
print(st.stem('cooking'))
print(st.stem('studied'))
Example #19
# In[7]:


import nltk
from nltk.stem import LancasterStemmer
stemmerlanc=LancasterStemmer()
stemmerlanc.stem('darling')
#doesn't work here as well


# In[8]:


from nltk.stem import RegexpStemmer
regexpStemmer=RegexpStemmer('ing')
regexpStemmer.stem('dancing')
#doesn't support 


# In[10]:


import nltk 
from nltk.stem import SnowballStemmer 
SnowballStemmer.languages
frenchstemmer=SnowballStemmer('french')
frenchstemmer.stem('manges')


# In[11]:
Example #20
print(stemmerporter.stem('cooking') == 'cook')
print(stemmerporter.stem('cookery') == 'cookeri')
print(stemmerporter.stem('working'))
print(stemmerporter.stem('happiness'))

# 3_2
stemmerlan = LancasterStemmer()
print(stemmerlan.stem('cooking') == 'cook')
print(stemmerlan.stem('cookery') == 'cookery')
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
print(stemmerlan.stem('achievement'))

# 3_3
stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('cooking') == 'cook')
print(stemmerregexp.stem('cookery') == 'cookery')
print(stemmerregexp.stem('ingleside') == 'leside')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))

# 3_4
print(SnowballStemmer.languages == ('danish', 'dutch', 'english', 'finnish',
                                    'french', 'german', 'hungarian', 'italian',
                                    'norwegian', 'porter', 'portuguese',
                                    'romanian', 'russian', 'spanish',
                                    'swedish'))
stemmerspanish = SnowballStemmer('spanish')
print(stemmerspanish.stem('hola') == 'hol')
print(stemmerspanish.stem('comiendo'))
Example #21
 def regexStemmer(self, term):
     v_sufixos = ['ando', 'endo', 's', 'é']
     expr = 's$|es$'
     stemmer = RegexpStemmer(expr)
     return stemmer.stem(term)
Example #22
# Porter stemming algorithm
stemerporter = PorterStemmer()
print(stemerporter.stem('working'))
print(stemerporter.stem('happiness'))

# Lancaster stemming algorithm, more aggressive than Porter
from nltk.stem import LancasterStemmer
stemmerlan=LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))

# Build your own stemmer
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing') # strip 'ing'
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('inghappiness'))

# Stemming for 14 languages besides English
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
# arabic, danish, dutch, finnish, french, german, hungarian, italian, norwegian,
# porter, portuguese, romanian, russian, spanish, swedish
spanishstemmer=SnowballStemmer('spanish')
print(spanishstemmer.stem('comiendo'))
frenchstemmer=SnowballStemmer('french')
print(frenchstemmer.stem('manger'))

def oo():
Example #23
    #     feature_set = FeatureSet( i, genres[0]['name'] )

    # feature_set = FeatureSet( i, genres['name'] )

    # print( genres[0]['name']  )

    if feature_set._genre not in unique_genres.keys():
        unique_genres.update({feature_set._genre: 1})
    else:
        unique_genres[
            feature_set._genre] = unique_genres[feature_set._genre] + 1

    for word in tokenized_string:
        if word not in stop_words:
            # make plurals singular
            token = stemmer.stem(word)
            # tokens.append( token )
            feature_set.incrementFrequency(token)
            if token not in vocabulary.keys():
                # print( token )
                document_list = []
                vocabulary.update({token: [i]})
            else:
                if i not in vocabulary[token]:
                    vocabulary[token].append(i)

    # print( feature_set._genre )

    feature_sets.append(feature_set)
# print(  vocabulary )
# calculate genre probability
Example #24
more_stop_en = set(get_stop_words('english'))
more_stop_es = set(get_stop_words('spanish'))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))
adj = Adjectives()

room = 0
#for line in var:
for line in sys.stdin:
	room+=1
	line = line.replace('á','a').replace('é','e').replace('í','i').replace('ó','o').replace('ú','u');
	line = re.sub('[0-9,#,!,¡,&,.]','',line)
	line = re.sub('[^a-zA-Z]'," ",line)
	words = line.lower().split()
	st = RegexpStemmer('ing$|s$|able$|thing$|ful$', min=4)
	words = [st.stem(w) for w in words]
	words = [w for w in words if not w in stop_en and not w in stop_es]
	words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
	words = [w for w in words if len(w) > 2]


	for i in words:
		if i not in dictionario:
			dictionario[i]=1
		else:
			a = dictionario[i]
			dictionario.update({i:a+1})


	good_count = [ words.count(ad) for ad in adj.good if ad in words]
	good_count = len(good_count)
Example #25
# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

print ls.stem('lying')

print ls.stem('strange')


# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)

print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')

print rs.stem('lying')

print rs.stem('strange')


# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")

print 'Supported Languages:', SnowballStemmer.languages

# autobahnen -> cars
# autobahn -> car
ss.stem('autobahnen')
Example #26
File: nlp2.py  Project: mittaln1612/nlp
sl.stem('connecting')

# In[29]:

sl.stem('connection')

# In[30]:

sl.stem('connections')

# In[31]:

from nltk.stem import RegexpStemmer
rx = RegexpStemmer('ing')
rx.stem('working')

# In[32]:

rx.stem('working')

# In[33]:

rx.stem('farming')

# In[34]:

rx.stem('happiness')

# In[35]:
Example #27
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import csv
from nltk.stem import RegexpStemmer


if __name__ == '__main__':
    patterns = 'i$|t$'
    regexp_stemmer = RegexpStemmer(patterns, 3)

    result_list = list()
    for word in ['Péter', 'szereti', 'Enikőt', 'és', 'Marit']:
        stem = regexp_stemmer.stem(word)
        result_list.append([word, stem])

    with open('output/regexp.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'stem'])
        for i in result_list:
            writer.writerow(i)

    print('See the result in output/regexp.csv')




Example #28
print("lowercase and split : ")
df['comments'].head(10)

# Remove stopwords
stop = stopwords.words('english')
df['comments'] = df['comments'].apply(
    lambda x: [item for item in x if item not in stop])
print("Remove stopwords : ")
df['comments'].head(10)

# Stemming
from nltk.stem import RegexpStemmer
st = RegexpStemmer('ing$|s$|e$|able$', min=4)
# reassigning the loop variable would not modify the dataframe,
# so apply the stemmer the same way as the other preprocessing steps
df['comments'] = df['comments'].apply(lambda x: [st.stem(y) for y in x])
print("Stemming : ")
df['comments'].head(10)

# Remove Strings which length > 3
df['comments'] = df['comments'].apply(
    lambda x: [item for item in x if len(item) > 3])
print("Remove Strings which length > 3    : ")
df['comments'].head(10)

pre_end = time.time()
print("It cost %f sec" % (pre_end - pre_start))
""" 
Group Comments by the column of 'listing_id' 
"""
df2 = df[['listing_id', 'comments']].copy()
Example #29
stem_word_list = [ls.stem(w) for w in words_list]
print(stem_word_list.count('jump'))
print(stem_word_list)
print(ls.stem("lying"))
print(ls.stem("strange"))
"""
There are several other stemmers, including RegexpStemmer , where you can build
your own stemmer based on user-defined rules , and SnowballStemmer , which supports
stemming in 13 different languages besides English.
"""

#Regex Based stemmer
from nltk.stem import RegexpStemmer

rs = RegexpStemmer("ing$|s$|ed$", min=4)

for w in words_list:
    print(rs.stem(w))

print(rs.stem("lying"))
print(rs.stem("strange"))

#Snow Ball stemmer
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("german")

print("supported languages are :", SnowballStemmer.languages)

german_cars = "autobahnen"
print(ss.stem(german_cars))
Example #30
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)

# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer

wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)

# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer

regex_stemmer = RegexpStemmer('ing$')

' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])

# Porter Stemmer
sentence7 = "Before eating, it would be nice to sanitize your hands with a sanitizer"
from nltk.stem.porter import PorterStemmer

ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sentence7.split()])

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# ## Stemming Words
# Stemming is the process of removing affixes from a word to obtain its root, or stem.
# For example, the stem of "growing" is "grow".
# NLTK includes 4 stemming algorithms, 3 of which are demonstrated below. The fourth,
# Snowball, is for non-English languages and is not covered here but is in the text.

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# The output of various words can be different between stemmers:

# <codecell>

g = 'cookery'
print 'Porter yields: ',porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# ## Lemmatizing
Example #33
import nltk
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))

Example #34
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer
from nltk.stem.snowball import EnglishStemmer

stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))

stemmer2 = LancasterStemmer()
print(stemmer2.stem('cooking'))
print(stemmer2.stem('cookery'))

stemmer3 = SnowballStemmer('english')
print(stemmer3.stem('cooking'))
print(stemmer3.stem('cookery'))

# EnglishStemmer is the same as SnowballStemmer('english'), an improved Porter algorithm.
stemmer_en = EnglishStemmer()
print(stemmer_en.stem('cooking'))
print(stemmer_en.stem('cookery'))

# regex
stemmer_reg = RegexpStemmer('ing')
print(stemmer_reg.stem('cooking'))
print(stemmer_reg.stem('thing'))
Example #35
def get_prescription(text):

    pstem = PorterStemmer()
    with open("symptoms.txt") as f:
        symptoms = f.readlines()
    finalsyns=[]
    for word in symptoms:
        syns = wordnet.synsets(word.strip())
        syns = [s.lemma_names() for s in syns ]
        merged = list(itertools.chain(*syns))
        if len(merged) == 0:
            finalsyns = finalsyns+[pstem.stem(word.strip())]
            ##print(finalsyns)
        else :
            finalsyns = finalsyns+merged
    finalsyns = [f.replace('\n','') for f in finalsyns]
    finalsyns = list(dict.fromkeys(finalsyns))
    #print(finalsyns)
    pstem = PorterStemmer()
    rstem = RegexpStemmer(r'\(s\)')
    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string)


    with open("Amount.txt") as f:
        Amount = f.readlines()

    Amount = [rstem.stem(x.strip()).split(' - ') for x in Amount]
    Amount = list(chain(*Amount))
    prescription_dataset = tuple(open("dataset2.txt", 'r'))

    with open("Frequency.txt") as f:
        frequency = f.readlines()
    frequency = [rstem.stem(x.strip()).split(' - ') for x in frequency]
    frequency = list(chain(*frequency))

    schedule={}
    with open("schedule.txt") as f:
        for line in f:
             #print(line.split(':'))
             s = line.split(':')
             schedule[s[0].strip().lower()] = s[1].strip()

    data = {}
    prescription = text.lower()
    prescription_tokenized = [word.replace(".","").replace("(","").replace(")","") for word in prescription.split()]
    prescription_tokenized_final = [pstem.stem(word) for word in prescription_tokenized ]
    print(prescription)
    data.update({'prescription': prescription})
    amount=""
    for word in Amount:
        if pstem.stem(word) in prescription_tokenized_final or rstem.stem(word) in prescription_tokenized:
            index = prescription_tokenized_final.index(pstem.stem(word))
            amount = prescription_tokenized[index-1]+" "+prescription_tokenized[index]

    if amount == "":
        print("Amount not mentioned!")
    else:
        print("Amount : " + amount)
    freq= ""
    timing = ""
    if "every" in prescription or "each" in prescription:
        if "every" in prescription:
            ei = prescription_tokenized.index("every")
            re="every"
        elif "each" in prescription:
            ei = prescription_tokenized.index("each")
            re="each"
        st = ["minutes","minute","hours","hour","meal","day","days","morning","evening","afternoon"]
        for i in range(0,10):
            s=st[i]
            if s in prescription_tokenized:
                ti=prescription_tokenized.index(s)
                if ti-ei==2:
                    freq=re+" "+prescription_tokenized[ei+1]+" "+s
                elif ti-ei==1 and i>=7:
                    freq= re+" "+ s
    if schedule.get(freq.strip()) is not None:
        timing = schedule.get(freq.strip())

    for word in frequency:
        if word in prescription and word.strip() != "":
            freq=freq + " "+ word
            if schedule.get(word.strip()) is not None:
                timing = schedule.get(word.strip())
    if freq == "":
        print("No Frequency mentioned!")

    symptoms =""
    for s in finalsyns:
        if s in prescription:
            symptoms+= " "+s

    _check = ["", None]        
    if freq in _check:
        return {'error':"No prescription found!"}         
    data.update({"Amount":amount,"Symptoms" :symptoms,"Frequency":freq,"Timings":timing})

    return data
Example #36
print(ls.stem("jumping"))
print(ls.stem("jumps"))
print(ls.stem("jumper"))
print(ls.stem("strange"))
print(ls.stem("stranger"))
print(ls.stem("lying"))

# REGEXP STEMMER
# Uses regular expressions to identify the morphological affixes in words and any part of the
# string matching the same is removed

# Note that this stemmer is case sensitive (won't work on capitalized affixes)
rs = RegexpStemmer(r"ing$|s$|ed$", min=4)

print(rs.stem("jumping"))
print(rs.stem("colored"))
print(rs.stem("lying"))
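
# A quick check of the case-sensitivity note above (a small sketch reusing the rs
# stemmer defined in this example): uppercase affixes are left alone because the
# pattern only contains lowercase letters.
print(rs.stem("JUMPING"))   # unchanged -- "ING" is uppercase, so the pattern does not match
print(rs.stem("Jumping"))   # "Jump" -- the trailing "ing" is lowercase and is removed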

# SNOWBALL STEMMER
# Stems words in a dozen of languages. http://snowballstem.org

ss = SnowballStemmer(language="german")

print("Supported languages: {}".format(SnowballStemmer.languages))

print(ss.stem("autobahnen"))
print(ss.stem("endlich"))
print(ss.stem("unglaublich"))
print(ss.stem("untergehen"))
print(ss.stem("hauschen"))
Example #37
"""
$$  LancasterStemmer - Most Aggressive.

    LancasterStemmer is mostly used for the cases where the 
    data or text is very huge, but your accuracy might falldown
    because of its most aggressive nature.

"""

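# A small comparison sketch for the note above: the Lancaster stemmer usually
# truncates more aggressively than the Porter stemmer, so running both on the
# same words makes the trade-off visible.
from nltk.stem import PorterStemmer, LancasterStemmer
for w in ('happiness', 'maximum', 'crying'):
    print(w, PorterStemmer().stem(w), LancasterStemmer().stem(w))
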
######## RegexpStemmer ###########

rstemmer = RegexpStemmer('ing')
## removes the part of the word that matches the given pattern ('ing')

rstemmer.stem('cooking')
rstemmer.stem('dancing')

rstemmer.stem('king')

## note that 'king' is reduced to just 'k', because the pattern matches anywhere in the word,
## so be careful with short words; the min parameter (see the sketch after this snippet) guards against this.

""" 
    That's the End of Stemming concept.
    If you have any questions or suggestions regarding the concept,
    feel free to contact me via [email protected]
    
"""

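# A minimal sketch of guarding short words with RegexpStemmer's min parameter
# (the pattern is only applied to words that are at least `min` characters long):
from nltk.stem import RegexpStemmer

guarded = RegexpStemmer('ing$', min=5)
print(guarded.stem('king'))     # 'king' -- shorter than min, left untouched
print(guarded.stem('cooking'))  # 'cook'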