def tokenise(self, stem: bool = False) -> List[str]:
    words = word_tokenize(self.content)
    if stem:
        stemmer = RegexpStemmer('ing$|s$|ed$|y$|er$|[^aeiou]{1}y$|e$', min=3)
        words = [stemmer.stem(word) for word in words]
    return words
def my_stem(word):
    st = RegexpStemmer('ness$|ity$|ment', min=4)
    if word.endswith('acy'):
        stem = word[:-2]
        stem += 'te'
    elif word.endswith('cy'):
        stem = word[:-2]
        stem += 't'
    elif word.endswith('ility'):
        stem = word[:-5]
        stem += 'le'
        if stem not in model.vocab:
            stem = word[:-3]
    # elif word.endswith('ality'):
    #     stem = word[:-5]
    #     if stem not in model.vocab:
    #         stem = word[:-3]
    elif word.endswith('ce'):
        stem = word[:-2]
        stem += 't'
    else:
        stem = st.stem(word)
    if stem.endswith('i'):
        stem = stem[:-1] + 'y'
    return stem
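# Usage sketch (added for illustration, not part of the original snippet): `model`
# above is assumed to be an external embedding model exposing a `vocab` mapping
# (e.g. gensim KeyedVectors); a stub vocabulary stands in for it here.
class _StubModel:
    vocab = {'flexible'}

model = _StubModel()
for w in ['accuracy', 'flexibility', 'happiness', 'government']:
    print(w, '->', my_stem(w))
# 'accuracy'    -> 'accurate'  (the 'acy' branch)
# 'flexibility' -> 'flexible'  (the 'ility' branch; kept because 'flexible' is in vocab)
# 'happiness'   -> 'happy'     (regex strips 'ness', trailing 'i' becomes 'y')
# 'government'  -> 'govern'    (regex strips 'ment')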
def cleanText(raw_text):
    text = raw_text
    # replace non-alpha characters
    text = re.sub('[^a-z\s]+', '', text, flags=re.IGNORECASE)
    # replace multiple spaces with a single one
    text = re.sub('(\s+)', ' ', text)
    # converting string to lower case
    text = text.lower()
    # regex to remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # initial tokenization
    tokenized_text = tokenizer.tokenize(text)
    # stemmer to remove plurals
    stemmer = RegexpStemmer('s$|ies$')
    # remove stop words
    stop_words = set(['whom', 'that', 'those', "needn't", 'where', 'has', 'same', 'had', 'we', 'my', 'hers', 'does', 'they', 'the', 'only', "doesn't", 'be', 'mightn', 'her', 'wasn', 'being', 'am', 'but', 'themselves', 'during', "don't", 'into', 'its', 'isn', 'of', 'won', 'few', 'as', 'own', 'more', "shouldn't", 'myself', "mightn't", 'after', 'below', "didn't", "you've", 'wouldn', 'any', 'his', 'in', 'hasn', "weren't", 'him', 'she', 'will', "won't", 'it', 'y', 'he', 'now', 'such', 'haven', 'most', 'who', 'an', 'shan', 'at', "she's", 'were', 'weren', 'do', 'did', 've', 'all', 'between', 'above', "you're", 'no', "you'll", 'which', 'i', 'been', 'doesn', "hasn't", 'each', 'some', 'don', "aren't", 'should', 'mustn', 'our', "wouldn't", 'their', 'your', 'yours', 'doing', 'why', "hadn't", 'down', 'so', 'for', 'while', 'this', "shan't", 'there', 'needn', 'up', 'shouldn', 'by', "mustn't", 'have', 'yourself', "you'd", 'd', "haven't", 'about', 'ain', 'or', 'ourselves', 'when', "couldn't", 'is', 'with', "that'll", 'these', 'further', "should've", 'if', 'than', 'just', "wasn't", 'other', "isn't", 'you', 'then', 'how', 'too', 'until', 'very', 'are', 'to', 'itself', 'aren', 't', 'a', 'before', 'm', 'can', 'out', 'and', 'under', 'here', 'o', 'on', 'theirs', 'ma', 'couldn', 'having', 'himself', 'against', 'again', 'll', 'nor', 'hadn', 'ours', 'through', 'both', 'because', 'what', 's', 'them', 'not', 'off', 'me', "it's", 'once', 'over', 'didn', 'was', 're', 'from', 'yourselves', 'herself'])
    clean_text = []
    for word in tokenized_text:
        if word not in stop_words:
            # make plurals singular
            token = stemmer.stem(word)
            clean_text.append(token)
    return clean_text
def f(s):
    if s is not None:
        line = s.lower().replace('"', ']').replace('\'', ' ')  # converting words to lowercase
        tokenized_words = word_tokenize(line)  # tokenizing
        regexFile = "regex.txt"
        Snowballstemmer = SnowballStemmer("english")
        RegexStemmer = []  # stemmer for regular expression
        with open(regexFile, 'r') as regFile:
            while True:
                line = regFile.readline()
                print(line)
                if not line:
                    break
                RegexStemmer = RegexpStemmer(line, min=2)
        data = filter(lambda x: x not in stopwords, tokenized_words)
        # data=[tokenized_words - nouse_words]
        lmtzr = WordNetLemmatizer()
        list_of_words = []
        for item in data:
            if len(item) > 2:  # words with length <= 2 are removed
                # rlemma=lmtzr.lemmatize(item)  # lemmatizing
                # stemming
                x = RegexStemmer.stem(item)
                # x=Snowballstemmer.stem(regx)
                if len(x) > 2:
                    list_of_words.append(x)  # adding item to list_of_words
        t = ' '.join(str(item) for item in list_of_words)
        return t
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    # Portst = PorterStemmer()
    # Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Regst.stem(word)
    return new
def stem_words(text):
    words = word_tokenize(text)
    # Regex for suffixes
    st = RegexpStemmer('ing$|s$|able$|ible$|ful$|less$|ive$|acy$|al$|ance$|ence$|dom$|er$|or$|ism$|ist$|ity$|ty$|ment$|ship$|sion$|tion$|ate$|en$|ify$|fy$|ize$|ise$', min=4)
    stemmed = []
    for word in words:
        stemmed.append(st.stem(word))
    return ' '.join(stemmed)
def stemming(word):
    # Use stemmers for removing morphological affixes from words.
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed')
    new = Portst.stem(word)
    if new == word:
        new = Landst.stem(word)
    if new == word:
        new = Regst.stem(word)
    return new
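# A small usage sketch (an addition, not from the original snippet): the function
# above tries PorterStemmer first, falls back to LancasterStemmer, and only uses the
# regex stemmer when both left the word unchanged.
for w in ['running', 'maximum', 'spelled']:
    print(w, '->', stemming(w))
# PorterStemmer already maps 'running' to 'run', so the later stemmers are never reached.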
def remove_english(text, cooking_list):
    stemmer = RegexpStemmer("ed$|'s$")
    stemmer1 = RegexpStemmer("d$")
    text = treebank_tokenizer.tokenize(text)
    lemmatized_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    lemmatized_text = [w for w in lemmatized_text if w not in cooking_list]
    lemmatized_stemmed_text = []
    for w in lemmatized_text:
        w = stemmer.stem(w)
        w = stemmer1.stem(w)
        lemmatized_stemmed_text.append(w)
    tokenized_Italian_text = [w for w in lemmatized_stemmed_text if w not in words.words()]
    Italian_text = ' '.join(tokenized_Italian_text)
    Italian_text = re.sub('[^a-zA-ZÀ-ÿ.\s]', '', Italian_text)  # removing all the numbers and special characters
    return Italian_text
def analyze(text, stop, stem, wstem):
    # Set utilities
    if stop:
        stopeng = set(stopwords.words('english'))
    if wstem:
        stemmer = RegexpStemmer('ing$|s$|e$', min=4)
    if stem:
        stemmer = PorterStemmer()
    tok = RegexpTokenizer(r'\w+')
    # Remove weird characters
    text = stripSpecial(text)
    # Tokenize and lowercase
    text = tok.tokenize(text.lower())
    # Remove stopwords if flagged
    if stop:
        text = [w for w in text if w not in stopeng]
    # Stem if flagged
    if (stem or wstem):
        text = [stemmer.stem(w) for w in text]
    return ' '.join(text)
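# Usage sketch (added for illustration): `stripSpecial` is defined elsewhere in the
# original project, so a pass-through stub is used here just to exercise the flags.
stripSpecial = lambda s: s
print(analyze("The cats were sleeping on the mats", stop=True, stem=False, wstem=True))
# With stop=True the English stop words drop out; with wstem=True the trailing
# 'ing', 's' and 'e' suffixes are trimmed from words of length >= 4.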
class word_lemmatiser:
    def __init__(self, language):
        self.language = language
        if self.language == "eng":
            self.model = WordNetLemmatizer()
        elif self.language == "nso":
            self.model = RegexpStemmer('ng$', min=4)
        else:
            self.model = None

    def lemma(self, x):
        if self.language == "eng":
            return self.model.lemmatize(x[0])
        elif self.language == "nso":
            return self.model.stem(x[0].lower())
        elif self.language == "zul":
            return x[2]
        else:
            return x[0]

    def identity(self, word):
        return word
def word_refiner(*args):
    Portst = PorterStemmer()
    Landst = LancasterStemmer()
    Regst = RegexpStemmer('ing|ed|ly|lly')
    args = [i for i in args if isinstance(i, unicode)]
    for w in map(str, args):
        if w in dic1:
            yield w
        else:
            st1 = Portst.stem(w)
            if st1 in dic1:
                yield st1
            else:
                st2 = Landst.stem(w)
                if st2 in dic1:
                    yield st2
                else:
                    st3 = Regst.stem(w)
                    if st3 in dic1:
                        yield st3
                    else:
                        yield w
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import RegexpStemmer

postammer = PorterStemmer()
print(postammer.stem('dancing'))

from nltk.stem import WordNetLemmatizer
lzr = WordNetLemmatizer()
print(lzr.lemmatize('dancing'))
# but if we want it treated as a verb, we pass the part of speech
print(lzr.lemmatize('dancing', pos='v'))

lstemmer = LancasterStemmer()
print(lstemmer.stem('cooking'))

# it simply cuts off the part of the word that matches the regular expression
Rexpress = RegexpStemmer('er')
print(Rexpress.stem('cooker'))
import nltk
from nltk.stem import RegexpStemmer

st1 = RegexpStemmer('ing')
print("Learning - ", st1.stem('Learning'))
print("Singing - ", st1.stem('Singing'))
print()

st2 = RegexpStemmer('na')
print("Banana - ", st2.stem('Banana'))
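# Added aside (a sketch, not part of the original example): RegexpStemmer deletes
# every substring that matches the pattern, which is why the unanchored 'na' above
# strips both occurrences in 'Banana' and leaves 'Ba'. Anchoring the pattern with
# '$' limits removal to a true suffix.
st3 = RegexpStemmer('na$')
print("Banana - ", st3.stem('Banana'))  # only the final 'na' is removed -> 'Bana'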
class MiningService(Base):
    def __init__(self, db):
        super().__init__(db)
        self.nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
        # Remove affixes from a word: it's -> it, we'll -> we
        stemmer_pattern = r"’s$|n’t$|’ll$|’re$|’ve$|’d$|’m$|'s$"
        stemmer_pattern += r"|n't$|'ll$|'re$|'ve$|'d$|'m$|"
        self.stemmer = RegexpStemmer(stemmer_pattern)
        # Part-of-speech tagger
        self.tagger = nltk.tag.pos_tag
        self.wordnetlemmatize = WordNetLemmatizer()
        self._stop_words = None
        self._junk_symbols = None
        self._proper_nouns = None

    def start(self):
        nltk.download('averaged_perceptron_tagger', download_dir=self.nltk_data_path)
        nltk.download('wordnet', download_dir=self.nltk_data_path)
        nltk.data.path.append(self.nltk_data_path)
        super().start()

    async def stop_words(self):
        if self._stop_words is None:
            docs = await self.db.stop_words.find({}, {'_id': 0}).to_list(None)
            self._stop_words = [doc['word'] for doc in docs]
        return self._stop_words

    async def junk_symbols(self):
        if self._junk_symbols is None:
            docs = await self.db.junk_symbols.find({}, {'_id': 0}).to_list(None)
            self._junk_symbols = [doc['symbol'] for doc in docs]
        return self._junk_symbols

    async def proper_nouns(self):
        if self._proper_nouns is None:
            docs = await self.db.proper_nouns.find({}, {'_id': 0}).to_list(None)
            self._proper_nouns = [doc['word'] for doc in docs]
        return self._proper_nouns

    async def parse_sentence(self, uuid, tuid, suid, sentence):
        words = []
        for w in (w.rstrip('’') for w in word_tokenize(sentence.lower())):
            if w.strip():
                words.append(w)
        data = {'suid': suid, 'uuid': uuid, 'tuid': tuid, 'words': words}
        data['lemmas'] = await self.mine_lemma_data(data['words'])
        return {'status_code': 200, 'data': data}

    async def mine_lemma_data(self, words):
        data = []
        for word, treebank_tag in self.tagger(words):
            lemword, part_of_speech = self.normalize_word(word.lower(), treebank_tag)
            if part_of_speech == 'proper_noun':
                continue
            if ((await self._is_not_stop_word(lemword)) and
                    (await self._is_not_junk_symbol(lemword)) and
                    (await self._is_not_proper_noun(lemword)) and
                    self._is_legible_word(lemword)):
                data.append({'lemword': lemword, 'part_of_speech': part_of_speech})
        return data

    async def _is_not_stop_word(self, word):
        return word not in (await self.stop_words())

    async def _is_not_junk_symbol(self, word):
        return word not in (await self.junk_symbols())

    async def _is_not_proper_noun(self, word):
        return word not in (await self.proper_nouns())

    @staticmethod
    def _is_legible_word(word):
        return re.search(r'^[a-zA-Z].*', word) is not None

    def normalize_word(self, word, treebank_tag):
        """Lemmatizing and stemming words.

        stemming: remove affixes from a word, e.g. we'll -> we
        lemmatizing: bring a word to its root, e.g. ran -> run, looking -> look
        """
        wordnet_pos, part_of_speech = self.get_wordnet_pos(treebank_tag)
        if wordnet_pos == wordnet.NOUN and part_of_speech == 'proper':
            return word, 'proper_noun'
        lemword = self.wordnetlemmatize.lemmatize(word, wordnet_pos)
        return self.stemmer.stem(lemword), part_of_speech

    @staticmethod
    def get_wordnet_pos(treebank_tag):
        """Treebank part-of-speech tagging correspondence to wordnet"""
        if treebank_tag == 'NNP':
            return wordnet.NOUN, 'proper'
        # JJ-adjective
        # JJR-adjective, comparative
        # JJS-adjective, superlative
        elif treebank_tag.startswith('J'):
            return wordnet.ADJ, 'adj'
        # VB-verb, base form
        # VBD-verb, past tense
        # VBG-verb, gerund or present participle; VBN-verb, past participle
        # VBP-verb, non-3rd person singular present
        # VBZ-verb, 3rd person singular present
        elif treebank_tag.startswith('V'):
            return wordnet.VERB, 'verb'
        # RB-adverb
        # RBR-adverb, comparative
        # RBS-adverb, superlative
        # RP-particle
        elif treebank_tag.startswith('R'):
            return wordnet.ADV, 'adv'
        # NN-noun
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN, 'noun'
        # default
        else:
            return wordnet.NOUN, ''
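# Standalone sketch (added for illustration; it mirrors normalize_word above without
# the database plumbing): the regex stemmer strips clitics such as 's / n't / 'll,
# while WordNetLemmatizer maps inflected forms back to their lemma.
from nltk.stem import RegexpStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

_stemmer = RegexpStemmer(r"'s$|n't$|'ll$|'re$|'ve$|'d$|'m$")
_lemmatizer = WordNetLemmatizer()

print(_stemmer.stem("we'll"))                          # -> we
print(_lemmatizer.lemmatize("looking", wordnet.VERB))  # -> look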
more_stop_en = set(get_stop_words("english"))
more_stop_es = set(get_stop_words("spanish"))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))
adj = Adjectives()
tdm = textmining.TermDocumentMatrix()
room = 0
# for line in var:
for line in sys.stdin:
    room += 1
    line = line.replace("á", "a").replace("é", "e").replace("í", "i").replace("ó", "o").replace("ú", "u")
    line = re.sub("[0-9,#,!,¡,&,.]", "", line)
    line = re.sub("[^a-zA-Z]", " ", line)
    words = line.lower().split()
    st = RegexpStemmer("ing$|s$|able$|thing$|ful$", min=4)
    words = [st.stem(w) for w in words]
    words = [w for w in words if not w in stop_en and not w in stop_es]
    words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
    words = [w for w in words if len(w) > 2]
    tdm.add_doc(" ".join(words))
    good_count = [words.count(ad) for ad in adj.good if ad in words]
    good_count = len(good_count)
    bad_count = [words.count(ad) for ad in adj.bad if ad in words]
    bad_count = len(bad_count)
    print "Comentario " + str(room) + ": Sentiment Score: " + str(good_count - bad_count) + "\n"

for row in tdm.rows(cutoff=1):
def OutputRelations(abstractFileName, seta, negSet, neutralSet, negationSet, posSet, fullNames, threshold):
    # added threshold in input format
    # recent change: no longer using filename for abstract. instead, input the string of the abstract
    import nltk
    import copy
    import re
    from nltk.stem.lancaster import LancasterStemmer
    from nltk.stem import RegexpStemmer

    sentencedb = dict()
    fullnamestore = dict()
    a = readf(fullNames)
    for i in a:
        i = i.split(";")
        if len(i) > 1:
            # storing the full names, using the short symbols as dict keys
            fullnamestore[i[0]] = i[1]
        else:
            fullnamestore[i[0]] = "none"
    # sentencedb indexes the sentences by a unique identifier (int)

    def isGene(x, t, sentence):
        # checks if gene 'x' in a list of tokens 't' is really a gene or a variable with the same name
        if len(t) > 1 and len(x) > 2:
            if t.index(x) == 0:
                if t[t.index(x) + 1] in [">", "<", "=", "score"]:
                    return False
            elif t.index(x) == len(t) - 1:
                if t[t.index(x) - 1] in [">", "<", "=", "score"]:
                    return False
            elif (t[t.index(x) + 1] in [">", "<", "=", "score"]) or (t[t.index(x) - 1] in [">", "<", "=", "score"]):
                return False
            elif (t[t.index(x) + 1], t[t.index(x) - 1]) == (")", "("):
                if x in fullnamestore:
                    if fullnamestore[x] != "none":
                        fullLength = len(fullnamestore[x])  # full length is length of full name
                        if t.index(x) > len(fullnamestore[x]) + 2:
                            if sentence[(t.index(x) - 1 - fullLength):(t.index(x) - 1)] == fullnamestore[x]:
                                return True
                            else:
                                return False
                    else:
                        return True
            return True
        else:
            return False

    def countgenes(s, geneset):
        # counts the number of unique genes in a sentence "s"
        ss = nltk.word_tokenize(s)
        numgenes = 0
        existingGenes = []
        for i in ss:
            if i in geneset and isGene(i, ss, s) and i not in existingGenes:
                numgenes += 1
                existingGenes.append(i)
        return numgenes

    def countWords(gene1, gene2, token):
        # counts the words between gene1 and gene2
        count = 0
        for i in xrange(token.index(gene1) + 1, token.index(gene2) - 1):
            count += 1
        return count

    # abstracts = open(abstractFileName,"r")
    storage = dict()
    b = []
    for x in abstractFileName.split("\n\n"):
        x = x.replace("\n", " ")
        b.append(x)
    parsedB = []
    for line in b:
        if len(line) > 0:
            parsedB.append(line)
    b = parsedB
    sentencelist = re.split("\. (?=[A-Z])", b[-2])
    sentencelistcopy = copy.deepcopy(sentencelist)
    l = len(sentencelist)
    for i in xrange(l):
        if countgenes(sentencelistcopy[i], seta) < 2:
            sentencelist.remove(sentencelistcopy[i])
    storage[b[-1].split()[1]] = sentencelist
    # abstracts.close()

    num_genes = 0
    bw = 0
    gene_names = seta
    st = RegexpStemmer('ing$|s$|e$|ed$|es$', min=4)

    def findsuf(string, x):
        a = ""
        for i in xrange(x):
            a += string[len(string) - 1 - (x - i - 1)]
        return a

    finalOutput = []
    for id in storage:
        countsentences = 0
        for sentence in storage[id]:
            rlist = [0, 0, 0]
            tokens = nltk.word_tokenize(sentence)
            tokenscopy = copy.deepcopy(tokens)
            tagged = nltk.pos_tag(tokens)
            for x in tagged:
                if x[1] in ['VBP', 'VBN', 'VBZ', 'VBG', 'VB']:
                    tokenscopy[tagged.index(x)] = st.stem(x[0])
            store = 0
            genes = []
            relation = 2
            currentlist = []
            direction = 0
            for x in tokens:
                if x in gene_names and x not in currentlist and isGene(x, tokens, sentence):
                    genes.append(x)
                    num_genes += 1
                    currentlist.append(x)
            in1 = tokens.index(genes[0])
            in2 = tokens.index(genes[1])
            indexx = 0
            neg = 1
            if countWords(genes[0], genes[1], tokenscopy) <= threshold:
                for i in xrange(in1 + 1, in2):
                    if tokenscopy[i] in posSet:
                        relation = 1
                    elif tokenscopy[i] in negSet:
                        relation = -1
                    # elif tokenscopy[i] in neutralSet:
                    #     relation = 0
                    if (tokenscopy[i] in negSet or tokenscopy[i] in posSet):
                        for y in xrange(in1 + 1, tokenscopy.index(tokenscopy[i])):
                            if tokenscopy[y] == "not":
                                relation = 0  # 2 means neutral
                        if findsuf(tokens[i], 2) == "ed":
                            direction = 1
                        else:
                            direction = 0
            if direction == 0:
                rlist = [genes[0], genes[1], relation]
            elif direction == 1:
                rlist = [genes[1], genes[0], relation]
            # if relation!="none":
            if True:
                # the above condition is so that it does not output sentences for which no relation
                # has been found. This makes analysis easier. Must change this during final program.
                sentencedb[countsentences] = sentence
                # use this to have the sentences represented by a number; change id to pmid
                finalOutput.append([id, sentence, rlist[0], rlist[1], rlist[2]])
                # use this to have the actual sentences in the output
                # finalOutput.append([id,countsentences,rlist])
            countsentences += 1
    return finalOutput
#!/usr/bin/env python
# coding: utf-8

# # Task-6
# ## A. TYPES OF STEMMERS
# ### I. REGEX STEMMER

# In[1]:

import nltk
from nltk.stem import RegexpStemmer

stemmerregexp = RegexpStemmer('ing')
stemmerregexp.stem('running')

# ### II. SNOWBALL STEMMER

# In[7]:

import nltk
from nltk.stem import SnowballStemmer

SnowballStemmer.languages
frstemmer = SnowballStemmer('french')
frstemmer.stem('manges')

# ### III. LANCASTER STEMMER
# The functions of the LancasterStemmer class are just like the functions of the
# PorterStemmer class, but can produce slightly different results. It is known to
# be slightly more aggressive than the PorterStemmer functions:
print()
print("method 2")
from nltk.stem import LancasterStemmer
s = LancasterStemmer()
print(s.stem('cooking'))
print(s.stem('cookery'))

# The RegexpStemmer class
# You can also construct your own stemmer using the RegexpStemmer class. It takes a
# single regular expression and removes any prefix or suffix that matches the expression:
from nltk.stem import RegexpStemmer
stemmer = RegexpStemmer('ed')
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))
print(stemmer.stem('ingleside'))

# SNOWBALLSTEMMER. The SnowballStemmer class supports 15 non-English languages. It also provides two
# English stemmers: the original porter algorithm as well as the new English stemming algorithm.
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
print(len(SnowballStemmer.languages))
st = SnowballStemmer('english')
print(st.stem('cooking'))
print(st.stem('studied'))
# In[7]:

import nltk
from nltk.stem import LancasterStemmer

stemmerlanc = LancasterStemmer()
stemmerlanc.stem('darling')  # doesn't work here as well

# In[8]:

from nltk.stem import RegexpStemmer

regexpStemmer = RegexpStemmer('ing')
regexpStemmer.stem('dancing')  # doesn't support

# In[10]:

import nltk
from nltk.stem import SnowballStemmer

SnowballStemmer.languages
frenchstemmer = SnowballStemmer('french')
frenchstemmer.stem('manges')

# In[11]:
print(stemmerporter.stem('cooking') == 'cook')
print(stemmerporter.stem('cookery') == 'cookeri')
print(stemmerporter.stem('working'))
print(stemmerporter.stem('happiness'))

# 3_2
stemmerlan = LancasterStemmer()
print(stemmerlan.stem('cooking') == 'cook')
print(stemmerlan.stem('cookery') == 'cookery')
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))
print(stemmerlan.stem('achievement'))

# 3_3
stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('cooking') == 'cook')
print(stemmerregexp.stem('cookery') == 'cookery')
print(stemmerregexp.stem('ingleside') == 'leside')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))

# 3_4
print(SnowballStemmer.languages == ('danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish'))
stemmerspanish = SnowballStemmer('spanish')
print(stemmerspanish.stem('hola') == 'hol')
print(stemmerspanish.stem('comiendo'))
def regexStemmer(self, term):
    v_sufixos = ['ando', 'endo', 's', 'é']
    expr = 's$|es$'
    stemmer = RegexpStemmer(expr)
    return stemmer.stem(term)
# Porter stemming algorithm
stemerporter = PorterStemmer()
print(stemerporter.stem('working'))
print(stemerporter.stem('happiness'))

# Lancaster stemming algorithm, covers more word forms than Porter
from nltk.stem import LancasterStemmer
stemmerlan = LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))

# Build your own stemmer
from nltk.stem import RegexpStemmer
stemmerregexp = RegexpStemmer('ing')  # remove 'ing'
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('inghappiness'))

# Stemming for 14 languages besides English
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
# arabic, danish, dutch, finnish, french, german,
# hungarian, italian, norwegian, porter, portuguese,
# romanian, russian, spanish, swedish
spanishstemmer = SnowballStemmer('spanish')
print(spanishstemmer.stem('comiendo'))
frenchstemmer = SnowballStemmer('french')
print(frenchstemmer.stem('manger'))


def oo():
# feature_set = FeatureSet( i, genres[0]['name'] )
# feature_set = FeatureSet( i, genres['name'] )
# print( genres[0]['name'] )
if feature_set._genre not in unique_genres.keys():
    unique_genres.update({feature_set._genre: 1})
else:
    unique_genres[feature_set._genre] = unique_genres[feature_set._genre] + 1

for word in tokenized_string:
    if word not in stop_words:
        # make plurals singular
        token = stemmer.stem(word)
        # tokens.append( token )
        feature_set.incrementFrequency(token)
        if token not in vocabulary.keys():
            # print( token )
            document_list = []
            vocabulary.update({token: [i]})
        else:
            if i not in vocabulary[token]:
                vocabulary[token].append(i)

# print( feature_set._genre )
feature_sets.append(feature_set)
# print( vocabulary )

# calculate genre probability
more_stop_en = set(get_stop_words('english'))
more_stop_es = set(get_stop_words('spanish'))
stop_es = set(stopwords.words("spanish"))
stop_en = set(stopwords.words("english"))
adj = Adjectives()
room = 0
# for line in var:
for line in sys.stdin:
    room += 1
    line = line.replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o').replace('ú', 'u')
    line = re.sub('[0-9,#,!,¡,&,.]', '', line)
    line = re.sub('[^a-zA-Z]', " ", line)
    words = line.lower().split()
    st = RegexpStemmer('ing$|s$|able$|thing$|ful$', min=4)
    words = [st.stem(w) for w in words]
    words = [w for w in words if not w in stop_en and not w in stop_es]
    words = [w for w in words if not w in more_stop_en and not w in more_stop_es]
    words = [w for w in words if len(w) > 2]
    for i in words:
        if i not in dictionario:
            dictionario[i] = 1
        else:
            a = dictionario[i]
            dictionario.update({i: a + 1})
    good_count = [words.count(ad) for ad in adj.good if ad in words]
    good_count = len(good_count)
# lancaster stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()
print ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')
print ls.stem('lying')
print ls.stem('strange')

# regex stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer('ing$|s$|ed$', min=4)
print rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')
print rs.stem('lying')
print rs.stem('strange')

# snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")
print 'Supported Languages:', SnowballStemmer.languages
# autobahnen -> cars
# autobahn -> car
ss.stem('autobahnen')
sl.stem('connecting')

# In[29]:

sl.stem('connection')

# In[30]:

sl.stem('connections')

# In[31]:

from nltk.stem import RegexpStemmer
rx = RegexpStemmer('ing')
rx.stem('working')

# In[32]:

rx.stem('working')

# In[33]:

rx.stem('farming')

# In[34]:

rx.stem('happiness')

# In[35]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import csv

from nltk.stem import RegexpStemmer

if __name__ == '__main__':
    patterns = 'i$|t$'
    regexp_stemmer = RegexpStemmer(patterns, 3)
    result_list = list()
    for word in ['Péter', 'szereti', 'Enikőt', 'és', 'Marit']:
        stem = regexp_stemmer.stem(word)
        result_list.append([word, stem])
    with open('output/regexp.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'stem'])
        for i in result_list:
            writer.writerow(i)
    print('See the result in output/regexp.csv')
print("lowcase and split : ") df['comments'].head(10) # Remove stopwords stop = stopwords.words('english') df['comments'] = df['comments'].apply( lambda x: [item for item in x if item not in stop]) print("Remove stopwords : ") df['comments'].head(10) # Stemming from nltk.stem import RegexpStemmer st = RegexpStemmer('ing$|s$|e$|able$', min=4) for x in df['comments']: for y in x: y = st.stem(y) print("Stemming : ") df['comments'].head(10) # Remove Strings which length > 3 df['comments'] = df['comments'].apply( lambda x: [item for item in x if len(item) > 3]) print("Remove Strings which length > 3 : ") df['comments'].head(10) pre_end = time.time() print("It cost %f sec" % (pre_end - pre_start)) """ Group Comments by the column of 'listing_id' """ df2 = df[['listing_id', 'comments']].copy()
stem_word_list = [ls.stem(w) for w in words_list]
print(stem_word_list.count('jump'))
print(stem_word_list)
print(ls.stem("lying"))
print(ls.stem("strange"))

"""
There are several other stemmers, including RegexpStemmer, where you can build your
own stemmer based on user-defined rules, and SnowballStemmer, which supports stemming
in 13 different languages besides English.
"""

# Regex-based stemmer
from nltk.stem import RegexpStemmer
rs = RegexpStemmer("ing$|s$|ed$", min=4)
for w in words_list:
    print(rs.stem(w))
print(rs.stem("lying"))
print(rs.stem("strange"))

# Snowball stemmer
from nltk.stem import SnowballStemmer
ss = SnowballStemmer("german")
print("supported languages are :", SnowballStemmer.languages)
german_cars = "autobahnen"
print(ss.stem(german_cars))
wh_tokenizer = WhitespaceTokenizer()
wh_tokenizer.tokenize(sentence5)

# 5. WordPunct Tokenizer
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokenizer.tokenize(sentence5)

# Regexp Stemmer
sentence6 = "I love playing Cricket. Cricket players practice hard."
from nltk.stem import RegexpStemmer
regex_stemmer = RegexpStemmer('ing$')
' '.join([regex_stemmer.stem(wd) for wd in sentence6.split()])

# Porter Stemmer
sentence7 = "Before eating, it would be nice to sanitize your hands with a sanitizer"
from nltk.stem.porter import PorterStemmer
ps_stemmer = PorterStemmer()
' '.join([ps_stemmer.stem(wd) for wd in sentence7.split()])

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# <h2>Stemming Words</h2>
# <p>Stemming is the process of removing <em>affixes</em> from a word to obtain its root, or <em>stem</em>. For example, the stem of <strong>growing</strong> is <strong>grow</strong>.</p>
# <p>NLTK includes 4 stemming algorithms, 3 of which are demonstrated below. The fourth, <em>Snowball</em>, is for non-English languages
# and is not covered here but is in the text.</p>

# <codecell>

from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
reg = RegexpStemmer('ing')
g = 'growing'
print 'Porter yields: ', porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# <p>The output of various words can be different between stemmers:</p>

# <codecell>

g = 'cookery'
print 'Porter yields: ', porter.stem(g)
print 'lancaster yields: ', lancaster.stem(g)
print 'Regexp yields: ', reg.stem(g)

# <markdowncell>

# <h2>Lemmatizing</h2>
import nltk
from nltk.stem import RegexpStemmer

stemmerregexp = RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer
from nltk.stem.snowball import EnglishStemmer

stemmer = PorterStemmer()
print(stemmer.stem('cooking'))
print(stemmer.stem('cookery'))

stemmer2 = LancasterStemmer()
print(stemmer2.stem('cooking'))
print(stemmer2.stem('cookery'))

stemmer3 = SnowballStemmer('english')
print(stemmer3.stem('cooking'))
print(stemmer3.stem('cookery'))

# english is also Porter.
stemmer_en = EnglishStemmer()
print(stemmer_en.stem('cooking'))
print(stemmer_en.stem('cookery'))

# regex
stemmer_reg = RegexpStemmer('ing')
print(stemmer_reg.stem('cooking'))
print(stemmer_reg.stem('thing'))
def get_prescription(text):
    pstem = PorterStemmer()
    with open("symptoms.txt") as f:
        symptoms = f.readlines()
    finalsyns = []
    for word in symptoms:
        syns = wordnet.synsets(word.strip())
        syns = [s.lemma_names() for s in syns]
        merged = list(itertools.chain(*syns))
        if len(merged) == 0:
            finalsyns = finalsyns + [pstem.stem(word.strip())]
            # print(finalsyns)
        else:
            finalsyns = finalsyns + merged
    finalsyns = [f.replace('\n', '') for f in finalsyns]
    finalsyns = list(dict.fromkeys(finalsyns))
    # print(finalsyns)

    pstem = PorterStemmer()
    rstem = RegexpStemmer('\(s\)')

    def words_in_string(word_list, a_string):
        return set(word_list).intersection(a_string)

    with open("Amount.txt") as f:
        Amount = f.readlines()
    Amount = [rstem.stem(x.strip()).split(' - ') for x in Amount]
    Amount = list(chain(*Amount))

    prescription_dataset = tuple(open("dataset2.txt", 'r'))

    with open("Frequency.txt") as f:
        frequency = f.readlines()
    frequency = [rstem.stem(x.strip()).split(' - ') for x in frequency]
    frequency = list(chain(*frequency))

    schedule = {}
    with open("schedule.txt") as f:
        for line in f:
            # print(line.split(':'))
            s = line.split(':')
            schedule[s[0].strip().lower()] = s[1].strip()

    data = {}
    prescription = text.lower()
    prescription_tokenized = [word.replace(".", "").replace("(", "").replace(")", "") for word in prescription.split()]
    prescription_tokenized_final = [pstem.stem(word) for word in prescription_tokenized]
    print(prescription)
    data.update({'prescription': prescription})

    amount = ""
    for word in Amount:
        if pstem.stem(word) in prescription_tokenized_final or rstem.stem(word) in prescription_tokenized:
            index = prescription_tokenized_final.index(pstem.stem(word))
            amount = prescription_tokenized[index - 1] + " " + prescription_tokenized[index]
    if amount == "":
        print("Amount not mentioned!")
    else:
        print("Amount : " + amount)

    freq = ""
    timing = ""
    if "every" in prescription or "each" in prescription:
        if "every" in prescription:
            ei = prescription_tokenized.index("every")
            re = "every"
        elif "each" in prescription:
            ei = prescription_tokenized.index("each")
            re = "each"
        st = ["minutes", "minute", "hours", "hour", "meal", "day", "days", "morning", "evening", "afternoon"]
        for i in range(0, 10):
            s = st[i]
            if s in prescription_tokenized:
                ti = prescription_tokenized.index(s)
                if ti - ei == 2:
                    freq = re + " " + prescription_tokenized[ei + 1] + " " + s
                elif ti - ei == 1 and i >= 7:
                    freq = re + " " + s
                if schedule.get(freq.strip()) is not None:
                    timing = schedule.get(freq.strip())
    for word in frequency:
        if word in prescription and word.strip() != "":
            freq = freq + " " + word
            if schedule.get(word.strip()) is not None:
                timing = schedule.get(word.strip())
    if freq == "":
        print("No Frequency mentioned!")

    symptoms = ""
    for s in finalsyns:
        if s in prescription:
            symptoms += " " + s

    _check = ["", None]
    if freq in _check:
        return {'error': "No prescription found!"}
    data.update({"Amount": amount, "Symptoms": symptoms, "Frequency": freq, "Timings": timing})
    return data
print(ls.stem("jumping")) print(ls.stem("jumps")) print(ls.stem("jumper")) print(ls.stem("strange")) print(ls.stem("stranger")) print(ls.stem("lying")) # REGEXP STEMMER # Uses regular expressions to identify the morphological affixes in words and any part of the # string matching the same is removed # Note that this stemmer is case sensitive (won't work on capitalized affixes) rs = RegexpStemmer(r"ing$|s$|ed$", min=4) print(rs.stem("jumping")) print(rs.stem("colored")) print(rs.stem("lying")) # SNOWBALL STEMMER # Stems words in a dozen of languages. http://snowballstem.org ss = SnowballStemmer(language="german") print("Supported languages: {}".format(SnowballStemmer.languages)) print(ss.stem("autobahnen")) print(ss.stem("endlich")) print(ss.stem("unglaublich")) print(ss.stem("untergehen")) print(ss.stem("hauschen"))
""" $$ LancasterStemmer - Most Aggressive. LancasterStemmer is mostly used for the cases where the data or text is very huge, but your accuracy might falldown because of its most aggressive nature. """ ######## RegexpStemmer ########### rstemmer = RegexpStemmer('ing') ## remove all the letters except for a given word rstemmer.stem('cooking') rstemmer.stem('dancing') rstemmer.stem('king') ## as you can only k is given if we have given king... ## so should be more carefull about it. """ That's the End of Stemming concept. If you have any questions or suggestions regarding the concept, feel free to contact me via [email protected] """