def baseline(qbow, text, stopwords):
    # Collect all the candidate answers
    answers = []
    stemmer = nltk.LancasterStemmer()
    # Stem the question bag of words, then add the lemmatized forms as well
    qbow = set([stemmer.stem(word) for word in qbow])
    qbow.update(set(lemmatizer(qbow)))
    print(qbow)
    for f in text:
        for sent in f:
            # A list of all the word tokens in the sentence
            sbow = get_bow(sent, stopwords)
            # Stem all questions and sentences for better results,
            # then add the lemmatized forms as well
            sbow = set([stemmer.stem(word) for word in sbow])
            sbow.update(set(lemmatizer(sbow)))
            print(sbow)
            # Count the number of overlapping words between the Q and the A
            # (& is the set intersection operator)
            overlap = len(qbow & sbow)
            print(c.OKGREEN + "overlap: " + c.ENDC + str(overlap))
            answers.append((overlap, sent))
    # Sort the results by the first element of the tuple (i.e., the count).
    # sorted() goes from smallest to largest by default, so reverse it.
    answers = sorted(answers, key=operator.itemgetter(0), reverse=True)
    # print(answers)
    # Return the best answer
    best_answer = answers[0][1]
    return best_answer

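# Example (illustrative sketch, not from the original source): the overlap scoring
# at the heart of baseline(), shown in isolation without the get_bow / lemmatizer
# helpers. Assumes nltk is imported; the question and sentence are invented.
stemmer = nltk.LancasterStemmer()
qbow = {stemmer.stem(w) for w in ["When", "was", "the", "old", "bridge", "built"]}
sbow = {stemmer.stem(w) for w in ["The", "bridge", "was", "built", "in", "1932"]}
print(len(qbow & sbow))  # number of shared stems = the candidate's score
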
def main():
    url = "http://www.networksciencelab.com"
    with urlopen(url) as doc:
        soup = BeautifulSoup(doc, 'html.parser')
    links = [(link.string, link['href'])
             for link in soup.find_all('a') if link.has_attr('href')]
    # print(links)
    ls = nltk.LancasterStemmer()
    # Word-tokenize the page text
    words = nltk.word_tokenize(soup.text)
    # Lowercase
    words = [w.lower() for w in words]
    # Remove stopwords and non-alphanumeric tokens, then stem
    words = [ls.stem(w) for w in words
             if w not in stopwords.words('english') and w.isalnum()]
    # Count term frequencies
    freqs = Counter(words)
    print(freqs.most_common(10))

def stem(tokens):
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()  # instantiated for comparison, unused here
    # Return the Porter stems of all non-stopword tokens
    return [porter.stem(t) for t in tokens if t not in stopwords]

def basics(file):
    with open(file) as f:
        raw = f.read()
    # The number of characters in the text
    print("chars=>%(len_raw)s" % {"len_raw": len(raw)})
    tokens = nltk.word_tokenize(raw)
    # The number of words in the text
    print("len_tokens=>%(len_tokens)s" % {"len_tokens": len(tokens)})
    # The number of sentences in the text
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(raw)
    print("sentences=>%(len_sents)s" % {"len_sents": len(sents)})
    porter = nltk.PorterStemmer()
    porter_stems = [porter.stem(t) for t in tokens]
    print("porter_stems=>%(len_stems)s" % {"len_stems": len(porter_stems)})
    lancaster = nltk.LancasterStemmer()
    lancaster_stems = [lancaster.stem(t) for t in tokens]
    print("lancaster_stems=>%(len_stems)s" % {"len_stems": len(lancaster_stems)})
    wnl = nltk.WordNetLemmatizer()
    wnl_stems = [wnl.lemmatize(t) for t in tokens]
    print("wnl_stems=>%(len_stems)s" % {"len_stems": len(wnl_stems)})

def preproc_txt(doc, stemm):
    '''
    Return a string with English stopwords and words shorter than four
    characters removed. Depending on the stemm flag, the remaining tokens can
    also be lemmatized with the WordNet lemmatizer (1) or stemmed with the
    Porter (2) or Lancaster (3) algorithms.
    '''
    tokens = nltk.word_tokenize(doc)
    stpw = [word for word in tokens
            if word not in stopwords.words('english') and len(word) > 3]
    if stemm == 1:
        lemma = nltk.WordNetLemmatizer()
        stmw = [lemma.lemmatize(word) for word in stpw]
        text = nltk.Text(stmw)
    elif stemm == 2:
        stemmer = nltk.PorterStemmer()
        stmw = [stemmer.stem(word) for word in stpw]
        text = nltk.Text(stmw)
    elif stemm == 3:
        stemmer = nltk.LancasterStemmer()
        stmw = [stemmer.stem(word) for word in stpw]
        text = nltk.Text(stmw)
    else:
        text = nltk.Text(stpw)
    pproc_txt = ' '.join(text)
    return pproc_txt

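# Example usage (illustrative sketch, not from the original source): assumes nltk
# and stopwords are imported as above; the sample sentence is invented.
doc = "The runners were happily running through the flooded streets"
print(preproc_txt(doc, 0))  # stopword/length filtering only
print(preproc_txt(doc, 1))  # + WordNet lemmatizer
print(preproc_txt(doc, 2))  # + Porter stemmer
print(preproc_txt(doc, 3))  # + Lancaster stemmer
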
def tweet_obrabiarka(tweet, hashowac, stemmer):
    """
    This function is meant to be used outside of the module; it performs
    complete tweet preprocessing.
    :param tweet: one tweet as a string
    :param hashowac: indicates whether hashtags should be used:
        "0" - hashtags are not taken into account,
        "1" - hashtags are treated as normal words
    :param stemmer: lets you choose the type of stemmer:
        "1" - Porter stemmer (less aggressive, output words look more like
              natural words, but not always),
        "2" - Lancaster stemmer (more aggressive, output words look less
              natural),
        "-1" - lemmatization (output words look like normal words, but it is
               more time-consuming)
    :return: the list of preprocessed tokens
    """
    tt = tweet_tokenizator(tweet)
    rp = remove_punctuation(tt, hashowac)
    rs = remover_stopwords(rp)
    if stemmer == 1:
        ps = nl.PorterStemmer()
    if stemmer == 2:
        ps = nl.LancasterStemmer()
    if stemmer == -1:
        ps = nl.WordNetLemmatizer()
    output_list = []
    for word in rs:
        word = word.lower()
        if stemmer > 0:
            word = ps.stem(word)
        if stemmer < 0:
            word = ps.lemmatize(word)
        output_list.append(word)
    return output_list

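# Example usage (illustrative sketch, not from the original source): the tweet is
# invented, and the helpers tweet_tokenizator / remove_punctuation /
# remover_stopwords are assumed to be defined elsewhere in the same module.
tweet = "Loving the new #nltk release, stemming tweets is so much easier now!"
print(tweet_obrabiarka(tweet, hashowac=1, stemmer=2))   # Lancaster stems
print(tweet_obrabiarka(tweet, hashowac=0, stemmer=-1))  # WordNet lemmas
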
def descendantofWords():
    words = filterTokenWords()
    lancaster = nltk.LancasterStemmer()
    # Stem every filtered token with the Lancaster stemmer
    descwords = []
    for w in words:
        descwords.append(lancaster.stem(w))
    return descwords

def stemmer(tokens):
    """Stem the document's tokens."""
    st = nltk.LancasterStemmer()
    tokens = [st.stem(word) for word in tokens]
    return tokens

def stem_words(words):
    stemmer = nltk.LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def newstemmer(words, stemmer, threshold=20):
    """A stemmer that uses the Lancaster/Porter stemmer plus a dictionary."""
    import pickle
    import nltk
    with open('spindle-code-master/keywords/bnc.p', 'rb') as f:
        bncwordlist = pickle.load(f)
    bnc_commonwords = {k for (k, v) in bncwordlist.items() if v > threshold}
    # If words is a raw string, tokenise it
    if isinstance(words, str):
        tokens = nltk.word_tokenize(words)
    # Or, if it is already a list of tokens, use it as-is
    else:
        tokens = words
    # Define the stemmer based on the argument we passed in
    if stemmer == 'Lancaster':
        stemmertouse = nltk.LancasterStemmer()
    if stemmer == 'Porter':
        stemmertouse = nltk.PorterStemmer()
    # Empty list of stems
    stems = []
    for w in tokens:
        # Stem each word
        stem = stemmertouse.stem(w)
        if stem in bnc_commonwords:
            # If the stem is in the BNC list, add the stem
            stems.append(stem)
        else:
            # Otherwise, keep the word as it was
            stems.append(w)
    return stems

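# Example usage (illustrative sketch, not from the original source): assumes the
# pickled BNC frequency dictionary exists at the path hard-coded above.
text = "Strange women lying in ponds distributing swords"
print(newstemmer(text, 'Lancaster'))
print(newstemmer(text.split(), 'Porter', threshold=50))
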
def stem(tokens):
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    # Return both the Porter and the Lancaster stems of the tokens
    return ([porter.stem(t) for t in tokens],
            [lancaster.stem(t) for t in tokens])

def __init__(self, pattern=r'\W+', lower=False, stem=False,
             stemmer_name='porter', pos=False, ngram=1):
    # RE pattern used in tokenization
    self.pattern = pattern
    # N-gram size: default = 1
    self.ngram = int(ngram)
    # Convert terms to lower case
    self.lower = lower
    # Ignore PoS tagging and stemmers if NLTK is not installed
    if not my_nltk:
        self.pos = False
        self.stem = False
    else:
        self.pos = pos
        self.stem = stem
        self.stemmer_name = stemmer_name
        self.stemmers = {
            'lancaster': my_nltk.LancasterStemmer(),
            'porter': my_nltk.PorterStemmer()
        }
    self.frequent_terms = []

def show_word_in_context2(target_word, text, context_size=5):
    """
    Better concordance-searching tool: a stemmer is used and punctuation
    is removed.
    """
    stemmer = nltk.LancasterStemmer()
    # Target-word pre-processing
    target_stem = stemmer.stem(target_word.lower())
    # Text pre-processing
    text = text.lower()
    for punct in punctuation:
        text = text.replace(punct, " ")
    # Make a bag of words, retaining order
    words = nltk.word_tokenize(text)
    # Search and collect the context windows
    text_parts = []
    for word_num, word in enumerate(words):
        if stemmer.stem(word) == target_stem:
            start = max(word_num - context_size, 0)
            stop = word_num + context_size + 1
            text_parts.append(words[start:stop])
    return text_parts

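# Example usage (illustrative sketch, not from the original source): assumes nltk
# and punctuation (from the string module) are imported as above; the sample text
# is invented.
sample = "The runner ran quickly. Running is fun, and many runs were recorded."
for window in show_word_in_context2("run", sample, context_size=3):
    print(" ".join(window))
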
def exercise30():
    words = nltk.word_tokenize(SimpleText)
    porter = nltk.PorterStemmer()
    porter_stems = [porter.stem(word) for word in words]
    lancaster = nltk.LancasterStemmer()
    lancaster_stems = [lancaster.stem(word) for word in words]
    print("words in lancaster not in porter : ", set(lancaster_stems) - set(porter_stems))
    print("words in porter not in lancaster : ", set(porter_stems) - set(lancaster_stems))

def stem_words(words):
    """Stem words in a list of tokenized words."""
    stemmer = nltk.LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lancasterStemmer(self, txtTokens):
    """
    Use the Lancaster stemmer to stem a text
    @param txtTokens: the tokens of the text
    @rtype: {List}
    """
    lancaster = nltk.LancasterStemmer()
    return [lancaster.stem(t) for t in txtTokens]

def get_stem():
    """
    The Porter stemmer is a good choice if you are indexing some texts and
    want to support search using alternative forms of words.
    """
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    print([porter.stem(t) for t in tokens])
    print([lancaster.stem(t) for t in tokens])

def get_stemming(self, type):
    if not self.__stemming_list:
        self.__stemming_list = {
            'rslps': nltk.stem.RSLPStemmer(),
            'porter': nltk.PorterStemmer(),
            'lancaster': nltk.LancasterStemmer(),
            'english': nltk.stem.snowball.EnglishStemmer(),
            'portuguese': nltk.stem.snowball.PortugueseStemmer()
        }
    return self.__stemming_list[type]

def ch03_30_porter_vs_lancaster():
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    tokens = ["When", "all", "is", "said", "and", "done", ",",
              "more", "is", "said", "than", "done", "."]
    print("porter=", [porter.stem(w.lower()) for w in tokens])
    print("lancaster=", [lancaster.stem(w.lower()) for w in tokens])
    print("len(tokens)=", [len(token) for token in tokens])

def exercise30():
    # Exercise 30. In this question, consider SimpleText for reporting your results.
    # ◑ Use the Porter Stemmer to normalize some tokenized text, calling the
    # stemmer on each word. Do the same thing with the Lancaster Stemmer and
    # see if you observe any differences.
    tokens = nltk.word_tokenize(SimpleText)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    porter_list = [porter.stem(t) for t in tokens]
    lancaster_list = [lancaster.stem(t) for t in tokens]
    print("porter: ", porter_list)
    print("lancaster: ", lancaster_list)

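# Example comparison (illustrative sketch, not from the original source): a few
# words where the two stemmers tend to disagree, with the more aggressive
# Lancaster stemmer usually cutting words down further. Assumes nltk is imported.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
for w in ["maximum", "presumably", "multiply", "crying", "owed"]:
    print(w, porter.stem(w), lancaster.stem(w))
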
def filter_funcs_for_context(ctx):
    filter_funcs = []
    if ctx.enable_casefolding:
        filter_funcs.append(CaseFolder().call)
    if ctx.enable_stopwords:
        filter_funcs.append(StopWordFilter(BASE_STOPWORDS).call)
    if ctx.remove_nonalphanumeric:
        filter_funcs.append(AlphaNumericFilter().call)
    if ctx.enable_stemming:
        filter_funcs.append(Stemmer(nltk.LancasterStemmer()).call)
    return filter_funcs

def word_stems(string, stemmer="lancaster"): words = get_words(string) if stemmer == "lancaster": lancaster = nltk.LancasterStemmer() return [lancaster.stem(t) for t in words] elif stemmer == "porter": porter = nltk.PorterStemmer() return [porter.stem(t) for t in words] else: print( "Stemmer '%' not recognized... Try using 'lancaster' or 'porter'.")
def stem(self, Porter=True):
    '''
    Function: Stem all tokens.
    Porter: If True the Porter stemmer is used, otherwise the Lancaster
            stemmer is used.
    '''
    self.backup = self.lst_tk_lsts
    import nltk
    stemmer = nltk.LancasterStemmer()
    if Porter:
        stemmer = nltk.PorterStemmer()
    self.lst_tk_lsts = [[stemmer.stem(t) for t in l if type(t) == str]
                        for l in self.lst_tk_lsts]

def fun2():
    # NLTK includes several off-the-shelf stemmers. If you need one, use one of
    # these rather than building your own with regular expressions, because the
    # NLTK stemmers handle a wide range of irregular cases. The Porter and
    # Lancaster stemmers each strip affixes according to their own rules.
    raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government. Supreme executive power derives
    from a mandate from the masses, not from some farcical aquatic ceremony."""
    tokens = nltk.word_tokenize(raw)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    print([porter.stem(t) for t in tokens])
    print([lancaster.stem(t) for t in tokens])

def do_normalization(text):
    stemmer = nltk.LancasterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    # text = text.lower()
    text = ' '.join([stemmer.stem(s) for s in text.split(' ')])
    text = ' '.join([lemmatizer.lemmatize(s) for s in text.split(' ')])
    text = remove_punctuations(text)
    return text

def stemming(tokenized_text, Stemmer):
    '''
    Performs stemming with either the Porter or the Lancaster stemmer.
    '''
    if Stemmer == 'Porter':
        ps = nltk.PorterStemmer()
    elif Stemmer == 'Lancaster':
        ps = nltk.LancasterStemmer()
    else:
        raise ValueError("Stemmer must be 'Porter' or 'Lancaster'")
    text = [ps.stem(word) for word in tokenized_text]
    return text

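# Example usage (illustrative sketch, not from the original source): assumes nltk
# is imported as above; the sentence is invented.
tokens = nltk.word_tokenize("The children were playing happily outside")
print(stemming(tokens, 'Porter'))
print(stemming(tokens, 'Lancaster'))
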
def do_normalization(text):
    stemmer = nltk.LancasterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    # Removing this line improves the performance of the classifier after
    # normalization
    text = text.lower()
    text = ' '.join([stemmer.stem(s) for s in text.split(' ')])
    text = ' '.join([lemmatizer.lemmatize(s) for s in text.split(' ')])
    text = remove_punctuations(text)
    return text

def preprocess(text):
    """
    Tokenize -> Normalize -> Stemming -> Stopping
    :param text: String, sentence
    :return: Returns the preprocessed text as a list
    """
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    words = nltk.word_tokenize(text)
    stemmer = nltk.LancasterStemmer()
    words = [stemmer.stem(word) for word in words]
    words = [word for word in words if word not in stopwords.words('english')]
    return words

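# Example usage (illustrative sketch, not from the original source): assumes
# nltk, string and stopwords are imported as above; the sentence is invented.
print(preprocess("The Quick, Brown Foxes Jumped Over The Lazy Dogs!"))
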
def q_thirty():
    raw = "For god's sake, this is way too difficult. I need hints many, many hints. Argh!"
    tokens = nltk.word_tokenize(raw)
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    print("PorterStemmer :")
    for t in tokens:
        print(porter.stem(t), end=' ')
    print("\nLancasterStemmer :")
    for t in tokens:
        print(lancaster.stem(t), end=' ')
    print()

def search_answer(self, cnstrd_word_syn, wd_in_sent, key_wd_idx):
    """
    This function searches for the constrained word of the question.
    Parameters:
        cnstrd_word_syn (list) - the list of synonyms of the constrained word
            in the question
        wd_in_sent (list) - the word-tokenized text
        key_wd_idx (int) - the position of the key word in the sentence
    Return:
        the position of the constrained word in the sentence of the text
    """
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    # print(cnstrd_word_syn)
    for cw in cnstrd_word_syn:
        cw_seperate = []
        if '_' in cw:
            # Multi-word synonyms are stored with underscores; keep both the
            # individual parts and the space-joined form
            cw1 = cw.split('_')[0]
            cw2 = cw.split('_')[1]
            cw_seperate = [cw1, cw2]
            cw = ' '.join(cw.split('_'))
            cw_seperate.append(cw)
        for sent in wd_in_sent[key_wd_idx:]:
            """
            if cw_seperate:
                for c_s in cw_seperate:
                    if porter.stem(c_s.lower()) == porter.stem(sent.lower()) or lemma(c_s) == lemma(sent):
                        return wd_in_sent.index(sent)
            """
            if porter.stem(cw.lower()) == porter.stem(sent.lower()) \
                    or lemma(cw) == lemma(sent):
                return wd_in_sent.index(sent)
            """
            elif cw_seperate:
                for cw_s in cw_seperate:
                    if porter.stem(cw.lower()) == porter.stem(sent.lower()) or lemma(cw) == lemma(sent):
                        return wd_in_sent.index(sent)
            """
    return None