def getTokens(texts):
    # Tokenize one text or a list of texts, keep alphanumeric, non-stopword
    # tokens longer than two characters, and stem what remains.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    allTokens = []
    if not isinstance(texts, list):
        texts = [texts]
    for s in texts:
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
    allTokens_2 = [
        t.lower() for t in allTokens
        if len(t) > 2 and t.isalnum() and t not in stopwordsList
    ]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_2]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
def get_words(klass, text):
    # Count lowercased tokens longer than two characters that are not stopwords.
    tokenizer = WordPunctTokenizer()
    words = Counter()
    for word in tokenizer.tokenize(text):
        word = word.lower()
        if len(word) > 2 and word not in STOPWORDS:
            words[word] += 1
    return words
def getEntity(self, word, originalcase):
    # Search the raw sentences for the original-cased word; if it anchors a
    # multi-word entity, return that entity, otherwise fall back to the word.
    tokenizer = WordPunctTokenizer()
    for sentence in self.raw_sentences:
        words = tokenizer.tokenize(sentence.strip())
        if originalcase in words:
            entity = self.getEntityFromSentence(originalcase, words)
            if len(entity) > 1:
                return " ".join(entity)
    return originalcase
def getDocTokens(docText):
    # Tokenize a document, keep alphanumeric tokens longer than two characters,
    # drop stopwords, and stem the remainder.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    docTokens = tokenizer.tokenize(docText)
    allTokens_2 = [t.lower() for t in docTokens if len(t) > 2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
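# A hedged usage sketch for getDocTokens above. stopwordsList is defined
# elsewhere in the original module, so a stand-in is supplied here; the sample
# sentence and expected output are illustrative only.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer

stopwordsList = nltk_stopwords.words('english')  # stand-in assumption
print(getDocTokens("Researchers are tokenizing documents with NLTK tools."))
# e.g. ['research', 'token', 'document', 'nltk', 'tool']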
def getTokens(doc):
    # Read an HTML file, strip the markup, then lowercase, remove stopwords,
    # and stem the remaining tokens. Note that nltk.clean_html is only
    # available in older NLTK releases.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    webpage = open(doc, 'r').read()
    text = nltk.clean_html(webpage)
    tokens = tokenizer.tokenize(text)
    clean = [token.lower() for token in tokens if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    return final
def getTokenizedDocs(docs):
    # Tokenize each document, keep alphanumeric, non-stopword tokens longer
    # than two characters, and stem them.
    docs_tokens = []
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    for text in docs:
        tokens = tokenizer.tokenize(text)
        clean = [token for token in tokens if token.isalnum()]
        clean = [token.lower() for token in clean if token.lower() not in stopwords]
        clean = [token for token in clean if len(token) > 2]
        final = [stemmer.stem(word) for word in clean]
        docs_tokens.append(final)
    return docs_tokens
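# A minimal usage sketch for getTokenizedDocs above. The sample documents and
# the expected output are illustrative, not from the original source; it
# assumes nltk and its stopwords corpus are installed.
sample_docs = [
    "The quick brown fox jumps over the lazy dog.",
    "Foxes keep jumping over sleepy dogs.",
]
print(getTokenizedDocs(sample_docs))
# e.g. the first document becomes ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']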
class TrollAnalyzer:
    # Wraps a probabilistic text classifier and reports a score in [-1, 1]
    # for the whole sentence and for each individual token.
    def __init__(self, model):
        self.model = model
        self.tokenizer = WordPunctTokenizer()
        # Map a probability in [0, 1] onto [-1, 1].
        self.scale_score = lambda k: k * 2 - 1

    def analyze(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        predictions = {
            word: self.scale_score(self.model.predict_proba([word])[0][1])
            for word in words
        }
        total = self.scale_score(self.model.predict_proba([sentence])[0][1])
        return {"master": total, "tokenized": predictions}
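# A hedged sketch of how TrollAnalyzer might be driven. The toy training data
# and the scikit-learn pipeline below are assumptions, not part of the original
# code; any model exposing predict_proba on raw strings would work.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

texts = ["you are an idiot", "have a nice day", "nobody likes you", "thanks for the help"]
labels = [1, 0, 1, 0]  # 1 = troll-like, 0 = benign (toy labels)
model = make_pipeline(TfidfVectorizer(), LogisticRegression())
model.fit(texts, labels)

analyzer = TrollAnalyzer(model)
print(analyzer.analyze("you are an idiot"))  # sentence and per-token scores in [-1, 1]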
def convertDoctoTFIDF(self, doc):
    # Clean and stem the document, then count how often each vocabulary word
    # from the trained model occurs in it.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords_e = stopwords.words('english')
    tokens = tokenizer.tokenize(doc)
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords_e]
    clean = [token for token in clean if len(token) > 2]
    final_doc = [stemmer.stem(word) for word in clean]
    doc_tfidf = []
    words = self.model[1]
    for word in words:
        tf = final_doc.count(word)
        doc_tfidf.append((tf, word))
    return doc_tfidf
def getTokenizedDoc(doc):
    # Tokenize a single document, drop short, non-alphanumeric, and stopword
    # tokens (including a few Twitter-specific terms), then stem the rest.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(["com", "http", "retweet", "tweet", "twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    return final
def get_texts(self):
    """
    Parse documents from the .cor file provided in the constructor.
    Lowercase each document and ignore some stopwords.

    .cor format: one document per line, words separated by whitespace.
    """
    tokenizer = WordPunctTokenizer()
    for doc in self.getstream():
        yield [word for word in tokenizer.tokenize(doc.lower()) if word_ok(word)]
def extract_words(self, wid):
    """Updates db with previously unseen words and lemmas, and page unigrams"""
    words_file = gzip.open(self.words_file, 'a')
    page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
    w = WordPunctTokenizer()
    qi = QueryIterator(get_config(),
                       {'query': 'wid:%s AND iscontent:true' % str(wid),
                        'fields': 'id, wid, pageid, html_en',
                        'sort': 'id asc'})

    print 'starting extraction for wiki %s...' % str(wid)
    for doc in qi:
        print 'extracting words for %s...' % doc['id']
        page_file.write('\t%s\n' % doc['pageid'])
        for word in w.tokenize(doc.get('html_en', '').lower()):
            if word not in self.words:
                self.words[word] = self.counter
                words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                self.counter += 1
            page_file.write('%i\n' % self.words.get(word, 0))

    page_file.close()
    words_file.close()
def rouge(candidate, reference, n=2, verbose=False):
    """This is a basic implementation of ROUGE-N. It calculates the n-gram
    recall of a candidate summary against a reference summary.
    """
    tokenizer = WordPunctTokenizer()
    candidate = tokenizer.tokenize(candidate.lower())
    reference = tokenizer.tokenize(reference.lower())
    c_ngrams = set(ngrams(candidate, n))
    r_ngrams = set(ngrams(reference, n))
    cr_ngrams = [g for g in c_ngrams if g in r_ngrams]
    rouge_n = len(cr_ngrams) / len(r_ngrams)
    if verbose:
        print("{:d} matching {:d}-grams out of {:d} total.".format(
            len(cr_ngrams), n, len(r_ngrams)))
        print(cr_ngrams)
        print("ROUGE-{:d}: {:0.3f}".format(n, rouge_n))
    return rouge_n
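# A hedged usage sketch for the rouge function above. The two summaries are
# made up for illustration; it assumes ngrams (nltk.util) and
# WordPunctTokenizer are already imported, as the function itself requires.
candidate_summary = "the cat sat on the mat"
reference_summary = "the cat lay on the mat"
score = rouge(candidate_summary, reference_summary, n=2)
print("ROUGE-2 recall: {:0.3f}".format(score))
# 3 of the 5 reference bigrams also appear in the candidate -> 0.600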
class Scorer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self, doc):
        # Lowercase, drop stopwords and short tokens, then stem.
        tokens = self.tokenizer.tokenize(doc)
        clean = [
            token.lower() for token in tokens
            if token.lower() not in self.stopwords and len(token) > 2
        ]
        final = [self.stemmer.stem(word) for word in clean]
        return final

    # Note: this second __init__ shadows the zero-argument one above.
    def __init__(self, keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords

    # Checks whether the url's words contain the keywords; the score is the
    # fraction of keywords that occur in the url's text.
    def calculate_score(self, url):
        words = url.getAllText().split()
        for w in self.keywords:
            if w in words:
                self.score += 1
        self.score = self.score / float(len(self.keywords))
        return self.score
def getTokens(texts):
    # Variant of getTokens that applies the length, alphanumeric, and stopword
    # filters as separate passes before stemming.
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    allTokens = []
    if not isinstance(texts, list):
        texts = [texts]
    for s in texts:
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
    allTokens_2 = [t.lower() for t in allTokens if len(t) > 2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
def document_as_words(document):
    """
    There is probably an NLTK function somewhere that does this already, but I
    couldn't find it. This converts a single document into a list of words,
    which you can then use with the rest of these functions to get a feature
    list for classification.
    """
    stringbuf = StringIO.StringIO(document)
    word_tokenizer = WordPunctTokenizer()
    para_tokenizer = read_blankline_block
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    words = []
    for para in para_tokenizer(stringbuf):
        for sent in sent_tokenizer.tokenize(para):
            for word in word_tokenizer.tokenize(sent):
                words.append(word)
    return words
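# A hedged usage sketch for document_as_words above (Python 2, since it relies
# on StringIO.StringIO); the sample document and expected output are
# illustrative only.
doc = "A short document. It has two sentences."
print document_as_words(doc)
# e.g. ['A', 'short', 'document', '.', 'It', 'has', 'two', 'sentences', '.']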
def stem_text(text):
    # Tokenize the text and return the French (Snowball) stems joined back
    # into a single string.
    tokenizer = WordPunctTokenizer()
    stemmer = SnowballStemmer('french')
    liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    return ' '.join(liste_racines)
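# A hedged usage sketch for stem_text above; it assumes WordPunctTokenizer and
# SnowballStemmer have been imported (from nltk.tokenize and nltk.stem.snowball
# respectively), and the French sentence is illustrative only.
print(stem_text("Les chats mangent les souris"))  # -> space-joined French stems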
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()
        get_authors_query = """
            SELECT name, long_name
            FROM author
        """
        get_comments_query = """
            SELECT id, content
            FROM comment
        """
        cursor = connections[database].cursor()
        tokenizer = WordPunctTokenizer()

        self.stdout.write("Grabbing authors...")
        authors = []
        cursor.execute(get_authors_query)
        self.pbar_setup(maxval=cursor.rowcount)
        for row in dictfetch(cursor):
            authors.append(row['name'])
            if row['long_name'] != "":
                authors.append(row['long_name'])
            self.pbar_increment()
        self.pbar_destroy()

        self.stdout.write("Sorting authors...")
        authors.sort()

        self.stdout.write("Determining real parents...")
        real_parents = []
        cursor1 = connections[database].cursor()
        cursor1.execute(get_comments_query)
        self.pbar_setup(maxval=cursor1.rowcount)
        for row in dictfetch(cursor1):
            tokens = tokenizer.tokenize(row['content'])
            if len(tokens) > 0:
                if tokens[0] == '@':
                    real_parents.append(int(row['id']))
                else:
                    i = bisect_left(authors, tokens[0])
                    if i != len(authors) and authors[i] == tokens[0]:
                        real_parents.append(int(row['id']))
            self.pbar_increment()
        self.pbar_destroy()

        self.stdout.write("Non-Real-parents found: {}".format(len(real_parents)))

        cursor2 = connections[database].cursor()
        update_query = """
            UPDATE comment
            SET real_parent = (CASE WHEN id IN ({}) THEN 0 ELSE 1 END)
        """.format(('%s,' * len(real_parents)).rstrip(','))
        cursor2.execute(update_query, real_parents)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
#!/usr/bin/env python
import sys
from collections import Counter

from nltk.tokenize import WordPunctTokenizer

filename = sys.argv[1]
with open(filename, 'r') as f:
    contents = f.read()

wptk = WordPunctTokenizer()
tokenized = wptk.tokenize(contents)

lower_list = [word.lower() for word in tokenized]

counts = Counter(lower_list)
#counts_list = sorted(lower_list, key=lambda x: (counts[x], x), reverse=True)
print(counts)