def getTokens(texts):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    allTokens = []
    if not isinstance(texts, list):
        texts = [texts]
    for s in texts:
        allTokens.extend(tokenizer.tokenize(s))
    # keep alphanumeric tokens longer than 2 chars that are not stopwords
    allTokens_2 = [t.lower() for t in allTokens
                   if len(t) > 2 and t.isalnum() and t not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_2]
    # drop stems that themselves turn out to be stopwords
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
def get_words(klass, text):
    tokenizer = WordPunctTokenizer()
    words = Counter()
    for word in tokenizer.tokenize(text):
        word = word.lower()
        if len(word) > 2 and word not in STOPWORDS:
            words[word] += 1
    return words
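# Usage sketch for get_words above (assumptions: STOPWORDS is a set of lowercase
# English stopwords and Counter comes from collections, neither of which the
# snippet itself defines; the klass argument is unused, so None is passed here).
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

counts = get_words(None, "The quick brown fox jumps over the lazy dog")
# stopwords ('the', 'over') and tokens of length <= 2 are dropped, leaving
# 'quick', 'brown', 'fox', 'jumps', 'lazy' and 'dog' with a count of 1 each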
def __init__(self, keywords):
    self.stemmer = PorterStemmer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.tokenizer = WordPunctTokenizer()
    self.score = 0
    self.keywords = keywords
def getEntity(self, word, originalcase):
    tokenizer = WordPunctTokenizer()
    for sentence in self.raw_sentences:
        words = tokenizer.tokenize(sentence.strip())
        if originalcase in words:
            entity = self.getEntityFromSentence(originalcase, words)
            if len(entity) > 1:
                return " ".join(entity)
    return originalcase
def getDocTokens(docText):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    docTokens = tokenizer.tokenize(docText)
    allTokens_2 = [t.lower() for t in docTokens if len(t) > 2]
    allTokens_an = [t for t in allTokens_2 if t.isalnum()]
    allTokens_stw = [t for t in allTokens_an if t not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
def getTokenizedDocs(docs):
    docs_tokens = []
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    for text in docs:
        tokens = tokenizer.tokenize(text)
        clean = [token for token in tokens if token.isalnum()]
        clean = [token.lower() for token in clean if token.lower() not in stopwords]
        clean = [token for token in clean if len(token) > 2]
        final = [stemmer.stem(word) for word in clean]
        docs_tokens.append(final)
    return docs_tokens
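# Usage sketch for getTokenizedDocs above (assumes the snippet's imports are in
# scope): each document is reduced to lowercased, stopword-free, stemmed
# alphanumeric tokens longer than 2 characters.
docs = [
    "Crawlers download and index web pages.",
    "The indexed pages are ranked by relevance.",
]
print(getTokenizedDocs(docs))
# e.g. [['crawler', 'download', 'index', 'web', 'page'],
#       ['index', 'page', 'rank', 'relev']]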
def getTokens(doc):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    webpage = open(doc, 'r').read()
    # nltk.clean_html() only exists in old NLTK releases; NLTK 3+ removed it
    # and recommends an HTML parser such as BeautifulSoup instead.
    text = nltk.clean_html(webpage)
    tokens = tokenizer.tokenize(text)
    clean = [token.lower() for token in tokens if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    return final
def convertDoctoTFIDF(self, doc):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords_e = stopwords.words('english')
    tokens = tokenizer.tokenize(doc)
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords_e]
    clean = [token for token in clean if len(token) > 2]
    final_doc = [stemmer.stem(word) for word in clean]
    doc_tfidf = []
    words = self.model[1]
    for word in words:
        tf = final_doc.count(word)
        doc_tfidf.append((tf, word))
    return doc_tfidf
def __init__(self, data_dir, data_name, split_size, max_vocab_size,
             max_enc_utt_len, max_dec_word_len, line_threshold):
    """
    :param line_threshold: how many lines are merged into one encoder sentence
    :param split_size: ratio of training:valid:test
    """
    self._data_dir = data_dir
    self._data_name = data_name
    self._cache_dir = os.path.join(data_dir, "utt_seq_split")
    if not os.path.exists(self._cache_dir):
        os.mkdir(self._cache_dir)
    self.tokenizer = WordPunctTokenizer().tokenize
    self.split_size = split_size
    self.max_vocab_size = max_vocab_size
    self.max_enc_utt_len = max_enc_utt_len
    self.max_dec_word_len = max_dec_word_len
    self.line_threshold = line_threshold

    utt_features = self.load_data()
    if utt_features is None:
        with open(os.path.join(data_dir, data_name), "rb") as f:
            utt_features = self._parse_file(f.readlines())
        self._create_corpus(utt_features, split_size)

    # Unlike word2seq, the encoder uses a fixed history, so train/valid/test y
    # do not need to be clipped to a maximum length here.

    # load the vocabulary
    self.vocab = self.load_vocab("vocab.txt")
def get_texts(self):
    """
    Parse documents from the .cor file provided in the constructor.
    Lowercase each document and ignore some stopwords.

    .cor format: one document per line, words separated by whitespace.
    """
    tokenizer = WordPunctTokenizer()
    for doc in self.getstream():
        yield [word for word in tokenizer.tokenize(doc.lower()) if word_ok(word)]
def getTokenizedDoc(doc):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    # treat common Twitter boilerplate terms as stopwords too
    stopwords.extend(["com", "http", "retweet", "tweet", "twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    return final
def cleanDoc(doc):
    stopset = stop_words
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = WordPunctTokenizer().tokenize(doc)
    clean = [token.lower() for token in tokens
             if token.lower() not in stopset and len(token) > 2]
    stemmed = [stemmer.stem(word) for word in clean]
    final = [lemmatizer.lemmatize(word) for word in stemmed]
    return final
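# Usage sketch for cleanDoc above (assumption: stop_words is a collection of
# lowercase English stopwords, which the snippet does not define). Note that
# stemming runs before lemmatization, so the lemmatizer mostly sees stems.
stop_words = set(nltk.corpus.stopwords.words('english'))

print(cleanDoc("The runners were running in local races"))
# e.g. ['runner', 'run', 'local', 'race']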
def extract_words(self, wid):
    """Updates db with previously unseen words and lemmas, and page unigrams"""
    words_file = gzip.open(self.words_file, 'a')
    page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
    w = WordPunctTokenizer()
    qi = QueryIterator(get_config(),
                       {'query': 'wid:%s AND iscontent:true' % str(wid),
                        'fields': 'id, wid, pageid, html_en',
                        'sort': 'id asc'})
    print('starting extraction for wiki %s...' % str(wid))
    for doc in qi:
        print('extracting words for %s...' % doc['id'])
        page_file.write('\t%s\n' % doc['pageid'])
        for word in w.tokenize(doc.get('html_en', '').lower()):
            # assign an id to each previously unseen word
            if word not in self.words:
                self.words[word] = self.counter
                words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                self.counter += 1
            # write the word id for every token on the page
            page_file.write('%i\n' % self.words.get(word, 0))
    page_file.close()
    words_file.close()
def rouge(candidate, reference, n=2, verbose=False):
    """A basic implementation of ROUGE-N: the n-gram recall of a candidate
    summary against a reference summary.
    """
    tokenizer = WordPunctTokenizer()
    candidate = tokenizer.tokenize(candidate.lower())
    reference = tokenizer.tokenize(reference.lower())
    c_ngrams = set(ngrams(candidate, n))
    r_ngrams = set(ngrams(reference, n))
    cr_ngrams = [g for g in c_ngrams if g in r_ngrams]
    rouge_n = len(cr_ngrams) / len(r_ngrams)
    if verbose:
        print("{:d} matching {:d}-grams out of {:d} total.".format(
            len(cr_ngrams), n, len(r_ngrams)))
        print(cr_ngrams)
        print("ROUGE-{:d}: {:0.3f}".format(n, rouge_n))
    return rouge_n
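# Usage sketch for rouge above (assumption: ngrams comes from nltk.util, which
# the snippet does not import). ROUGE-N here is recall over *distinct* n-grams,
# and a reference shorter than n tokens would raise ZeroDivisionError.
score = rouge("the cat sat on the mat",
              "the cat lay on the mat", n=2)
# 3 of the reference's 5 distinct bigrams appear in the candidate, so score == 0.6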
def getTokens(texts):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    allTokens = []
    if not isinstance(texts, list):
        texts = [texts]
    for s in texts:
        allTokens.extend(tokenizer.tokenize(s))
    allTokens_2 = [t.lower() for t in allTokens if len(t) > 2]
    allTokens_an = [t for t in allTokens_2 if t.isalnum()]
    allTokens_stw = [t for t in allTokens_an if t not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
def document_as_words(document):
    """
    There is probably an NLTK function somewhere that does this already, but I
    couldn't find it. This converts a single document into a list of words
    which you can then use with the rest of these functions to get a feature
    list you can classify.
    """
    stringbuf = StringIO.StringIO(document)
    word_tokenizer = WordPunctTokenizer()
    para_tokenizer = read_blankline_block
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    words = []
    for para in para_tokenizer(stringbuf):
        for sent in sent_tokenizer.tokenize(para):
            for word in word_tokenizer.tokenize(sent):
                words.append(word)
    return words
class TrollAnalyzer:
    def __init__(self, model):
        self.model = model
        self.tokenizer = WordPunctTokenizer()
        self.scale_score = lambda k: k * 2 - 1  # map probability [0, 1] to [-1, 1]

    def analyze(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        predictions = {
            word: self.scale_score(self.model.predict_proba([word])[0][1])
            for word in words
        }
        total = self.scale_score(self.model.predict_proba([sentence])[0][1])
        return {"master": total, "tokenized": predictions}
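# Usage sketch for TrollAnalyzer above. The class assumes `model` exposes
# predict_proba() on raw strings; a scikit-learn Pipeline with a text
# vectorizer is one way to satisfy that (the tiny training set below is made up).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), LogisticRegression())
model.fit(["you are awful", "have a nice day"], [1, 0])  # 1 = troll, 0 = not

analyzer = TrollAnalyzer(model)
result = analyzer.analyze("you are nice")
# result["master"] is the sentence-level score scaled to [-1, 1];
# result["tokenized"] maps each token to its own scaled score.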
def get_nouns(raw_text, site):
    """ Returns a list of all the nouns or noun phrases found in the given text. """
    nouns = []
    try:
        cleaned_text = format_text_for_NER(raw_text, site)
        text_tokens = WordPunctTokenizer().tokenize(cleaned_text)
        for token_and_POS in nltk.pos_tag(text_tokens):
            try:
                POS = token_and_POS[1]
                if POS in ('NN', 'NNS', 'NNP', 'NNPS', 'NP'):
                    nouns.append(token_and_POS[0])
            except:
                continue
    except:
        return nouns
    return nouns
def format_doc_for_sim_scoring(raw_doc):
    """
    Tokenizes and filters/formats the words in the given document for use
    during similarity measurement. This method should be used both when a doc
    goes into the corpus and when a doc is compared to another doc for
    similarity.
    @return: a whitespace-joined string of the cleaned, stemmed tokens
    """
    stopset = set(stopwords.words('english'))
    stemmer = nltk.PorterStemmer()
    tokens = WordPunctTokenizer().tokenize(raw_doc)
    # strip punctuation characters; purely punctuation tokens become '' and are
    # dropped by the length filter below
    non_punct = [''.join(ch for ch in token if ch not in string.punctuation)
                 for token in tokens]
    clean_tokens = [token.lower() for token in non_punct
                    if token.lower() not in stopset and len(token) > 2]
    stemmed_tokens = [stemmer.stem(word) for word in clean_tokens]
    return ' '.join(stemmed_tokens).decode('latin-1')
class Scorer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self, doc):
        tokens = self.tokenizer.tokenize(doc)
        clean = [token.lower() for token in tokens
                 if token.lower() not in self.stopwords and len(token) > 2]
        final = [self.stemmer.stem(word) for word in clean]
        return final

    # NOTE: this second __init__ overrides the zero-argument one above.
    def __init__(self, keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords
        # Previously the keywords were seeded from a list of URLs:
        # for url in seedUrls:
        #     page = Webpage(url)
        #     data = self.cleanDoc(page.text)
        #     for d in data:
        #         self.keywords.append(d)

    # Score a URL by the fraction of keywords that occur in its text.
    def calculate_score(self, url):
        words = url.getAllText().split()
        for w in self.keywords:
            if w in words:
                self.score += 1
        self.score = self.score / float(len(self.keywords))
        return self.score
def __init__(self, data_dir, data_name, split_size, max_vocab_size,
             max_enc_len, max_dec_len, line_thres):
    """
    :param line_thres: how many lines are merged into one encoder sentence
    :param split_size: ratio of training:valid:test
    """
    self._data_dir = data_dir
    self._data_name = data_name
    self._cache_dir = os.path.join(
        data_dir, data_name.replace(".txt", "_") + "word_seq_split")
    if not os.path.exists(self._cache_dir):
        os.mkdir(self._cache_dir)
    self.tokenizer = WordPunctTokenizer().tokenize
    self.line_threshold = line_thres
    self.split_size = split_size
    self.max_vocab_size = max_vocab_size
    self.max_enc_len = max_enc_len
    self.max_dec_len = max_dec_len

    # try to load from an existing cache; otherwise parse the raw file
    if not self.load_data():
        with open(os.path.join(data_dir, data_name), "rb") as f:
            self._parse_file(f.readlines(), split_size)

    # clip data to the maximum encoder/decoder lengths
    self.train_x, self.train_y = self.clip_to_max_len(self.train_x, self.train_y)
    self.valid_x, self.valid_y = self.clip_to_max_len(self.valid_x, self.valid_y)
    self.test_x, self.test_y = self.clip_to_max_len(self.test_x, self.test_y)

    # load the vocabulary
    self.vocab = self.get_vocab()

    self.print_stats("TRAIN", self.train_x, self.train_y)
    self.print_stats("VALID", self.valid_x, self.valid_y)
    self.print_stats("TEST", self.test_x, self.test_y)
review = ' '.join(review)
corpus.append(review)

# Find the unique words in the corpus.
count = ' '.join(str(elm) for elm in corpus)

from nltk.tokenize.regexp import WordPunctTokenizer

# WordPunctTokenizer splits the joined corpus string into tokens.
my_toks = WordPunctTokenizer().tokenize(count)
len(my_toks)       # total token count

my_vocab = set(my_toks)
len(my_vocab)      # unique word count, e.g. 6087

# Logistic regression
from sklearn.linear_model import LogisticRegression
vectorizer = TfidfVectorizer()
#!/usr/bin/env python
import sys
from collections import Counter

from nltk.tokenize.regexp import WordPunctTokenizer

filename = sys.argv[1]
with open(filename, 'r') as f:
    contents = f.read()

wptk = WordPunctTokenizer()
tokenized = wptk.tokenize(contents)

lower_list = [word.lower() for word in tokenized]

counts = Counter(lower_list)
print(counts)
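# Example invocation for the script above (assuming it is saved as wordcount.py):
#
#   python wordcount.py article.txt
#
# This prints a Counter of lowercased WordPunct tokens, punctuation included,
# with the most frequent tokens listed first.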
def stem_text(text):
    tokenizer = WordPunctTokenizer()
    stemmer = SnowballStemmer('french')
    liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    return ' '.join(liste_racines)
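# Usage sketch for stem_text above: SnowballStemmer('french') maps inflected
# French forms onto shared stems (the exact stem strings depend on the Snowball
# rules, so the comment below is indicative rather than exact).
print(stem_text("Les maisons et la maison"))
# the plural "maisons" and the singular "maison" should share the same stem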
def __init__(self):
    self.stemmer = PorterStemmer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.tokenizer = WordPunctTokenizer()
    self.keywords = []
    self.score = 0
    document += sentence + '\n'
    # Adjust the number to make sure a document doesn't exceed 15000 tokens
    if i % 500 == 0:
        documents.append(document)
        document = '\n'
documents.append(document)

print(f'Number of documents: {len(documents)}\n')

# Check token count for every document (should not be more than 15000)
for i, doc in enumerate(documents):
    tokens = WordPunctTokenizer().tokenize(doc)
    print(f'Doc {i}, token count: {len(tokens)}')

# Feed each text document separately to BERN
for i, doc in enumerate(tqdm(documents), 14):
    BERN_annotated_df = BERN_annotation(params, i, doc, sentences)
    # If the csv doesn't exist yet create it, then append on later iterations
    with open('./output/subtitles/BERN_annotated_subtitles.csv', 'a') as f:
        BERN_annotated_df.to_csv(f, header=f.tell() == 0, index=False, sep=';')
    time.sleep(5)
def __init__(self, nltkTokenizer=WordPunctTokenizer()):
    self.nltkTokenizer = nltkTokenizer
    - name abbreviations: E. Talvik ; M. Unt

See https://github.com/estnltk/estnltk/issues/25 for more info.
"""
from __future__ import unicode_literals, print_function, absolute_import

from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.tokenize.api import StringTokenizer

import regex as re

EST_ALPHA_LOWER = 'abcdefghijklmnoprsšzžtuvwõäöüxyz'
EST_ALPHA_UPPER = EST_ALPHA_LOWER.upper()
EST_ALPHA = EST_ALPHA_LOWER + EST_ALPHA_UPPER

wptokenizer = WordPunctTokenizer()
digits = re.compile(r'\d+')

# Listing of different hyphen/minus/dash symbols in utf8;
# it is likely that these symbols are used interchangeably with the regular hyphen symbol.
hypens_dashes = re.compile(
    '^(-|\xad|\u2212|\uFF0D|\u02D7|\uFE63|\u002D|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015)$'
)


def join_ordinals(left, right):
    return right == '.' and digits.match(left) is not None


def join_hyphen(left, right):
    return hypens_dashes.match(left) or hypens_dashes.match(right)
        return False


current_key = None
qtokens = None
reduced_keys = {}

for line in sys.stdin:
    query, tweet = line.split('\t', 1)
    if current_key != query:
        for last_tweet in reduced_keys:
            # TODO Reduce Processing Phase 2
            # more processing against key's corpus
            print(json.dumps(last_tweet))
        current_key = query
        qwords = WordPunctTokenizer().tokenize(query)
        qtokens = [w for w in qwords if not remove_if_punct_or_stopword(w)]
        reduced_keys = []

    tweet = json.loads(tweet)

    # match query terms against the tweet
    matches = 0
    for token in qtokens:
        if token in tweet['parsed']:
            matches += 1

    tweet['tokens'] = qtokens
    tweet['matches'] = matches
    tweet['cxScore'] = float(matches) / len(qtokens)
    tweet['totScore'] = tweet['cxScore'] + tweet['qlScore']
    # TODO Reduce Processing Phase 1
def tokenize_text(self, text):
    words = WordPunctTokenizer().tokenize(text)
    return words
def handle(self, *args, **options):
    super(Command, self).handle(self, *args, **options)
    for database in self.selected_dbs:
        print("Processing database " + database)
        cursor = connections[database].cursor()

        get_authors_query = """
            SELECT name, long_name
            FROM author
        """
        get_comments_query = """
            SELECT id, content
            FROM comment
        """

        tokenizer = WordPunctTokenizer()

        self.stdout.write("Grabbing authors...")
        authors = []
        cursor.execute(get_authors_query)
        self.pbar_setup(maxval=cursor.rowcount)
        for row in dictfetch(cursor):
            authors.append(row['name'])
            if row['long_name'] != "":
                authors.append(row['long_name'])
            self.pbar_increment()
        self.pbar_destroy()

        self.stdout.write("Sorting authors...")
        authors.sort()

        self.stdout.write("Determining real parents...")
        real_parents = []
        cursor1 = connections[database].cursor()
        cursor1.execute(get_comments_query)
        self.pbar_setup(maxval=cursor1.rowcount)
        for row in dictfetch(cursor1):
            tokens = tokenizer.tokenize(row['content'])
            if len(tokens) > 0:
                if tokens[0] == '@':
                    real_parents.append(int(row['id']))
                else:
                    i = bisect_left(authors, tokens[0])
                    if i != len(authors) and authors[i] == tokens[0]:
                        real_parents.append(int(row['id']))
            self.pbar_increment()
        self.pbar_destroy()

        self.stdout.write("Non-real-parents found: {}".format(len(real_parents)))

        cursor2 = connections[database].cursor()
        update_query = """
            UPDATE comment
            SET real_parent = (CASE WHEN id in ({}) THEN 0 ELSE 1 END)
        """.format(('%s,' * len(real_parents)).rstrip(','))
        cursor2.execute(update_query, real_parents)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def __init__(self):
    NltkTokenizer.__init__(self)
    _WordPunctTokenizer.__init__(self)
#!/usr/bin/env python3.4
import os
import random
from collections import Counter

from nltk.tokenize.regexp import WordPunctTokenizer

nltk_tokenizer = WordPunctTokenizer()


def tokenize(text):
    return nltk_tokenizer.tokenize(text.lower())


def loadcorpus():
    """Load the corpus of abstracts and documents."""
    dirname = "cmplg-txt"
    abstracts = {}
    documents = {}
    for fn in sorted(os.listdir(dirname)):
        docid = fn.split("-")[0]
        if fn.endswith("abstract.txt"):
            with open(os.path.join(dirname, fn), 'r') as f:
                abstracts[docid] = f.read()
        if fn.endswith("sentences.txt"):
            with open(os.path.join(dirname, fn), 'r') as f:
                documents[docid] = f.readlines()
    return abstracts, documents
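# Quick check of the tokenize helper above: WordPunctTokenizer splits on the
# pattern \w+|[^\w\s]+, so punctuation and clitics become separate tokens.
print(tokenize("Don't over-fit!"))
# -> ['don', "'", 't', 'over', '-', 'fit', '!']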
def __init__(self, model):
    self.model = model
    self.tokenizer = WordPunctTokenizer()
    self.scale_score = lambda k: k * 2 - 1