import nltk
from collections import Counter

# remove_templates() and find_smiles() are helpers defined elsewhere in the same module.
def get_freq_dist(recv, send, fd=None, dcount_smile=None, classes=None):
    """
    Find the word frequency distribution and count smiles in the given text.

    Parameters
    ----------
    recv : multiprocessing.Connection
        Read only
    send : multiprocessing.Connection
        Write only
    fd : dict
        Word frequency distributions
    dcount_smile : dict
        Smile counters
    classes : tuple
        User classes to track (defaults to the tuple below)
    """
    stopwords = frozenset(nltk.corpus.stopwords.words('italian')).union(
        frozenset("[]':,(){}.?!*\"")).union(frozenset(("==", "--")))
    tokenizer = nltk.PunktWordTokenizer()

    if not classes:
        classes = ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal user',
                   'all')

    # prepare a dict of empty Counters, one for every class
    if not fd:
        fd = {cls: Counter() for cls in classes}
    if not dcount_smile:
        dcount_smile = {cls: Counter() for cls in classes}

    while 1:
        try:
            cls, msg = recv.recv()
        except TypeError:
            # end of input: a sentinel that cannot be unpacked (e.g. None)
            # raises TypeError; merge the per-class counters into 'all'
            for cls in set(classes).difference(('all', )):
                fd['all'].update(fd[cls])
                dcount_smile['all'].update(dcount_smile[cls])

            ## send word counters to the main process
            ## TODO: change it into Counter.most_common(1000)
            send.send([(cls, freq.most_common(1000))
                       for cls, freq in fd.iteritems()])
            # send smile counters to the main process
            send.send([(cls, counters.items())
                       for cls, counters in dcount_smile.iteritems()])
            return

        msg = remove_templates(msg.encode('utf-8'))

        count_smile = find_smiles(msg)
        dcount_smile[cls].update(count_smile)

        tokens = tokenizer.tokenize(nltk.clean_html(msg.lower()))
        tokens = [t for t in tokens if t not in stopwords]
        fd[cls].update(tokens)
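# --- usage sketch (illustrative, not part of the original module) ------------
# get_freq_dist is written to run as a worker process fed through a Pipe: a
# value that cannot be unpacked into (cls, msg), such as None, triggers the
# TypeError branch that merges the per-class counters and sends the results
# back.  The helper below and its Pipe wiring are assumptions for illustration.
def _example_run_freq_dist_worker():
    from multiprocessing import Pipe, Process

    work_recv, work_send = Pipe(duplex=False)        # worker reads work_recv
    result_recv, result_send = Pipe(duplex=False)    # worker writes result_send

    worker = Process(target=get_freq_dist, args=(work_recv, result_send))
    worker.start()

    work_send.send(('sysop', u'some talk page text :)'))
    work_send.send(None)                             # sentinel ends the loop

    word_counts = result_recv.recv()     # [(class, 1000 most common words), ...]
    smile_counts = result_recv.recv()    # [(class, smile counts), ...]
    worker.join()
    return word_counts, smile_counts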
import nltk

# extractQM, extractIngredient, calculateWeight and stripChars are helpers
# defined elsewhere in the same module.
def processOneIngredient(ing, ingDict, allIng, measures):
    solution = {}
    tokens = nltk.PunktWordTokenizer().tokenize(ing)
    if tokens == []:
        return allIng
    else:
        num, units = extractQM(tokens, ingDict, measures)
        desc, name, prep = extractIngredient(tokens, ingDict, units, num)
        weight = calculateWeight(name, ingDict, num, units)
        units = stripChars(units)
        desc = stripChars(desc)
        name = stripChars(name)
        prep = stripChars(prep)
        allIng[name] = {
            "name": name,
            "weight": weight,
            "quantity": num,
            "measurement": units,
            "description": desc,
            "preparation": prep
        }
        return allIng
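# --- usage sketch (illustrative only) -----------------------------------------
# processOneIngredient folds a single raw ingredient line into the accumulator
# dict allIng, keyed by the cleaned ingredient name.  extractQM,
# extractIngredient, calculateWeight and stripChars come from the same project,
# so the call below only illustrates the intended shapes; the ingredient string
# and the "basil" key are made-up examples.
#
# allIng = processOneIngredient("2 cups chopped fresh basil",
#                               ingDict, {}, measures)
# # allIng["basil"] -> {"name": ..., "weight": ..., "quantity": ...,
# #                     "measurement": ..., "description": ..., "preparation": ...}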
import re

import nltk
from unidecode import unidecode

# wikiApi and wikiParser are project-specific helpers assumed to be available.
def main(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    for page in j:
        t = wikiParser(j[page]['content'])
        for header in t.headers:
            try:
                stemmedHeader = stemmer.stem(header)
            except Exception, e:
                print str(e)
                header = unidecode(header)
                stemmedHeader = stemmer.stem(header)
            if stemmedHeader in wordDict:
                # increment the existing entry (the original reset it to 1)
                wordDict[stemmedHeader]['count'] += 1
            else:
                wordDict[stemmedHeader] = {'count': 1, 'form': stemmedHeader}
        text = t.text
        print type(text)
        tokens = [
            k.split('|')[0] for k in nltk.PunktWordTokenizer().tokenize(text)
            if re.match('[a-zA-Z]', k)
        ]
        words = [
            w.lower() for w in tokens if w.encode('utf-8').lower() not in
            nltk.corpus.stopwords.words(langMap[lang].lower())
        ]
        print len(words)
        for w in words:
            try:
                st = stemmer.stem(w)
            except Exception, e:
                # fall back to the transliterated form; the original `continue`
                # here threw the fallback stem away before it was counted
                st = stemmer.stem(unidecode(w))
                w = unidecode(w)
            if st in wordDict:
                wordDict[st]['count'] += 1
            else:
                wordDict[st] = {'count': 1, 'form': w}
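# --- usage sketch (illustrative only) -----------------------------------------
# main() fetches an article through the project's wikiApi/wikiParser wrappers,
# stems section headers and body words with the Snowball stemmer for the
# requested language, and tallies them in wordDict keyed by stem.  Only 'en'
# and 'es' are mapped in langMap, e.g.:
#
# main('Natural language processing', 'en')
# main('Procesamiento de lenguajes naturales', 'es')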
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import nltk

stopwords = nltk.corpus.stopwords.words('english')
tokenizer = nltk.PunktWordTokenizer()
stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()


def process_text(text):
    text = text.lower()

    # Tokenizing
    tokens = [
        token for token in tokenizer.tokenize(text) if token not in stopwords
    ]

    # Stemming
    tokens = map(stemmer.stem, tokens)

    # # Lemmatizing
    # tokens = map(lemmatizer.lemmatize, tokens)

    return tokens


if __name__ == '__main__':
    df = pd.read_csv('dataset.csv', nrows=80000, error_bad_lines=False)

    tagged_tokens = []
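# --- usage sketch (illustrative only) -----------------------------------------
# process_text lower-cases the input, drops English stopwords during
# tokenization and applies the Porter stemmer; the commented-out lemmatizer
# step can be swapped in instead.  The output below is indicative, since the
# exact tokens depend on the NLTK data installed:
#
# >>> process_text("The cats are running quickly through the gardens")
# ['cat', 'run', 'quickli', 'garden']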
def __init__(self, **kwargs):
    super(HistoryWordsPageProcessor, self).__init__(**kwargs)
    self.tokenizer = nltk.PunktWordTokenizer()
    self.stopwords = frozenset(nltk.corpus.stopwords.words('italian'))
    self.counter_desired_words = nltk.FreqDist()
LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"

#tagger = stanford.StanfordTagger('/media/data/NER/stanford/pos/models/left3words-wsj-0-18.tagger',
#                                 '/media/data/NER/stanford/pos/stanford-postagger.jar',
#                                 encoding='utf-8')
tagger = senna.SennaTagger('/media/petra/NER/senna-v2.0', encoding='utf-8')

i = 0
size = 0
samples = []
lock = Lock()

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tree_tokenizer = nltk.TreebankWordTokenizer()
word_punct_tokenizer = nltk.WordPunctTokenizer()
punkt_word_tokenizer = nltk.PunktWordTokenizer()
whitespace_tokenizer = nltk.WhitespaceTokenizer()


def tokenize(text):
    # split into sentences, drop empty strings, then word-tokenize each sentence
    sentences = filter(lambda x: x, sent_tokenizer.tokenize(text.strip()))
    tokens = [
        punkt_word_tokenizer.tokenize(sentence) for sentence in sentences
    ]
    return tokens


def process(labeled_comments):
    global i
    ids, comments, langs, users, page_ids, page_titles, times, levels = zip(
        *labeled_comments)
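# --- usage sketch (illustrative only) ------------------------------------------
# tokenize() splits raw text into sentences with the pre-trained Punkt model and
# then word-tokenizes each sentence, returning one token list per sentence; the
# exact tokens depend on PunktWordTokenizer's behaviour in the installed NLTK
# version.
#
# >>> tokens = tokenize("Hello world. This is a test.")
# # -> two inner lists, one per sentence, each holding that sentence's tokens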