import io
import os
import zipfile

from nltk.tokenize.stanford import CoreNLPTokenizer


def main(options):
    # https://stackoverflow.com/questions/45663121/about-stanford-word-segmenter
    # curl -O https://nlp.stanford.edu/software/stanford-corenlp-full-2016-10-31.zip
    # unzip stanford-corenlp-full-2016-10-31.zip && cd stanford-corenlp-full-2016-10-31
    #
    # java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    #     -preload tokenize,ssplit,pos,lemma,ner,parse \
    #     -status_port 9001 -port 9001 -timeout 15000
    tokenizer = CoreNLPTokenizer('http://localhost:9001')
    for zip_source in options["zip_sources"]:
        outfilename = os.path.join(
            '..\\data', os.path.basename(zip_source)[:-4] + "_word_count.csv")
        outfilename = outfilename.replace('_text', '')
        with io.open(outfilename, 'w', encoding='utf8') as o:
            with zipfile.ZipFile(zip_source) as pope_zip:
                for filename in pope_zip.namelist():
                    with pope_zip.open(filename, 'r') as pope_file:
                        content = pope_file.read().decode('utf8')
                        try:
                            token_count = len(tokenizer.tokenize(content))
                        except Exception:
                            # fall back to whitespace splitting if the
                            # CoreNLP call fails
                            token_count = len(content.split())
                            print("Failed: {} {}".format(
                                filename, token_count))
                        o.write('{};{};{}\n'.format(
                            filename, len(content), token_count))
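# With the server from the comment block above running, a quick smoke test of
# the tokenizer might look like this (illustrative sketch; the sentence and
# the expected output are not from the original):
from nltk.tokenize.stanford import CoreNLPTokenizer

tokenizer = CoreNLPTokenizer('http://localhost:9001')
# PTB-style tokenization splits off punctuation and currency symbols, e.g.
# ['Good', 'muffins', 'cost', '$', '3.88', '.']
print(tokenizer.tokenize(u'Good muffins cost $3.88.'))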
def __clean_data_glove(data):
    """
    The function assumes the input is a tuple of tuples as returned from
    the db, cleans the data and returns a list of lists.
    The following cleaning steps are performed:
    1) multiple receivers are separated by '|'
    2) all strings are converted to lowercase
    3) the email body is cleaned using the Stanford tokenizer. It tokenizes
       the sentences into words; punctuation marks are separated and
       treated as individual words. This is compatible with the pre-trained
       GloVe model, which makes use of the same tokenizer.
    """
    # st = StanfordTokenizer(path_to_jar='../resources/stanford-corenlp-3.9.1.jar')
    st = CoreNLPTokenizer()
    clean_mail = lambda x: (' '.join(st.tokenize(x))).encode('ascii', 'ignore')
    cleaned_data = []
    for i, row in enumerate(data):
        if i % 1000 == 0:
            print 100 * (i + 0.0) / len(data), '% emails processed'
        try:
            cleaned_row = list(row)
            # replace the ',' separator in receivers with '|'
            cleaned_row[2] = cleaned_row[2].replace(',', '|')
            # convert the email body to lower case
            cleaned_row[3] = cleaned_row[3].lower()
            # put a space after full stops since nltk can't separate those
            cleaned_row[3] = re.sub(r'\.(?=[^ \W\d])', '. ', cleaned_row[3])
            # use the nltk Stanford tokenizer to clean the email body
            cleaned_mail_thread = clean_mail(cleaned_row[3])
            cleaned_row[3] = __truncate_email(cleaned_mail_thread)
            # drop the first (random id) column and append the rest to cleaned_data
            cleaned_data.append(cleaned_row[1:])
        except Exception as e:
            print i, row, e
    return cleaned_data
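# For reference, the full-stop regex above only inserts a space when the
# period is glued directly to a letter; decimals and already-spaced text are
# left alone (made-up input for illustration):
import re

re.sub(r'\.(?=[^ \W\d])', '. ', 'hi.there v1.2 end. ok')
# -> 'hi. there v1.2 end. ok'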
def get_entities(text):
    # return set(), set(), False
    sttok = CoreNLPTokenizer(url='http://localhost:9000')
    stner = CoreNLPNERTagger(url='http://localhost:9000')
    try:
        tokenized_text = sttok.tokenize(text)
        tagged_text = stner.tag(tokenized_text)
        people = []
        organizations = []
        # group consecutive tokens that share an NER tag into one chunk
        # (non_org / non_person are module-level blacklists, not shown)
        for tag, chunk in groupby(tagged_text, lambda x: x[1]):
            if tag == 'ORGANIZATION':
                organization = " ".join(w for w, t in chunk)
                if not any(no in organization for no in non_org):
                    organizations.append(organization)
            if tag == 'PERSON':
                person = " ".join(w for w, t in chunk)
                # drop blacklisted names and single-word person mentions
                if not any(np in person for np in non_person) and len(
                        person.split(' ')) != 1:
                    people.append(person)
        return set(organizations), set(people), False
    except Timeout:
        # the third element of the tuple flags a CoreNLP server timeout
        return None, None, True
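# For reference, the groupby call above collapses consecutive identically
# tagged tokens into chunks; with made-up tagged input:
from itertools import groupby

tagged = [('Barack', 'PERSON'), ('Obama', 'PERSON'),
          ('visited', 'O'), ('Google', 'ORGANIZATION')]
for tag, chunk in groupby(tagged, lambda x: x[1]):
    print(tag, [w for w, t in chunk])
# PERSON ['Barack', 'Obama']
# O ['visited']
# ORGANIZATION ['Google']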
def fetch_grams(url):
    "Fetch a webpage and return the text as unigrams, bigrams and trigrams"
    sttok = CoreNLPTokenizer('http://localhost:9001')
    r = requests.get(url, timeout=5)
    r.encoding = 'utf-8'
    html = r.text
    text = text_from_html(html)
    unigrams = sttok.tokenize(text)
    bigrams = bigrams_to_str(list(nltk.bigrams(unigrams)))
    trigrams = trigrams_to_str(list(nltk.trigrams(unigrams)))
    return (unigrams, bigrams, trigrams)
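# bigrams_to_str, trigrams_to_str and text_from_html are defined elsewhere;
# the n-gram helpers presumably join each tuple into one space-separated
# string, along these lines (a sketch, names inferred from the call sites):
def bigrams_to_str(bigrams):
    # [('new', 'york'), ...] -> ['new york', ...]
    return [' '.join(pair) for pair in bigrams]

def trigrams_to_str(trigrams):
    # [('new', 'york', 'city'), ...] -> ['new york city', ...]
    return [' '.join(triple) for triple in trigrams]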
def text_to_word_sequence_stanford(
        text,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" "):
    # TODO INSTALL CORENLP
    """Using the Stanford tokenizer, converts a text to a sequence of words
    (or tokens). This overrides the text_to_word_sequence function of
    keras.preprocessing.text. We monkeypatch the default tokenization
    method to match the tokenizer used on the pre-trained word embeddings.

    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out. FOR COMPATIBILITY WITH SKLEARN
        lower: Whether to convert the input to lowercase. FOR COMPATIBILITY WITH SKLEARN
        split: Sentence split marker (string). FOR COMPATIBILITY WITH SKLEARN

    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()
    tokens = CoreNLPTokenizer().tokenize(text)
    return tokens
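# The monkeypatching the docstring refers to is not shown; one way to wire it
# up, assuming the standard keras.preprocessing.text module, would be:
import keras.preprocessing.text as kpt

# Tokenizer.fit_on_texts and texts_to_sequences call the module-level
# text_to_word_sequence, so replacing the module attribute makes them use
# the Stanford tokenizer instead of the default filter/split logic.
kpt.text_to_word_sequence = text_to_word_sequence_stanford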
##Citation: https://pythonprogramming.net/combine-classifier-algorithms-nltk-tutorial/?completed=/sklearn-scikit-learn-nltk-tutorial/
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
import pickle
from nltk.tokenize.stanford import CoreNLPTokenizer
from nltk.tag.stanford import CoreNLPPOSTagger

sttok = CoreNLPTokenizer('http://127.0.0.1:9001')

def features(words):
    words = word_tokenize(words)
    scoreF = BigramAssocMeasures.chi_sq
    # bigram count
    n = 150
    bigrams = BCF.from_words(words).nbest(scoreF, n)
    return dict([word, True] for word in itertools.chain(words, bigrams))

def chinese_features(words):
    words = sttok.tokenize(words)
    scoreF = BigramAssocMeasures.chi_sq
    # bigram count
    n = 150
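    # The excerpt cuts off above; by analogy with features(), the rest of
    # chinese_features presumably continues along these lines (a sketch,
    # not the original code):
    bigrams = BCF.from_words(words).nbest(scoreF, n)
    return dict([word, True] for word in itertools.chain(words, bigrams))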
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

sttok.tokenize(u'你好')
# tag() expects a list of tokens; a raw string is iterated character by
# character, so the tokenized variants below are the intended usage.
stpos.tag(u'basf')
stpos.tag(sttok.tokenize(u'text'))
stner.tag(u'你好')
stner.tag(sttok.tokenize(u'你好'))
#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging
import sys
import json

import tornado.ioloop
import tornado.web

from src.common.UserMessage import UserMessage
from src.common.WtfCommandMessage import WtfCommandMessage
from nltk.tokenize.stanford import CoreNLPTokenizer

sttok = CoreNLPTokenizer('http://corenlp:9000')
fake_stats = {}

class WtfHandler(tornado.web.RequestHandler):
    def __init__(self, application, request, **kwargs):
        super().__init__(application, request, **kwargs)

    def post(self):
        data = tornado.escape.json_decode(self.request.body)
        wtf_command = WtfCommandMessage(data["word"], data["username"])
        self.set_status(200)

class AllMessagesHandler(tornado.web.RequestHandler):