Example #1
import io
import os
import zipfile

from nltk.tokenize.stanford import CoreNLPTokenizer


def main(options):

    # https://stackoverflow.com/questions/45663121/about-stanford-word-segmenter
    # curl -O https://nlp.stanford.edu/software/stanford-corenlp-full-2016-10-31.zip
    # unzip stanford-corenlp-full-2016-10-31.zip && cd stanford-corenlp-full-2016-10-31
    #
    # java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    #  -preload tokenize,ssplit,pos,lemma,ner,parse \
    #  -status_port 9001  -port 9001 -timeout 15000

    tokenizer = CoreNLPTokenizer('http://localhost:9001')
    for zip_source in options["zip_sources"]:
        outfilename = os.path.join(
            '..\\data',
            os.path.basename(zip_source)[:-4] + "_word_count.csv")
        outfilename = outfilename.replace('_text', '')
        with io.open(outfilename, 'w', encoding='utf8') as o:
            with zipfile.ZipFile(zip_source) as pope_zip:
                for filename in pope_zip.namelist():
                    with pope_zip.open(filename, 'r') as pope_file:
                        content = pope_file.read().decode('utf8')
                        try:
                            token_count = len(tokenizer.tokenize(content))
                        except Exception:
                            token_count = len(content.split())
                            print("Failed: {} {}".format(
                                filename, token_count))
                        o.write('{};{};{}\n'.format(filename, len(content),
                                                    token_count))
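
A minimal sketch of how main might be invoked, assuming the CoreNLP server from the comments above is already running on port 9001; the archive path below is a placeholder, not a file from the original project:

if __name__ == '__main__':
    main({
        "zip_sources": [
            '..\\data\\letters_text.zip',  # placeholder archive name
        ],
    })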
Example #2
import re

from nltk.tokenize.stanford import CoreNLPTokenizer


def __clean_data_glove(data):
    """
    The function assumes input as a tuple of tuples as returned from db , cleans the data and returns a list of list
    The following cleaning steps are performed
    1) multiple receivers are separated by '|'
    2) all the strings are converted into lowercase
    3) email body is cleaned using stanfordtokenizer. It tokenizes the scentences into words. Punctuations are separated
    and considered as individual words. This is compatible with word2vec glove model which makes use of the same
    tokenizer
    """
    # st = StanfordTokenizer(path_to_jar='../resources/stanford-corenlp-3.9.1.jar')
    st = CoreNLPTokenizer()
    clean_mail = lambda x: (' '.join(st.tokenize(x))).encode('ascii', 'ignore')
    cleaned_data = []
    for i, row in enumerate(data):
        if i % 1000 == 0:
            print('{}% emails processed'.format(100.0 * i / len(data)))
        try:
            cleaned_row = list(row)
            # replace ',' separator in receivers with '|'
            cleaned_row[2] = cleaned_row[2].replace(',', '|')
            # convert the email body to lower case
            cleaned_row[3] = cleaned_row[3].lower()
            # put space after full stops since nltk can't separate those
            cleaned_row[3] = re.sub(r'\.(?=[^ \W\d])', '. ', cleaned_row[3])
            # use nltk stanford tokenizer to clean the email body
            cleaned_mail_thread = clean_mail(cleaned_row[3])
            cleaned_row[3] = __truncate_email(cleaned_mail_thread)
            # remove the first random id column and append it to cleaned_data
            cleaned_data.append(cleaned_row[1:])
        except Exception as e:
            print(i, row, e)

    return cleaned_data
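
A short illustration of the body-cleaning step above, assuming a CoreNLP server is reachable at the tokenizer's default URL; the sample sentence is invented:

st = CoreNLPTokenizer()
body = "please review the q3 report.thanks, john."
body = re.sub(r'\.(?=[^ \W\d])', '. ', body)   # "report.thanks" -> "report. thanks"
print(' '.join(st.tokenize(body)))
# roughly: "please review the q3 report . thanks , john ."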
Example #3
from itertools import groupby

from nltk.tag.stanford import CoreNLPNERTagger
from nltk.tokenize.stanford import CoreNLPTokenizer
# Assumption: the Timeout caught below comes from requests, which nltk uses for the server calls.
from requests.exceptions import Timeout


def get_entities(text):
    # return set(), set(), False
    sttok = CoreNLPTokenizer(url='http://localhost:9000')
    stner = CoreNLPNERTagger(url='http://localhost:9000')

    try:
        tokenized_text = sttok.tokenize(text)
        tagged_text = stner.tag(tokenized_text)

        people = []
        organizations = []

        for tag, chunk in groupby(tagged_text, lambda x: x[1]):
            if tag == 'ORGANIZATION':
                organization = " ".join(w for w, t in chunk)

                if not any(no in organization for no in non_org):
                    organizations.append(organization)

            if tag == 'PERSON':
                person = " ".join(w for w, t in chunk)

                if not any(np in person for np in non_person) and len(
                        person.split(' ')) != 1:
                    people.append(person)

        return set(organizations), set(people), False
    except Timeout as e:
        return None, None, True
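
A minimal sketch of calling get_entities, assuming a CoreNLP server with the ner annotator is running on port 9000; non_org and non_person are blacklists the function expects at module level (their real contents are not shown here), and the sample sentence is invented:

non_org = []     # assumed placeholder for the module-level organization blacklist
non_person = []  # assumed placeholder for the module-level person blacklist

orgs, people, timed_out = get_entities(
    "Tim Cook met reporters at Apple headquarters in Cupertino.")
if not timed_out:
    print(orgs)    # a set of organization names
    print(people)  # a set of multi-word person names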
Example #4
import nltk
import requests

from nltk.tokenize.stanford import CoreNLPTokenizer


def fetch_grams(url):
    "Fetch a webpage and return the text as unigrams and bigrams"

    sttok = CoreNLPTokenizer('http://localhost:9001')
    r = requests.get(url, timeout=5)
    r.encoding = 'utf-8'
    html = r.text
    text = text_from_html(html)
    unigrams = sttok.tokenize(text)
    bigrams = bigrams_to_str(list(nltk.bigrams(unigrams)))
    trigrams = trigrams_to_str(list(nltk.trigrams(unigrams)))
    return (unigrams, bigrams, trigrams)
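
The helpers text_from_html, bigrams_to_str and trigrams_to_str are not part of this snippet; a rough sketch of what they might look like, purely as assumptions (e.g. using BeautifulSoup for the HTML-to-text step):

from bs4 import BeautifulSoup  # assumed dependency for this sketch


def text_from_html(html):
    # Hypothetical: strip markup and return the visible text.
    return BeautifulSoup(html, 'html.parser').get_text(separator=' ')


def bigrams_to_str(bigrams):
    # Hypothetical: join each (w1, w2) pair into one space-separated string.
    return [' '.join(pair) for pair in bigrams]


def trigrams_to_str(trigrams):
    # Hypothetical: join each (w1, w2, w3) triple into one space-separated string.
    return [' '.join(triple) for triple in trigrams]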
Example #5
from nltk.tokenize.stanford import CoreNLPTokenizer


def text_to_word_sequence_stanford(
        text,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" "):
    # TODO INSTALL CORENLP
    """Using the Stanford Tokenizer, converts a text to a sequence of words (or tokens).
    This overrides the text_to_word_sequences method of keras.preprocessing.text.
    We monkeypatch the default tokenization method to match the tokenizer used on the pre-trained word embeddings.
    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out. FOR COMP WITH SKLEARN
        lower: Whether to convert the input to lowercase. FOR COMP WITH SKLEARN
        split: Sentence split marker (string). FOR COMP WITH SKLEARN

    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()

    tokens = CoreNLPTokenizer().tokenize(text)
    return tokens
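
A sketch of the monkeypatch the docstring describes, assuming keras is installed and a CoreNLP server is reachable at the tokenizer's default URL; the patch target is inferred from the docstring, not shown in the original:

import keras.preprocessing.text as keras_text

# Replace keras' whitespace/filter-based tokenizer with the Stanford one,
# so code that looks it up via the module attribute picks up the new function.
keras_text.text_to_word_sequence = text_to_word_sequence_stanford
print(keras_text.text_to_word_sequence("Hello, world!"))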
Example #6
## Citation: https://pythonprogramming.net/combine-classifier-algorithms-nltk-tutorial/?completed=/sklearn-scikit-learn-nltk-tutorial/

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
import pickle
from nltk.tokenize.stanford import CoreNLPTokenizer
from nltk.tag.stanford import CoreNLPPOSTagger


sttok = CoreNLPTokenizer('http://127.0.0.1:9001')

def features(words):
	words = word_tokenize(words)
	scoreF = BigramAssocMeasures.chi_sq
	#bigram count
	n = 150
	bigrams = BCF.from_words(words).nbest(scoreF, n)

	return dict([word,True] for word in itertools.chain(words, bigrams))

def chinese_features(words):
	words = sttok.tokenize(words)
	scoreF = BigramAssocMeasures.chi_sq
	#bigram count
	n = 150
	# assumed completion, mirroring features() above
	bigrams = BCF.from_words(words).nbest(scoreF, n)

	return dict([word, True] for word in itertools.chain(words, bigrams))
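
A sketch of how feature dicts like these feed into NLTK's NaiveBayesClassifier; the training sentences and labels below are invented, not taken from the cited tutorial:

train_set = [
    (features("this movie was great fun"), "pos"),
    (features("a dull and boring film"), "neg"),
]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(features("great fun")))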
Example #7
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

# Tokenize a Chinese greeting into a list of tokens.
sttok.tokenize(u'你好')

# Note: tag() expects a list of tokens; passing a raw string tags each character.
stpos.tag(u'basf')

# The usual pipeline: tokenize first, then POS-tag the token list.
stpos.tag(sttok.tokenize(u'text'))

# Same caveat as above: a raw string is treated as a sequence of characters.
stner.tag(u'你好')

# NER-tag the tokenized text.
stner.tag(sttok.tokenize(u'你好'))
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-

import logging
import sys
import json

import tornado.ioloop
import tornado.web

from src.common.UserMessage import UserMessage
from src.common.WtfCommandMessage import WtfCommandMessage

from nltk.tokenize.stanford import CoreNLPTokenizer

sttok = CoreNLPTokenizer('http://corenlp:9000')

fake_stats = {}


class WtfHandler(tornado.web.RequestHandler):
    def __init__(self, application, request, **kwargs):
        super().__init__(application, request, **kwargs)

    def post(self):
        data = tornado.escape.json_decode(self.request.body)
        wtf_command = WtfCommandMessage(data["word"], data["username"])
        self.set_status(200)


class AllMessagesHandler(tornado.web.RequestHandler):