Example #1
import io
import os
import zipfile

from nltk.tokenize.stanford import CoreNLPTokenizer


def main(options):

    # https://stackoverflow.com/questions/45663121/about-stanford-word-segmenter
    # curl -O https://nlp.stanford.edu/software/stanford-corenlp-full-2016-10-31.zip
    # unzip stanford-corenlp-full-2016-10-31.zip && cd stanford-corenlp-full-2016-10-31
    #
    # java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
    #  -preload tokenize,ssplit,pos,lemma,ner,parse \
    #  -status_port 9001  -port 9001 -timeout 15000

    tokenizer = CoreNLPTokenizer('http://localhost:9001')
    for zip_source in options["zip_sources"]:
        outfilename = os.path.join(
            '..\\data',
            os.path.basename(zip_source)[:-4] + "_word_count.csv")
        outfilename = outfilename.replace('_text', '')
        with io.open(outfilename, 'w', encoding='utf8') as o:
            with zipfile.ZipFile(zip_source) as pope_zip:
                for filename in pope_zip.namelist():
                    with pope_zip.open(filename, 'r') as pope_file:
                        content = pope_file.read().decode('utf8')
                        try:
                            token_count = len(tokenizer.tokenize(content))
                        except Exception:
                            token_count = len(content.split())
                            print("Failed: {} {}".format(
                                filename, token_count))
                        o.write('{};{};{}\n'.format(filename, len(content),
                                                    token_count))
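
A minimal sketch of how main might be invoked, assuming the CoreNLP server from the comments above is already running on port 9001; the archive path below is a placeholder, not a file from the original project:

if __name__ == '__main__':
    main({
        "zip_sources": [
            '..\\data\\letters_text.zip',  # placeholder archive name
        ],
    })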
Example #2
import re

from nltk.tokenize.stanford import CoreNLPTokenizer


def __clean_data_glove(data):
    """
    The function assumes input as a tuple of tuples as returned from db , cleans the data and returns a list of list
    The following cleaning steps are performed
    1) multiple receivers are separated by '|'
    2) all the strings are converted into lowercase
    3) email body is cleaned using stanfordtokenizer. It tokenizes the scentences into words. Punctuations are separated
    and considered as individual words. This is compatible with word2vec glove model which makes use of the same
    tokenizer
    """
    # st = StanfordTokenizer(path_to_jar='../resources/stanford-corenlp-3.9.1.jar')
    st = CoreNLPTokenizer()
    clean_mail = lambda x: (' '.join(st.tokenize(x))).encode('ascii', 'ignore')
    cleaned_data = []
    for i, row in enumerate(data):
        if i % 1000 == 0:
            print('{}% emails processed'.format(100.0 * i / len(data)))
        try:
            cleaned_row = list(row)
            # replace ',' separator in receivers with '|'
            cleaned_row[2] = cleaned_row[2].replace(',', '|')
            # convert the email body to lower case
            cleaned_row[3] = cleaned_row[3].lower()
            # put space after full stops since nltk can't separate those
            cleaned_row[3] = re.sub(r'\.(?=[^ \W\d])', '. ', cleaned_row[3])
            # use nltk stanford tokenizer to clean the email body
            cleaned_mail_thread = clean_mail(cleaned_row[3])
            cleaned_row[3] = __truncate_email(cleaned_mail_thread)
            # remove the first random id column and append it to cleaned_data
            cleaned_data.append(cleaned_row[1:])
        except Exception as e:
            print(i, row, e)

    return cleaned_data
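
A short illustration of the body-cleaning step above, assuming a CoreNLP server is reachable at the tokenizer's default URL; the sample sentence is invented:

st = CoreNLPTokenizer()
body = "please review the q3 report.thanks, john."
body = re.sub(r'\.(?=[^ \W\d])', '. ', body)   # "report.thanks" -> "report. thanks"
print(' '.join(st.tokenize(body)))
# roughly: "please review the q3 report . thanks , john ."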
Example #3
from itertools import groupby

from nltk.tag.stanford import CoreNLPNERTagger
from nltk.tokenize.stanford import CoreNLPTokenizer
# Assumption: the Timeout caught below comes from requests, which nltk uses for the server calls.
from requests.exceptions import Timeout


def get_entities(text):
    # return set(), set(), False
    sttok = CoreNLPTokenizer(url='http://localhost:9000')
    stner = CoreNLPNERTagger(url='http://localhost:9000')

    try:
        tokenized_text = sttok.tokenize(text)
        tagged_text = stner.tag(tokenized_text)

        people = []
        organizations = []

        for tag, chunk in groupby(tagged_text, lambda x: x[1]):
            if tag == 'ORGANIZATION':
                organization = " ".join(w for w, t in chunk)

                if not any(no in organization for no in non_org):
                    organizations.append(organization)

            if tag == 'PERSON':
                person = " ".join(w for w, t in chunk)

                if not any(np in person for np in non_person) and len(
                        person.split(' ')) != 1:
                    people.append(person)

        return set(organizations), set(people), False
    except Timeout as e:
        return None, None, True
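
A minimal sketch of calling get_entities, assuming a CoreNLP server with the ner annotator is running on port 9000; non_org and non_person are blacklists the function expects at module level (their real contents are not shown here), and the sample sentence is invented:

non_org = []     # assumed placeholder for the module-level organization blacklist
non_person = []  # assumed placeholder for the module-level person blacklist

orgs, people, timed_out = get_entities(
    "Tim Cook met reporters at Apple headquarters in Cupertino.")
if not timed_out:
    print(orgs)    # a set of organization names
    print(people)  # a set of multi-word person names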
Example #4
import nltk
import requests

from nltk.tokenize.stanford import CoreNLPTokenizer


def fetch_grams(url):
    "Fetch a webpage and return the text as unigrams and bigrams"

    sttok = CoreNLPTokenizer('http://localhost:9001')
    r = requests.get(url, timeout=5)
    r.encoding = 'utf-8'
    html = r.text
    text = text_from_html(html)
    unigrams = sttok.tokenize(text)
    bigrams = bigrams_to_str(list(nltk.bigrams(unigrams)))
    trigrams = trigrams_to_str(list(nltk.trigrams(unigrams)))
    return (unigrams, bigrams, trigrams)
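
The helpers text_from_html, bigrams_to_str and trigrams_to_str are not part of this snippet; a rough sketch of what they might look like, purely as assumptions (e.g. using BeautifulSoup for the HTML-to-text step):

from bs4 import BeautifulSoup  # assumed dependency for this sketch


def text_from_html(html):
    # Hypothetical: strip markup and return the visible text.
    return BeautifulSoup(html, 'html.parser').get_text(separator=' ')


def bigrams_to_str(bigrams):
    # Hypothetical: join each (w1, w2) pair into one space-separated string.
    return [' '.join(pair) for pair in bigrams]


def trigrams_to_str(trigrams):
    # Hypothetical: join each (w1, w2, w3) triple into one space-separated string.
    return [' '.join(triple) for triple in trigrams]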
Example #5
from nltk.tokenize.stanford import CoreNLPTokenizer


def text_to_word_sequence_stanford(
        text,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" "):
    # TODO INSTALL CORENLP
    """Using the Stanford Tokenizer, converts a text to a sequence of words (or tokens).
    This overrides the text_to_word_sequences method of keras.preprocessing.text.
    We monkeypatch the default tokenization method to match the tokenizer used on the pre-trained word embeddings.
    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out. FOR COMP WITH SKLEARN
        lower: Whether to convert the input to lowercase. FOR COMP WITH SKLEARN
        split: Sentence split marker (string). FOR COMP WITH SKLEARN

    # Returns
        A list of words (or tokens).
    """
    if lower:
        text = text.lower()

    tokens = CoreNLPTokenizer().tokenize(text)
    return tokens
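
A sketch of the monkeypatch the docstring describes, assuming keras is installed and a CoreNLP server is reachable at the tokenizer's default URL; the patch target is inferred from the docstring, not shown in the original:

import keras.preprocessing.text as keras_text

# Replace keras' whitespace/filter-based tokenizer with the Stanford one,
# so code that looks it up via the module attribute picks up the new function.
keras_text.text_to_word_sequence = text_to_word_sequence_stanford
print(keras_text.text_to_word_sequence("Hello, world!"))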
Example #6
## Citation: https://pythonprogramming.net/combine-classifier-algorithms-nltk-tutorial/?completed=/sklearn-scikit-learn-nltk-tutorial/

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.classify.util as util
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder as BCF
import itertools
import pickle
from nltk.tokenize.stanford import CoreNLPTokenizer
from nltk.tag.stanford import CoreNLPPOSTagger


sttok = CoreNLPTokenizer('http://127.0.0.1:9001')

def features(words):
	words = word_tokenize(words)
	scoreF = BigramAssocMeasures.chi_sq
	#bigram count
	n = 150
	bigrams = BCF.from_words(words).nbest(scoreF, n)

	return dict([word,True] for word in itertools.chain(words, bigrams))

def chinese_features(words):
	words = sttok.tokenize(words)
	scoreF = BigramAssocMeasures.chi_sq
	#bigram count
	n = 150
	# assumed completion, mirroring features() above
	bigrams = BCF.from_words(words).nbest(scoreF, n)

	return dict([word, True] for word in itertools.chain(words, bigrams))
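
A sketch of how feature dicts like these feed into NLTK's NaiveBayesClassifier; the training sentences and labels below are invented, not taken from the cited tutorial:

train_set = [
    (features("this movie was great fun"), "pos"),
    (features("a dull and boring film"), "neg"),
]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(features("great fun")))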
Example #7
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

# Tokenize a Chinese greeting into a list of tokens.
sttok.tokenize(u'你好')

# Note: tag() expects a list of tokens; passing a raw string tags each character.
stpos.tag(u'basf')

# The usual pipeline: tokenize first, then POS-tag the token list.
stpos.tag(sttok.tokenize(u'text'))

# Same caveat as above: a raw string is treated as a sequence of characters.
stner.tag(u'你好')

# NER-tag the tokenized text.
stner.tag(sttok.tokenize(u'你好'))
Example #8
#!/usr/bin/python
# -*- coding: utf-8 -*-

import logging
import sys
import json

import tornado.ioloop
import tornado.web

from src.common.UserMessage import UserMessage
from src.common.WtfCommandMessage import WtfCommandMessage

from nltk.tokenize.stanford import CoreNLPTokenizer

sttok = CoreNLPTokenizer('http://corenlp:9000')

fake_stats = {}


class WtfHandler(tornado.web.RequestHandler):
    def __init__(self, application, request, **kwargs):
        super().__init__(application, request, **kwargs)

    def post(self):
        data = tornado.escape.json_decode(self.request.body)
        wtf_command = WtfCommandMessage(data["word"], data["username"])
        self.set_status(200)


class AllMessagesHandler(tornado.web.RequestHandler):