Python PunktSentenceTokenizer.sentences_from_text 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nltk.tokenize

메소드/함수: sentences_from_text

hotexamples.com에서의 예제들: 9

Python PunktSentenceTokenizer.sentences_from_text - 9개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한, Python nltk.tokenize.PunktSentenceTokenizer.sentences_from_text의 평가가 가장 높은 실제 사용 예제들입니다. 예제를 평가하여 예제 품질 향상에 도움을 주실 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

PunktSentenceTokenizer(30)

tokenize(30)

span_tokenize(9)

sentences_from_text(7)

train(2)

difference(1)

intersection(1)

sentences_from_tokens(1)

span_tokenize_sents(1)

tokenizer(1)

union(1)

예제 #1

파일 보기

파일: server.py 프로젝트: electrone901/nurtured

def get_run_ons(string):
    """Count the sentences in ``string`` that look like run-on sentences.

    The text is split into sentences with ``PunktSentenceTokenizer``. A
    sentence is counted when it has more than 16 words (as reported by
    ``get_word_count``) and fewer than 3 characters that are neither word
    characters nor whitespace (punctuation and symbols).

    Args:
        string: The text to analyse.

    Returns:
        The number of sentences that satisfy both conditions.
    """
    sent_detector = PunktSentenceTokenizer()
    # One token per character that is neither a word character nor
    # whitespace. The pattern does not depend on the sentence, so the
    # tokenizer is built once instead of once per sentence.
    punct_tokenizer = RegexpTokenizer(r'[^\w\s]')

    run_count = 0
    for sentence in sent_detector.sentences_from_text(string):
        word_count = get_word_count(sentence)
        punct_count = len(punct_tokenizer.tokenize(sentence))
        if word_count > 16 and punct_count < 3:
            run_count += 1

    return run_count

예제 #2

파일 보기

파일: server.py 프로젝트: electrone901/nurtured

def get_sentence_count(string):
    """Return how many sentences the Punkt tokenizer finds in ``string``."""
    return len(PunktSentenceTokenizer().sentences_from_text(string))

예제 #3

파일 보기

파일: news_cat_pred.py 프로젝트: jbrambleDC/newsIQ

pst = PunktSentenceTokenizer()
files =[]
words = []
##allow to take a directory
for i in sys.argv:
  match = re.match(".*\.txt$", i)
  if match:
    files.append(i)

print 'file_name' + '\t' + 'politcal_stance'

for f in files:
  with open(f,"rb") as class_file:
    if sys.argv[1] == '--sents':
      data = class_file.read().replace('\n', '')
      sents = pst.sentences_from_text(data)
      for sent in sents:
        sent_words = nltk.word_tokenize(sent)
        for word in sent_words:
          words.append(word)
      feats = dict([(word, True) for word in words])

    else:
      for line in class_file:
        line_words = nltk.word_tokenize(line)
        for word in line_words:
          words.append(word)
      feats = dict([(word, True) for word in words])

    print f + '\t' + classifier.classify(feats)

예제 #4

파일 보기

파일: news_cat_pred.py 프로젝트: jbrambleDC/newsIQ

pst = PunktSentenceTokenizer()
files = []
words = []
##allow to take a directory
for i in sys.argv:
    match = re.match(".*\.txt$", i)
    if match:
        files.append(i)

print 'file_name' + '\t' + 'politcal_stance'

for f in files:
    with open(f, "rb") as class_file:
        if sys.argv[1] == '--sents':
            data = class_file.read().replace('\n', '')
            sents = pst.sentences_from_text(data)
            for sent in sents:
                sent_words = nltk.word_tokenize(sent)
                for word in sent_words:
                    words.append(word)
            feats = dict([(word, True) for word in words])

        else:
            for line in class_file:
                line_words = nltk.word_tokenize(line)
                for word in line_words:
                    words.append(word)
            feats = dict([(word, True) for word in words])

        print f + '\t' + classifier.classify(feats)

예제 #5

파일 보기

# Notebook-style fragment: bare expressions are there for their displayed
# value. `df`, `spam` and `pd` are defined outside this excerpt.

# Show the 10 columns holding the largest values in row 0 (transpose, sort
# by that row descending, keep the first 10, transpose back).
df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Hashing vectorizer, better suited to big data
from sklearn.feature_extraction.text import HashingVectorizer
hvec = HashingVectorizer()
hvec.fit([spam])

# Vectorize the single `spam` document into a dense one-row DataFrame and
# show its 10 largest columns, as above.
df  = pd.DataFrame(hvec.transform([spam]).todense())
df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Split a text into sentences and return them as a list
from nltk.tokenize import PunktSentenceTokenizer
easy_text = "I went to the zoo today. What do you think of that? I bet you hate it! Or maybe you don't"
sent_detector = PunktSentenceTokenizer()
sent_detector.sentences_from_text(easy_text)

# Recorded interactive output of the call above, kept as a bare string.
"""
Out[6]: 
['I went to the zoo today.',
 'What do you think of that?',
 'I bet you hate it!',
 "Or maybe you don't"]
"""

# Reduce words to their stems with the Porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem('Swimmed')
print stemmer.stem('Swimming')
"""

예제 #6

파일 보기

from nltk.tokenize import PunktSentenceTokenizer
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

txt = "I was in the room in Los Angeles in 1988, about 200 feet from Michael Dukakis, when Bernard Shaw asked him what he'd do if his wife were raped. Now that really was a sucker punch of a question. I was on the other side of the arena, in Cleveland, when Donald Trump bared his teeth (metaphorically speaking) at Megyn Kelly."

pst = PunktSentenceTokenizer()
sents = pst.sentences_from_text(txt.encode('utf-8').replace('\n',''))
print sents
bigrams = ngrams(word_tokenize(sents[1].replace('.','')),2)
print bigrams

예제 #7

파일 보기

파일: text_parser.py 프로젝트: antonfait/SerchEngine

def sentence_parser(input_str):
    """Yield each sentence of ``input_str`` that is not blank.

    Sentences come from ``PunktSentenceTokenizer.sentences_from_text``; any
    sentence that is empty after stripping whitespace is skipped. Yielded
    sentences are passed through unstripped.
    """
    splitter = PunktSentenceTokenizer()
    for candidate in splitter.sentences_from_text(input_str):
        if not candidate.strip():
            continue
        yield candidate

예제 #8

파일 보기

wt = WPT()  # word tokenizer instance

Names1 = {}  # dictionary for names found in the first text
Names2 = {}  # dictionary for names found in the second text

# Read both input files as UTF-8. The `with` block closes both handles
# once the contents are in memory; before, they were never closed.
# Both files are still opened before either one is read.
with open("some1.txt", "r", encoding='utf-8') as my_file1, \
        open("some2.txt", "r", encoding='utf-8') as my_file2:
    text1 = my_file1.read()  # entire contents of some1.txt
    text2 = my_file2.read()  # entire contents of some2.txt

# Split each text into sentences and remember how many there are.
# `st` is defined outside this excerpt (presumably the sentence
# tokenizer instance -- confirm).
text1_ = st.sentences_from_text(text1)
lenS1 = len(text1_)
text2_ = st.sentences_from_text(text2)
lenS2 = len(text2_)


def paral1(q1, q21):  # function taking two arguments, both queues
    """Print the Mystem analysis of words for sentence indices from ``q21``.

    One item is taken from ``q21`` and iterated; each element is used as an
    index into the module-level ``text1_`` sentence list. ``q1`` is not
    used in the visible body.
    """
    _Kx = q21.get(
    )  # take from q21 the list of the first and last sentence assigned to this process
    for k in _Kx:
        xNames = text1_[k]
        # NOTE(review): `sentence` is not defined in this function and
        # `xNames` is never used -- presumably `xNames` was meant to be
        # tokenized here; confirm against the full file.
        for word in wt.tokenize(
                sentence):  # iterate over the words of the selected text
            m = Mystem()
            analize = m.analyze(word)  # morphological analysis of the word
            print(m.analyze(word))

예제 #9

파일 보기

from nltk.tokenize import PunktSentenceTokenizer as PST  #Класс для выделения предложений
from nltk.tokenize import WordPunctTokenizer as WPT      #класс для разделения слов в предложении
from pymystem3 import Mystem #морфологический анализатор для русского языка

st = PST()  # sentence tokenizer instance
wt = WPT()  # word tokenizer instance


Names1 = {}  # occurrence counts of names found in the first text
Names2 = {}  # dictionary for names found in the second text

# Read both input files as UTF-8. The `with` block closes both handles
# once the contents are in memory; before, they were never closed.
# Both files are still opened before either one is read.
with open("some1.txt", "r", encoding='utf-8') as my_file1, \
        open("some2.txt", "r", encoding='utf-8') as my_file2:
    text1 = my_file1.read()  # entire contents of some1.txt
    text2 = my_file2.read()  # entire contents of some2.txt

# One morphological analyzer for the whole run. The original built a new
# Mystem() for every word and then overwrote it with a loop variable
# that was also named `m`.
m = Mystem()

for sentence in st.sentences_from_text(text1):  # each sentence of text1
    for word in wt.tokenize(sentence):  # each word of the sentence
        analize = m.analyze(word)  # morphological analysis of the word
        for item in analize:
            # Items without morphological information have no 'analysis'
            # entry, so default to an empty list.
            for parse in item.get("analysis", []):
                gr = parse.get("gr", "")
                # Count the word only when the grammar string carries
                # both the masculine ("муж") and first-name ("имя")
                # markers. The original `"муж" and "имя" in ...` only
                # tested for "имя".
                if "муж" in gr and "имя" in gr:
                    Names1[word] = Names1.get(word, 0) + 1