def _create_sentence_objects(self):
    """Return a list of Sentence objects built from this blob's raw text.

    Attempts to handle sentences that have more than one punctuation
    mark at the end of the sentence. Examples: "An ellipses is no
    problem..." or "This is awesome!!!"
    """
    tokenizer = SentenceTokenizer()
    sentences = []
    search_from = 0  # character offset in the blob where the next search begins
    for raw_sentence in tokenizer.itokenize(self.raw):
        # Locate this sentence within the blob so its span can be recorded.
        begin = self.raw.index(raw_sentence, search_from)
        end = begin + len(raw_sentence)
        search_from += len(raw_sentence)
        # Each sentence shares the same models as its parent blob.
        sentences.append(Sentence(
            raw_sentence,
            start_index=begin,
            end_index=end,
            tokenizer=self.tokenizer,
            np_extractor=self.np_extractor,
            pos_tagger=self.pos_tagger,
            analyzer=self.analyzer,
            parser=self.parser,
            classifier=self.classifier,
        ))
    return sentences
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer and the sent_tokenize convenience function."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    # This is a known problem with the sentence tokenizer.
    @attr("skip")
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        # sent_tokenize should be lazy, yielding the same sentences
        # that the tokenizer produces eagerly.
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
def _create_sentence_objects(self):
    """Build and return a list of Sentence objects from the raw text."""
    sent_tokenizer = SentenceTokenizer()
    objects = []
    offset = 0  # tracks where to start searching for the next sentence
    for text in sent_tokenizer.itokenize(self.raw):
        # Compute the start and end indices of this sentence within the blob.
        start = self.raw.index(text, offset)
        offset += len(text)
        stop = start + len(text)
        # A sentence inherits every model from its parent blob.
        sentence = Sentence(text, start_index=start, end_index=stop,
                            tokenizer=self.tokenizer,
                            np_extractor=self.np_extractor,
                            pos_tagger=self.pos_tagger,
                            analyzer=self.analyzer,
                            parser=self.parser,
                            classifier=self.classifier)
        objects.append(sentence)
    return objects
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for the SentenceTokenizer class."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        result = self.tokenizer.tokenize(self.text)
        assert_equal(result, [
            "Beautiful is better than ugly.",
            "Simple is better than complex."
        ])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(
            self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."],
        )
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize yields sentences lazily, one at a time.
        sentences = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentences), "Beautiful is better than ugly.")
        assert_equal(next(sentences), "Simple is better than complex.")
class TestSentenceTokenizer(unittest.TestCase):
    """Exercises both eager (tokenize) and lazy (itokenize) sentence splitting."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        first = "Beautiful is better than ugly."
        second = "Simple is better than complex."
        assert_equal(self.tokenizer.tokenize(self.text), [first, second])

    def test_tokenize_with_multiple_punctuation(self):
        # Sentences ending in several punctuation marks should stay intact.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        stream = self.tokenizer.itokenize(self.text)
        assert_equal(next(stream), "Beautiful is better than ugly.")
        assert_equal(next(stream), "Simple is better than complex.")