def _create_sentence_objects(self): '''Returns a list of Sentence objects from the raw text. ''' sentence_objects = [] sentences = sent_tokenize(self.raw) char_index = 0 # Keeps track of character index within the blob for sent in sentences: # Compute the start and end indices of the sentence # within the blob start_index = self.raw.index(sent, char_index) char_index += len(sent) end_index = start_index + len(sent) # Sentences share the same models as their parent blob s = Sentence(sent, start_index=start_index, end_index=end_index, tokenizer=self.tokenizer, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger, analyzer=self.analyzer, parser=self.parser, classifier=self.classifier) sentence_objects.append(s) return sentence_objects
def test_sent_tokenize(self): tokens = sent_tokenize(self.text) assert_true(is_generator(tokens)) # It's a generator assert_equal(list(tokens), self.tokenizer.tokenize(self.text))