def _create_sentence_objects(self):
    """Return a list of Sentence objects built from this blob's raw text.

    Attempts to handle sentences that have more than one punctuation
    mark at the end of the sentence. Examples: "An ellipses is no
    problem..." or "This is awesome!!!"
    """
    tokenizer = SentenceTokenizer()
    sentences = []
    search_from = 0  # character offset in the blob where the next search begins
    for raw_sentence in tokenizer.itokenize(self.raw):
        # Locate this sentence within the blob so its span can be recorded.
        begin = self.raw.index(raw_sentence, search_from)
        end = begin + len(raw_sentence)
        search_from += len(raw_sentence)
        # Each sentence shares the same models as its parent blob.
        sentences.append(Sentence(
            raw_sentence,
            start_index=begin,
            end_index=end,
            tokenizer=self.tokenizer,
            np_extractor=self.np_extractor,
            pos_tagger=self.pos_tagger,
            analyzer=self.analyzer,
            parser=self.parser,
            classifier=self.classifier,
        ))
    return sentences
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for SentenceTokenizer and the sent_tokenize convenience function."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(self.tokenizer.tokenize(self.text), expected)

    # This is a known problem with the sentence tokenizer.
    @attr("skip")
    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Beautiful is better than ugly.")
        assert_equal(next(gen), "Simple is better than complex.")

    def test_sent_tokenize(self):
        tokens = sent_tokenize(self.text)
        # sent_tokenize should be lazy, yielding the same sentences
        # that the tokenizer produces eagerly.
        assert_true(is_generator(tokens))  # It's a generator
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
def _create_sentence_objects(self):
    """Build and return a list of Sentence objects from the raw text."""
    sent_tokenizer = SentenceTokenizer()
    objects = []
    offset = 0  # tracks where to start searching for the next sentence
    for text in sent_tokenizer.itokenize(self.raw):
        # Compute the start and end indices of this sentence within the blob.
        start = self.raw.index(text, offset)
        offset += len(text)
        stop = start + len(text)
        # A sentence inherits every model from its parent blob.
        sentence = Sentence(text, start_index=start, end_index=stop,
                            tokenizer=self.tokenizer,
                            np_extractor=self.np_extractor,
                            pos_tagger=self.pos_tagger,
                            analyzer=self.analyzer,
                            parser=self.parser,
                            classifier=self.classifier)
        objects.append(sentence)
    return objects
class TestSentenceTokenizer(unittest.TestCase):
    """Tests for the SentenceTokenizer class."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        result = self.tokenizer.tokenize(self.text)
        assert_equal(result, [
            "Beautiful is better than ugly.",
            "Simple is better than complex."
        ])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(
            self.tokenizer.tokenize(text),
            ["Hello world.", "How do you do?!", "My name's Steve..."],
        )
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        # itokenize yields sentences lazily, one at a time.
        sentences = self.tokenizer.itokenize(self.text)
        assert_equal(next(sentences), "Beautiful is better than ugly.")
        assert_equal(next(sentences), "Simple is better than complex.")
class TestSentenceTokenizer(unittest.TestCase):
    """Exercises both eager (tokenize) and lazy (itokenize) sentence splitting."""

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        first = "Beautiful is better than ugly."
        second = "Simple is better than complex."
        assert_equal(self.tokenizer.tokenize(self.text), [first, second])

    def test_tokenize_with_multiple_punctuation(self):
        # Sentences ending in several punctuation marks should stay intact.
        text = "Hello world. How do you do?! My name's Steve..."
        expected = ["Hello world.", "How do you do?!", "My name's Steve..."]
        assert_equal(self.tokenizer.tokenize(text), expected)
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

    def test_itokenize(self):
        stream = self.tokenizer.itokenize(self.text)
        assert_equal(next(stream), "Beautiful is better than ugly.")
        assert_equal(next(stream), "Simple is better than complex.")