def test_words_tokenizes_the_sentence_correctly(self):
    """The default tokenizer lowercases text and splits off punctuation and contractions.

    Checks three sentences: plain words, a contraction ("isn't" -> "is" + "n't"),
    and embedded commas, each via TrueFalseInstance.words().
    """
    instance = TrueFalseInstance("This is a sentence.", None)
    assert instance.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}

    # Contractions are split into their two tokens.
    instance = TrueFalseInstance("This isn't a sentence.", None)
    assert instance.words() == {'words': ['this', 'is', "n't", 'a', 'sentence', '.']}

    # Commas become standalone tokens.
    instance = TrueFalseInstance("And, I have commas.", None)
    assert instance.words() == {'words': ['and', ',', 'i', 'have', 'commas', '.']}
def test_words_tokenizes_the_sentence_correctly(self):
    """Swapping TextInstance.tokenizer changes the keys/values words() returns.

    Exercises the 'words', 'characters', and 'words and characters' tokenizers
    on the same TrueFalseInstance.

    NOTE(review): this method has the same name as the previous test in this
    file; if both are defined in the same class, this one shadows the other --
    confirm they belong to different test classes, or rename one.
    """
    t = TrueFalseInstance("This is a sentence.", None)
    # Default ('words') tokenizer.
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}

    expected_characters = ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ',
                           's', 'e', 'n', 't', 'e', 'n', 'c', 'e', '.']
    try:
        TextInstance.tokenizer = tokenizers['characters']({})
        assert t.words() == {'characters': expected_characters}

        TextInstance.tokenizer = tokenizers['words and characters']({})
        assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.'],
                             'characters': expected_characters}
    finally:
        # Restore the class-level default even if an assertion above fails,
        # so a failure here cannot leak a non-default tokenizer into other
        # tests (the original restored it only on the success path).
        TextInstance.tokenizer = tokenizers['words']({})