def test_words_tokenizes_the_sentence_correctly(self):
    t = TextClassificationInstance("This is a sentence.", None)
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}
    t = TextClassificationInstance("This isn't a sentence.", None)
    assert t.words() == {'words': ['this', 'is', "n't", 'a', 'sentence', '.']}
    t = TextClassificationInstance("And, I have commas.", None)
    assert t.words() == {'words': ['and', ',', 'i', 'have', 'commas', '.']}
def test_to_indexed_instance_converts_correctly(self):
    data_indexer = DataIndexer()
    # Word-level vocabulary entries.
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    capital_a_index = data_indexer.add_word_to_index("A", namespace='words')
    space_index = data_indexer.add_word_to_index(" ", namespace='words')
    a_index = data_indexer.add_word_to_index("a", namespace='words')
    s_index = data_indexer.add_word_to_index("s", namespace='words')
    e_index = data_indexer.add_word_to_index("e", namespace='words')
    n_index = data_indexer.add_word_to_index("n", namespace='words')
    t_index = data_indexer.add_word_to_index("t", namespace='words')
    c_index = data_indexer.add_word_to_index("c", namespace='words')
    # Character-level vocabulary entries live in a separate namespace.
    a_char_index = data_indexer.add_word_to_index("a", namespace='characters')
    s_char_index = data_indexer.add_word_to_index("s", namespace='characters')
    e_char_index = data_indexer.add_word_to_index("e", namespace='characters')
    n_char_index = data_indexer.add_word_to_index("n", namespace='characters')
    t_char_index = data_indexer.add_word_to_index("t", namespace='characters')
    c_char_index = data_indexer.add_word_to_index("c", namespace='characters')

    # Default word tokenizer: one index per (lowercased) word.
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [a_index, sentence_index]

    # Character tokenizer: one index per character, including the space.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [capital_a_index, space_index, s_index, e_index, n_index,
                                     t_index, e_index, n_index, c_index, e_index]

    # Combined tokenizer: each word becomes [word_index, char_index, char_index, ...].
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    instance = TextClassificationInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert instance.word_indices == [[a_index, a_char_index],
                                     [sentence_index, s_char_index, e_char_index, n_char_index,
                                      t_char_index, e_char_index, n_char_index, c_char_index,
                                      e_char_index]]
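# The test above relies on DataIndexer keeping a separate vocabulary per
# namespace, with add_word_to_index returning a stable integer id for each
# (word, namespace) pair.  A minimal sketch of that contract (hypothetical;
# the real DataIndexer presumably also reserves ids for padding/OOV):
from collections import defaultdict

class MinimalDataIndexer:
    def __init__(self):
        self._indices = defaultdict(dict)  # namespace -> {word: id}

    def add_word_to_index(self, word, namespace='words'):
        vocab = self._indices[namespace]
        if word not in vocab:
            vocab[word] = len(vocab)
        return vocab[word]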
def test_words_tokenizes_the_sentence_correctly(self):
    t = TextClassificationInstance("This is a sentence.", None)
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}

    # The character tokenizer keeps case and spaces.
    TextInstance.tokenizer = tokenizers['characters'](Params({}))
    assert t.words() == {'words': ['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', ' ', 's', 'e',
                                   'n', 't', 'e', 'n', 'c', 'e', '.']}

    # The combined tokenizer reports both views; its characters come from the
    # lowercased words, so there are no spaces or capital letters.
    TextInstance.tokenizer = tokenizers['words and characters'](Params({}))
    assert t.words() == {'words': ['this', 'is', 'a', 'sentence', '.'],
                         'characters': ['t', 'h', 'i', 's', 'i', 's', 'a', 's', 'e', 'n', 't',
                                        'e', 'n', 'c', 'e', '.']}
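# TextInstance.tokenizer is class-level state, so a tokenizer swapped in by one
# test leaks into the next unless it is reset.  A sketch of the kind of cleanup
# the surrounding test class presumably performs (names assumed):
def tearDown(self):
    super().tearDown()
    TextInstance.tokenizer = tokenizers['words'](Params({}))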
def test_read_from_line_handles_two_column_with_index(self):
    index = 23
    text = "this is a sentence"
    label = None
    line = self.instance_to_line(text, label, index)
    instance = TextClassificationInstance.read_from_line(line)
    assert instance.text == text
    assert instance.label is label
    assert instance.index == index
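# instance_to_line is a helper defined elsewhere on the test class.  A minimal
# sketch of what it presumably does, assuming tab-separated columns with None
# fields omitted and binary labels serialized as '1'/'0' (these details are
# assumptions, not the repo's actual helper):
@staticmethod
def instance_to_line(text, label=None, index=None):
    line = ''
    if index is not None:
        line += str(index) + '\t'  # index column comes first when present
    line += text
    if label is not None:
        line += '\t' + ('1' if label else '0')
    return line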
def test_get_nearest_neighbors_does_not_crash(self):
    # Only checks that the call completes; the results themselves are not verified.
    args = Params({
        'corpus_path': self.corpus_path,
        'model_serialization_prefix': './',
        'num_sentence_words': 5,
    })
    model = self.get_model(DifferentiableSearchMemoryNetwork, args)
    model.encoder_model = FakeEncoder()
    model._initialize_lsh()
    model.num_sentence_words = 5
    model.max_knowledge_length = 2
    model.get_nearest_neighbors(TextClassificationInstance("this is a sentence", True))
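# FakeEncoder comes from the test helpers; this test only needs something that
# maps inputs to fixed-length vectors so that _initialize_lsh() has encodings
# to index.  A stand-in might look like this (a hypothetical sketch, not the
# repo's actual helper):
import numpy

class FakeEncoder:
    def predict(self, inputs):
        # One constant 4-dimensional "encoding" per input row.
        return numpy.asarray([[0.1, 0.2, 0.3, 0.4]] * len(inputs))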
def test_read_from_line_handles_one_column(self):
    text = "this is a sentence"
    instance = TextClassificationInstance.read_from_line(text)
    assert instance.text == text
    assert instance.label is None
    assert instance.index is None