def test_to_indexed_instance_converts_correctly(self):
    """Check that indexing respects whichever tokenizer is installed on TextInstance."""
    data_indexer = DataIndexer()
    a_word_index = data_indexer.add_word_to_index("a", namespace='words')
    sentence_index = data_indexer.add_word_to_index("sentence", namespace='words')
    # Register the character-level vocabulary; insertion order matches the
    # original one-call-per-character sequence.
    char_index = {
            char: data_indexer.add_word_to_index(char, namespace='characters')
            for char in ("A", " ", "a", "s", "e", "n", "t", "c")
            }

    # Default word tokenizer: lowercased word indices.
    indexed = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert indexed.word_indices == [a_word_index, sentence_index]

    # Character tokenizer: one index per character of the raw text.
    TextInstance.tokenizer = tokenizers['characters']({})
    indexed = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert indexed.word_indices == [char_index[char] for char in "A sentence"]

    # Combined tokenizer: per word, the word index followed by its character indices.
    TextInstance.tokenizer = tokenizers['words and characters']({})
    indexed = TrueFalseInstance("A sentence", None).to_indexed_instance(data_indexer)
    assert indexed.word_indices == [
            [a_word_index] + [char_index[char] for char in "a"],
            [sentence_index] + [char_index[char] for char in "sentence"],
            ]

    # Restore the default tokenizer so later tests are unaffected.
    TextInstance.tokenizer = tokenizers['words']({})
def test_merge(self):
    """Merging two single-instance datasets should concatenate their instances in order."""
    first = TrueFalseInstance("testing", None, None)
    second = TrueFalseInstance("testing1", None, None)
    merged = Dataset([first]).merge(Dataset([second]))
    assert merged.instances == [first, second]
def test_words_tokenizes_the_sentence_correctly(self):
    """words() output should track the tokenizer currently installed on TextInstance."""
    instance = TrueFalseInstance("This is a sentence.", None)
    expected_words = ['this', 'is', 'a', 'sentence', '.']
    expected_characters = list("This is a sentence.")

    # Default word tokenizer.
    assert instance.words() == {'words': expected_words}

    # Character tokenizer.
    TextInstance.tokenizer = tokenizers['characters']({})
    assert instance.words() == {'characters': expected_characters}

    # Combined tokenizer returns both namespaces.
    TextInstance.tokenizer = tokenizers['words and characters']({})
    assert instance.words() == {'words': expected_words,
                                'characters': expected_characters}

    # Restore the default tokenizer so later tests are unaffected.
    TextInstance.tokenizer = tokenizers['words']({})
def test_words_tokenizes_the_sentence_correctly(self):
    """Tokenization should lowercase, split punctuation, and split contractions."""
    # Simple sentence: lowercased words with trailing punctuation split off.
    instance = TrueFalseInstance("This is a sentence.", None)
    assert instance.words() == {'words': ['this', 'is', 'a', 'sentence', '.']}

    # Contraction: "isn't" splits into "is" + "n't".
    instance = TrueFalseInstance("This isn't a sentence.", None)
    assert instance.words() == {'words': ['this', 'is', "n't", 'a', 'sentence', '.']}

    # Commas become separate tokens.
    instance = TrueFalseInstance("And, I have commas.", None)
    assert instance.words() == {'words': ['and', ',', 'i', 'have', 'commas', '.']}
def read_instance_message(self, instance_message):  # pylint: disable=redefined-variable-type
    """Convert a protobuf instance message into the corresponding ``Instance`` object.

    Dispatches on ``instance_message.type`` to build a ``TrueFalseInstance``,
    ``MultipleTrueFalseInstance``, ``QuestionAnswerInstance``, or
    ``CharacterSpanInstance`` (recursing into contained instances where the
    message nests them).  If the message carries background instances, the
    result is wrapped in a ``BackgroundInstance``.

    Raises:
        RuntimeError: if ``instance_message.type`` is not one of the known
            instance types.
    """
    instance_type = instance_message.type
    if instance_type == message_pb2.TRUE_FALSE:
        instance = TrueFalseInstance(instance_message.question, None, None)
    elif instance_type == message_pb2.MULTIPLE_TRUE_FALSE:
        # Each contained message is itself a full instance message; recurse.
        options = [self.read_instance_message(contained)
                   for contained in instance_message.contained_instances]
        instance = MultipleTrueFalseInstance(options)
    elif instance_type == message_pb2.QUESTION_ANSWER:
        instance = QuestionAnswerInstance(instance_message.question,
                                          instance_message.answer_options,
                                          None, None)
    elif instance_type == message_pb2.CHARACTER_SPAN:
        instance = CharacterSpanInstance(instance_message.question,
                                         instance_message.passage,
                                         None, None)
    else:
        # Bug fix: the protobuf enum value is an int, so concatenating it
        # directly to a str raised TypeError instead of this RuntimeError.
        raise RuntimeError("Unrecognized instance type: " + str(instance_type))
    if instance_message.background_instances:
        background_instances = [self.read_instance_message(background)
                                for background in instance_message.background_instances]
        instance = BackgroundInstance(instance, background_instances)
    return instance
def test_read_from_line_handles_two_column_with_default_false(self):
    """A line with text and index but no label should pick up the default label."""
    expected_index = 23
    expected_text = "this is a sentence"
    line = self.instance_to_line(expected_text, None, expected_index)

    instance = TrueFalseInstance.read_from_line(line, default_label=False)

    assert instance.text == expected_text
    assert instance.label is False
    assert instance.index == expected_index
def test_read_from_line_handles_two_column_with_label(self):
    """A line with text and label but no index should round-trip both fields."""
    expected_text = "this is a sentence"
    expected_label = True
    line = self.instance_to_line(expected_text, expected_label, None)

    instance = TrueFalseInstance.read_from_line(line)

    assert instance.text == expected_text
    assert instance.label is expected_label
    assert instance.index is None
def test_get_nearest_neighbors_does_not_crash(self):
    """Smoke test: building the LSH and querying it should not raise."""
    model_args = {
            'corpus_path': self.corpus_path,
            'model_serialization_prefix': './',
            'max_sentence_length': 5,
            }
    model = self.get_model(DifferentiableSearchMemoryNetwork, model_args)
    model.encoder_model = FakeEncoder()
    model._initialize_lsh()
    model.max_sentence_length = 5
    model.max_knowledge_length = 2
    query = TrueFalseInstance("this is a sentence", True)
    model.get_nearest_neighbors(query)
def test_fit_word_dictionary_respects_min_count(self):
    """Words below min_count should be dropped; min_count=1 keeps everything."""
    # Frequencies in this sentence: a=4, b=2, c=3.
    dataset = TextDataset([TrueFalseInstance("a a a a b b c c c", True)])

    def fitted_vocabulary(min_count):
        # Fit a fresh indexer at the given threshold and return its vocabulary.
        indexer = DataIndexer()
        indexer.fit_word_dictionary(dataset, min_count=min_count)
        return indexer.words_in_index()

    strict_vocabulary = fitted_vocabulary(4)
    assert 'a' in strict_vocabulary
    assert 'b' not in strict_vocabulary
    assert 'c' not in strict_vocabulary

    permissive_vocabulary = fitted_vocabulary(1)
    assert 'a' in permissive_vocabulary
    assert 'b' in permissive_vocabulary
    assert 'c' in permissive_vocabulary
def test_read_from_line_handles_one_column(self):
    """A bare-text line should yield an instance with no label and no index."""
    line = "this is a sentence"
    instance = TrueFalseInstance.read_from_line(line)
    assert instance.text == line
    assert instance.label is None
    assert instance.index is None