def test_indexed_instance_padding(self):
        """Pad an indexed instance and check every nesting level's length.

        The instance is indexed with a ``DataIndexer`` fit on a dataset that
        contains only this instance, padded with explicit target lengths, and
        then both ``answers_indexed`` and ``background_indexed`` are walked to
        confirm that each level matches the requested size.
        """
        indexer = DataIndexer()
        indexer.fit_word_dictionary(TextDataset([self.instance]))

        padded = self.instance.to_indexed_instance(indexer)
        target_lengths = {
            'num_question_tuples': 1,
            'num_background_tuples': 4,
            'num_slots': 3,
            'num_sentence_words': 6,
            'num_options': 4
        }
        padded.pad(target_lengths)

        def assert_tuple_is_padded(indexed_tuple):
            # Each tuple must hold 'num_slots' slots, and each slot must have
            # the 'num_sentence_words' length.
            assert len(indexed_tuple) == target_lengths['num_slots']
            for padded_slot in indexed_tuple:
                assert len(padded_slot) == target_lengths['num_sentence_words']

        # Answers: one entry per option, each holding 'num_question_tuples' tuples.
        assert len(padded.answers_indexed) == target_lengths['num_options']
        for option_tuples in padded.answers_indexed:
            assert len(option_tuples) == target_lengths['num_question_tuples']
            for option_tuple in option_tuples:
                assert_tuple_is_padded(option_tuple)

        # Background: a flat list of 'num_background_tuples' tuples.
        assert len(padded.background_indexed) == target_lengths['num_background_tuples']
        for bg_tuple in padded.background_indexed:
            assert_tuple_is_padded(bg_tuple)
Exemplo n.º 2
0
    def test_fit_word_dictionary_respects_min_count(self):
        """Check that ``fit_word_dictionary`` honours its ``min_count`` argument.

        The single training text contains 'a' four times, 'b' twice and 'c'
        three times, so a threshold of 4 should keep only 'a', while a
        threshold of 1 should keep all three words.
        """
        corpus = TextDataset([TextClassificationInstance("a a a a b b c c c", True)])

        # Threshold of 4: only 'a' occurs often enough to be indexed.
        strict_indexer = DataIndexer()
        strict_indexer.fit_word_dictionary(corpus, min_count=4)
        assert 'a' in strict_indexer.words_in_index()
        for rare_word in ('b', 'c'):
            assert rare_word not in strict_indexer.words_in_index()

        # Threshold of 1: a fresh indexer should index every word.
        lenient_indexer = DataIndexer()
        lenient_indexer.fit_word_dictionary(corpus, min_count=1)
        for word in ('a', 'b', 'c'):
            assert word in lenient_indexer.words_in_index()