Exemplo n.º 1
0
 def test_to_indexed_feature_converts_correctly(self):
     """Check that ``to_indexed_feature`` turns both sentences into indices.

     Every token of the two sentences is registered with a fresh
     ``DataIndexer`` first; the expected index lists are then assembled
     from the values ``add_word_to_index`` returned. A label of ``0`` is
     expected to come back as ``[1, 0]`` on the indexed feature.
     """
     feature = PairFeature(
         "What do dogs eat?", "What do cats eat, play with, or enjoy?", 0)
     data_indexer = DataIndexer()
     # Keep this registration order fixed; the indices handed back by the
     # indexer may depend on it.
     vocabulary = ("what", "do", "dogs", "eat", "cats", "?", ",", "play",
                   "with", "or", "enjoy")
     index_of = {
         token: data_indexer.add_word_to_index(token) for token in vocabulary
     }

     indexed = feature.to_indexed_feature(data_indexer)
     first_idxs, second_idxs = indexed.get_int_word_indices()

     first_tokens = ("what", "do", "dogs", "eat", "?")
     second_tokens = ("what", "do", "cats", "eat", ",", "play", "with", ",",
                      "or", "enjoy", "?")
     assert first_idxs == [index_of[token] for token in first_tokens]
     assert second_idxs == [index_of[token] for token in second_tokens]
     assert indexed.label == [1, 0]
Exemplo n.º 2
0
 def test_read_from_line_handles_train_example(self):
     """Check ``PairFeature.read_from_line`` on a labelled (training) line.

     The line is built by ``self.feature_to_line`` from an id, both
     questions, a label and the two question ids. The parsed feature must
     keep the raw question strings, expose lowercased word tokens together
     with one character list per word, and carry the label. A line that
     does not follow the expected format must raise ``RuntimeError``.
     """
     question1 = "Does he enjoy playing soccer in the rain?"
     question2 = "Does he enjoy coding in the rain?"
     # Named ``pair_id`` rather than ``id`` so the builtin is not shadowed.
     pair_id = 0
     qid1 = 0
     qid2 = 1
     label = 0
     line = self.feature_to_line(pair_id, question1, question2, label, qid1,
                                 qid2)
     feature = PairFeature.read_from_line(line)

     assert feature.first_sentence_str == question1
     expected_first_sentence_words = [
         "does", "he", "enjoy", "playing", "soccer", "in", "the", "rain",
         "?"
     ]
     # The character view is one list of characters per word token.
     expected_first_sentence_chars = [
         list(word) for word in expected_first_sentence_words
     ]
     assert feature.first_sentence_tokenized == {
         "words": expected_first_sentence_words,
         "characters": expected_first_sentence_chars
     }

     expected_second_sentence_words = [
         "does", "he", "enjoy", "coding", "in", "the", "rain", "?"
     ]
     expected_second_sentence_chars = [
         list(word) for word in expected_second_sentence_words
     ]
     assert feature.second_sentence_tokenized == {
         "words": expected_second_sentence_words,
         "characters": expected_second_sentence_chars
     }
     assert feature.second_sentence_str == question2
     assert feature.label == 0

     # Malformed input must be rejected rather than silently parsed.
     with self.assertRaises(RuntimeError):
         PairFeature.read_from_line("This is not a proper line.")
Exemplo n.º 3
0
 def test_word_tokenizer_tokenizes_the_sentence_correctly(self):
     """Check that ``words()`` returns the tokens of both sentences.

     The expected result is a dict holding the lowercased word tokens of
     the first and second sentence in one list, plus a flat list of the
     characters of those tokens.
     """
     # NOTE(review): ``Tokenizer`` is passed as the third positional
     # argument, where sibling tests pass a label -- confirm against the
     # ``PairFeature`` signature.
     feature = PairFeature("One sentence.", "A two sentence.", Tokenizer)
     expected_words = ["one", "sentence", ".", "a", "two", "sentence", "."]
     # Flat character list: the characters of each word, in word order.
     expected_characters = [
         char for word in expected_words for char in word
     ]
     expected = {"words": expected_words, "characters": expected_characters}
     assert feature.words() == expected
Exemplo n.º 4
0
 def test_merge(self):
     """Check that ``Dataset.merge`` concatenates the features of two datasets.

     Merging a dataset holding the first feature with one holding the
     second must yield the original feature list in order. Passing a plain
     list instead of a dataset must raise ``ValueError``.
     """
     features = [PairFeature("testing1", "test1", None),
                 PairFeature("testing2", "test2", None)]
     dataset1 = Dataset(features[:1])
     dataset2 = Dataset(features[1:])
     merged = dataset1.merge(dataset2)
     assert merged.features == features
     # The call is expected to raise, so its result is not bound to a name.
     with self.assertRaises(ValueError):
         dataset1.merge(features)
Exemplo n.º 5
0
 def test_truncate(self):
     """Check ``Dataset.truncate`` for a valid size and for invalid arguments.

     Truncating a two-feature dataset to ``1`` must leave one feature.
     A string argument and a size of ``0`` must both raise ``ValueError``.
     """
     features = [PairFeature("testing1", "test1", None),
                 PairFeature("testing2", "test2", None)]
     dataset = Dataset(features)
     truncated = dataset.truncate(1)
     assert len(truncated.features) == 1
     # The calls below are expected to raise, so their results are not
     # bound to a name.
     with self.assertRaises(ValueError):
         dataset.truncate("1")
     with self.assertRaises(ValueError):
         dataset.truncate(0)
Exemplo n.º 6
0
 def test_words_tokenizes_the_sentence_correctly(self):
     """Check that ``words()`` returns word and character tokens.

     The expected result holds the lowercased word tokens of both
     sentences in one list and a flat list of the characters of those
     tokens.
     """
     pair_feature = PairFeature("A sentence.", "Another sentence.", 0)
     word_tokens = ["a", "sentence", ".", "another", "sentence", "."]
     # Flat character list: the characters of each word, in word order.
     char_tokens = [char for word in word_tokens for char in word]
     expected = {"words": word_tokens, "characters": char_tokens}
     assert pair_feature.words() == expected
Exemplo n.º 7
0
 def test_read_from_line_handles_test_example(self):
     """Check ``PairFeature.read_from_line`` on an unlabelled (test) line.

     The line is built by ``self.feature_to_line`` from an id and the two
     questions only. The parsed feature must keep the raw question
     strings, expose lowercased word tokens together with one character
     list per word, and have ``None`` as its label.
     """
     question1 = "Does he enjoy playing soccer in the rain?"
     question2 = "Does he enjoy coding in the rain?"
     # Named ``pair_id`` rather than ``id`` so the builtin is not shadowed.
     pair_id = 0
     line = self.feature_to_line(pair_id, question1, question2)
     feature = PairFeature.read_from_line(line)

     assert feature.first_sentence_str == question1
     expected_first_sentence_words = [
         "does", "he", "enjoy", "playing", "soccer", "in", "the", "rain",
         "?"
     ]
     # The character view is one list of characters per word token.
     expected_first_sentence_chars = [
         list(word) for word in expected_first_sentence_words
     ]
     assert feature.first_sentence_tokenized == {
         "words": expected_first_sentence_words,
         "characters": expected_first_sentence_chars
     }

     expected_second_sentence_words = [
         "does", "he", "enjoy", "coding", "in", "the", "rain", "?"
     ]
     expected_second_sentence_chars = [
         list(word) for word in expected_second_sentence_words
     ]
     assert feature.second_sentence_tokenized == {
         "words": expected_second_sentence_words,
         "characters": expected_second_sentence_chars
     }
     assert feature.second_sentence_str == question2
     # No label was written to the line, so none may be invented.
     assert feature.label is None
Exemplo n.º 8
0
    def test_to_indexed_dataset(self):
        """Check that ``to_indexed_dataset`` indexes every feature's sentences.

        Four words are registered with a fresh ``DataIndexer``; the first
        two features of the indexed dataset must then report exactly the
        indices of their words, in sentence order.
        """
        features = [
            PairFeature("testing1 test1", "test1", None),
            PairFeature("testing2", "test2 testing1", None),
        ]
        data_indexer = DataIndexer()
        # Keep this registration order fixed; the indices handed back by
        # the indexer may depend on it.
        index_of = {
            word: data_indexer.add_word_to_index(word)
            for word in ("testing1", "test1", "testing2", "test2")
        }
        dataset = TextDataset(features)
        indexed_dataset = dataset.to_indexed_dataset(data_indexer)

        # (first sentence words, second sentence words) per feature.
        expected = [
            (["testing1", "test1"], ["test1"]),
            (["testing2"], ["test2", "testing1"]),
        ]
        for position, (first_words, second_words) in enumerate(expected):
            indexed_feature = indexed_dataset.features[position]
            first_idxs, second_idxs = indexed_feature.get_int_word_indices()
            assert first_idxs == [index_of[word] for word in first_words]
            assert second_idxs == [index_of[word] for word in second_words]
Exemplo n.º 9
0
    def test_fit_word_dictionary_respects_min_count(self):
        """Check that ``fit_word_dictionary`` filters words by ``min_count``.

        Across both sentences ``a`` occurs four times, ``b`` twice and
        ``c`` three times. With ``min_count=4`` only ``a`` may be indexed;
        with ``min_count=1`` all three words must be indexed.
        """
        dataset = TextDataset([PairFeature("a a a a b", "b c c c", 1)])

        strict_indexer = DataIndexer()
        strict_indexer.fit_word_dictionary(dataset, min_count=4)
        assert 'a' in strict_indexer.words_in_index()
        for rare_word in ('b', 'c'):
            assert rare_word not in strict_indexer.words_in_index()

        # A fresh indexer, so nothing carries over from the strict run.
        lenient_indexer = DataIndexer()
        lenient_indexer.fit_word_dictionary(dataset, min_count=1)
        for word in ('a', 'b', 'c'):
            assert word in lenient_indexer.words_in_index()
Exemplo n.º 10
0
 def test_exceptions(self):
     """Check that ``DataIndexer`` rejects wrongly typed arguments.

     Each call below passes an argument of an unexpected type (a string
     where a number is used elsewhere, a string instead of a dataset, an
     int instead of a word, ...) and must raise ``ValueError``.
     """
     data_indexer = DataIndexer()
     dataset = TextDataset([PairFeature("a a a a b", "b c c c", 1)])

     # (method, positional arguments) pairs, checked in this order.
     invalid_calls = [
         (data_indexer.fit_word_dictionary, (dataset, "3")),
         (data_indexer.fit_word_dictionary, ("not a dataset", 3)),
         (data_indexer.add_word_to_index, (3,)),
         (data_indexer.get_word_index, (3,)),
         (data_indexer.get_word_from_index, ("3",)),
     ]
     for method, arguments in invalid_calls:
         with self.assertRaises(ValueError):
             method(*arguments)
Exemplo n.º 11
0
 def test_exceptions(self):
     """Check that ``Dataset`` rejects invalid constructor arguments.

     Both a bare feature (not wrapped in a list) and a list whose element
     is a plain string must raise ``ValueError``.
     """
     lone_feature = PairFeature("testing1", "test1", 0)
     for invalid_argument in (lone_feature, ["not an feature"]):
         with self.assertRaises(ValueError):
             Dataset(invalid_argument)