예제 #1
0
class DatasetTest(TestCase):
    def setUp(self):
        data_matrix = [[0, "detective", "book"], [1, "pablo", "escobar"],
                       [2, "african", "queen"], [3, "applause", "movie"]]
        headers = ["number", "string", "another_string"]
        self.dataset = Dataset(data_matrix, headers)

    def test_data_types_and_categories_when_initializing_dataset(self):
        self.assertEqual(3, len(self.dataset.data_types))
        self.assertEqual("integer", self.dataset.data_types[0].data_type)
        self.assertEqual("string", self.dataset.data_types[1].data_type)
        self.assertEqual("string", self.dataset.data_types[2].data_type)

        self.assertEqual(4, len(self.dataset.data_types[1].categories))
        self.assertTrue(self.dataset.data_types[1].in_categories("detective"))
        self.assertTrue(self.dataset.data_types[1].in_categories("pablo"))
        self.assertTrue(self.dataset.data_types[1].in_categories("african"))
        self.assertTrue(self.dataset.data_types[1].in_categories("applause"))

        self.assertEqual(4, len(self.dataset.data_types[2].categories))
        self.assertTrue(self.dataset.data_types[2].in_categories("book"))
        self.assertTrue(self.dataset.data_types[2].in_categories("escobar"))
        self.assertTrue(self.dataset.data_types[2].in_categories("queen"))
        self.assertTrue(self.dataset.data_types[2].in_categories("movie"))

    def test_getting_independent_variables(self):
        dependent_variable_index = DatasetVariable(0)
        dependent_variable_header = DatasetVariable("number")

        independent_variables = self.dataset.get_independent_variables(
            dependent_variable_index)
        self.assertEqual(2, len(independent_variables))
        for variable in independent_variables:
            self.assertIn(variable.variable, ["string", "another_string"])

        independent_variables = self.dataset.get_independent_variables(
            dependent_variable_header)
        self.assertEqual(2, len(independent_variables))
        for variable in independent_variables:
            self.assertIn(variable.variable, ["string", "another_string"])

    def test_crossfold_partitions(self):
        past_test_datasets = []
        for training_dataset, test_dataset in self.dataset.crossfold_partitions(
                4):
            self.assertEqual(3, training_dataset.num_rows)
            self.assertEqual(1, test_dataset.num_rows)
            self.assertNotIn(test_dataset.get(0, 0), past_test_datasets)
            self.assertListEqual(["number", "string", "another_string"],
                                 training_dataset.headers)
            self.assertListEqual(["number", "string", "another_string"],
                                 test_dataset.headers)

            past_test_datasets.append(test_dataset.get(0, 0))

        for training_dataset, test_dataset in self.dataset.crossfold_partitions(
                2):
            self.assertEqual(2, training_dataset.num_rows)
            self.assertEqual(2, test_dataset.num_rows)
            self.assertListEqual(["number", "string", "another_string"],
                                 training_dataset.headers)
            self.assertListEqual(["number", "string", "another_string"],
                                 test_dataset.headers)

    def test_get_filtered_matrix(self):
        variables = [DatasetVariable(0), DatasetVariable("string")]

        filtered_matrix = self.dataset.get_filtered_matrix(variables)
        self.assertEqual(4, len(filtered_matrix))
        for i in xrange(len(filtered_matrix)):
            self.assertEqual(2, len(filtered_matrix[i]))

        self.assertListEqual([0, "detective"], filtered_matrix[0])
        self.assertListEqual([1, "pablo"], filtered_matrix[1])
        self.assertListEqual([2, "african"], filtered_matrix[2])
        self.assertListEqual([3, "applause"], filtered_matrix[3])

    def test_getting_filtered_matrix_with_no_variables(self):
        variables = []

        with self.assertRaises(ValueError):
            filtered_matrix = self.dataset.get_filtered_matrix(variables)
예제 #2
0
class IndependentVariableSelectionTest(TestCase):
    def setUp(self):
        data_matrix = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
        headers = ["column_0", "column_1", "column_2"]
        self.header_dataset = Dataset(data_matrix, headers)
        self.nonheader_dataset = Dataset(data_matrix)

        self.headered_dependent_variable = DatasetVariable("column_0")
        self.nonheadered_dependent_variable = DatasetVariable(0)

    def test_initializing_small_dataset_with_header(self):
        settings = AbstractSettings()
        independent_variables = self.header_dataset.get_independent_variables(
            self.headered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.headered_dependent_variable, independent_variables)

        variables = selection.initialize_independent_variables(2)
        headers = [var.variable for var in variables]
        self.assertIn("column_1", headers)
        self.assertIn("column_2", headers)

    def test_initializaing_small_dataset_without_header(self):
        settings = AbstractSettings()
        independent_variables = self.nonheader_dataset.get_independent_variables(
            self.nonheadered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.nonheadered_dependent_variable,
            independent_variables)

        variables = selection.initialize_independent_variables(2)
        headers = [var.variable for var in variables]
        self.assertIn(1, headers)
        self.assertIn(2, headers)

    def test_getting_probability_of_variables(self):
        settings = AbstractSettings()
        independent_variables = self.header_dataset.get_independent_variables(
            self.headered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.headered_dependent_variable, independent_variables)

        self.assertAlmostEqual(
            0.5, selection.get_probability(DatasetVariable("column_1")))
        self.assertAlmostEqual(
            0.5, selection.get_probability(DatasetVariable("column_2")))

        self.assertAlmostEqual(0.5, selection.get_probability("column_1"))
        self.assertAlmostEqual(0.5, selection.get_probability("column_2"))

    def test_increasing_probability_of_variables(self):
        settings = AbstractSettings()
        independent_variables = self.header_dataset.get_independent_variables(
            self.headered_dependent_variable)
        selection = IndependentVariableSelection(
            settings, self.headered_dependent_variable, independent_variables)
        selection.increase_probability(DatasetVariable("column_1"))

        self.assertLess(0.5,
                        selection.get_probability(DatasetVariable("column_1")))
        self.assertGreater(
            0.5, selection.get_probability(DatasetVariable("column_2")))