class DatasetTest(TestCase):
    """Tests for ``Dataset`` using a 4-row, 3-column matrix with headers."""

    def setUp(self):
        """Build the dataset shared by every test in this class."""
        data_matrix = [[0, "detective", "book"],
                       [1, "pablo", "escobar"],
                       [2, "african", "queen"],
                       [3, "applause", "movie"]]
        headers = ["number", "string", "another_string"]
        self.dataset = Dataset(data_matrix, headers)

    def test_data_types_and_categories_when_initializing_dataset(self):
        """Each column gets a data type; string columns record categories."""
        data_types = self.dataset.data_types
        self.assertEqual(3, len(data_types))
        self.assertEqual("integer", data_types[0].data_type)
        self.assertEqual("string", data_types[1].data_type)
        self.assertEqual("string", data_types[2].data_type)

        # Every distinct value of a string column must be a known category.
        self.assertEqual(4, len(data_types[1].categories))
        for category in ("detective", "pablo", "african", "applause"):
            self.assertTrue(data_types[1].in_categories(category))

        self.assertEqual(4, len(data_types[2].categories))
        for category in ("book", "escobar", "queen", "movie"):
            self.assertTrue(data_types[2].in_categories(category))

    def test_getting_independent_variables(self):
        """The dependent variable may be given by index or by header."""
        dependent_variables = (DatasetVariable(0), DatasetVariable("number"))
        for dependent_variable in dependent_variables:
            independent_variables = self.dataset.get_independent_variables(
                dependent_variable)
            self.assertEqual(2, len(independent_variables))
            for variable in independent_variables:
                self.assertIn(variable.variable,
                              ["string", "another_string"])

    def test_crossfold_partitions(self):
        """Partitions have the expected sizes and keep the headers."""
        expected_headers = ["number", "string", "another_string"]

        # Four folds over four rows: one test row per fold, never repeated.
        seen_test_values = []
        for training_dataset, test_dataset in \
                self.dataset.crossfold_partitions(4):
            self.assertEqual(3, training_dataset.num_rows)
            self.assertEqual(1, test_dataset.num_rows)
            self.assertNotIn(test_dataset.get(0, 0), seen_test_values)
            self.assertListEqual(expected_headers, training_dataset.headers)
            self.assertListEqual(expected_headers, test_dataset.headers)
            seen_test_values.append(test_dataset.get(0, 0))

        # Two folds over four rows: an even two/two split.
        for training_dataset, test_dataset in \
                self.dataset.crossfold_partitions(2):
            self.assertEqual(2, training_dataset.num_rows)
            self.assertEqual(2, test_dataset.num_rows)
            self.assertListEqual(expected_headers, training_dataset.headers)
            self.assertListEqual(expected_headers, test_dataset.headers)

    def test_get_filtered_matrix(self):
        """Filtering keeps only the requested columns, in request order."""
        variables = [DatasetVariable(0), DatasetVariable("string")]
        filtered_matrix = self.dataset.get_filtered_matrix(variables)
        self.assertEqual(4, len(filtered_matrix))
        # Iterate rows directly: no index arithmetic, and no reliance on
        # the Python 2-only ``xrange`` builtin.
        for row in filtered_matrix:
            self.assertEqual(2, len(row))
        self.assertListEqual([0, "detective"], filtered_matrix[0])
        self.assertListEqual([1, "pablo"], filtered_matrix[1])
        self.assertListEqual([2, "african"], filtered_matrix[2])
        self.assertListEqual([3, "applause"], filtered_matrix[3])

    def test_getting_filtered_matrix_with_no_variables(self):
        """Requesting a filtered matrix with no variables raises ValueError."""
        variables = []
        with self.assertRaises(ValueError):
            self.dataset.get_filtered_matrix(variables)
class IndependentVariableSelectionTest(TestCase):
    """Tests for ``IndependentVariableSelection`` on a small 3x3 dataset.

    The same matrix is wrapped twice, once with headers and once without,
    so the dependent variable can be given by name or by position.
    """

    def setUp(self):
        """Create both datasets and their dependent variables."""
        rows = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
        column_names = ["column_0", "column_1", "column_2"]
        self.header_dataset = Dataset(rows, column_names)
        self.nonheader_dataset = Dataset(rows)
        self.headered_dependent_variable = DatasetVariable("column_0")
        self.nonheadered_dependent_variable = DatasetVariable(0)

    def _selection_for(self, dataset, dependent_variable):
        """Build a selection from *dataset*'s independent variables."""
        settings = AbstractSettings()
        candidates = dataset.get_independent_variables(dependent_variable)
        return IndependentVariableSelection(
            settings, dependent_variable, candidates)

    def test_initializing_small_dataset_with_header(self):
        """Initializing with two variables yields both remaining headers."""
        selection = self._selection_for(
            self.header_dataset, self.headered_dependent_variable)
        chosen = selection.initialize_independent_variables(2)
        chosen_names = [entry.variable for entry in chosen]
        for expected in ("column_1", "column_2"):
            self.assertIn(expected, chosen_names)

    def test_initializaing_small_dataset_without_header(self):
        """Without headers the chosen variables are column indices."""
        selection = self._selection_for(
            self.nonheader_dataset, self.nonheadered_dependent_variable)
        chosen = selection.initialize_independent_variables(2)
        chosen_indices = [entry.variable for entry in chosen]
        for expected in (1, 2):
            self.assertIn(expected, chosen_indices)

    def test_getting_probability_of_variables(self):
        """Both independent variables start at probability 0.5."""
        selection = self._selection_for(
            self.header_dataset, self.headered_dependent_variable)
        # Look up by DatasetVariable first, then by the raw header string.
        for name in ("column_1", "column_2"):
            probability = selection.get_probability(DatasetVariable(name))
            self.assertAlmostEqual(0.5, probability)
        for name in ("column_1", "column_2"):
            probability = selection.get_probability(name)
            self.assertAlmostEqual(0.5, probability)

    def test_increasing_probability_of_variables(self):
        """Boosting one variable raises it above 0.5 and lowers the other."""
        selection = self._selection_for(
            self.header_dataset, self.headered_dependent_variable)
        selection.increase_probability(DatasetVariable("column_1"))

        boosted = selection.get_probability(DatasetVariable("column_1"))
        self.assertLess(0.5, boosted)

        other = selection.get_probability(DatasetVariable("column_2"))
        self.assertGreater(0.5, other)