def test_dataset_merging_with_different_types_of_headers(self):
    # Merging a dataset that carries headers with one whose headers are None
    # is expected to raise ValueError.
    headered = Dataset([[1, 2], [3, 4]], ["h0", "h1"])
    headerless = Dataset([[5, 6], [7, 8]], None)
    with self.assertRaises(ValueError):
        # Only the raised error matters, so the result is not bound.
        self.transformer.merge_datasets([headered, headerless])
def test_dataset_merging_with_different_number_of_rows(self):
    # A three-row dataset merged with a two-row dataset is expected to raise
    # ValueError.
    three_rows = Dataset([[1, 2], [3, 4], [10, 11]], ["h0", "h1"])
    two_rows = Dataset([[5, 6], [7, 8]], ["h2", "h3"])
    with self.assertRaises(ValueError):
        # Only the raised error matters, so the result is not bound.
        self.transformer.merge_datasets([three_rows, two_rows])
def setUp(self):
    # One 3x3 matrix is exposed twice: once with column headers and once
    # without. A dependent variable addressing the first column is prepared
    # for each form (by header name and by index).
    rows = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
    column_names = ["column_0", "column_1", "column_2"]

    self.header_dataset = Dataset(rows, column_names)
    self.nonheader_dataset = Dataset(rows)

    self.headered_dependent_variable = DatasetVariable("column_0")
    self.nonheadered_dependent_variable = DatasetVariable(0)
def test_dataset_merging_without_headers(self):
    # Two header-less 2x2 datasets are merged; the result is expected to be
    # 2x4 with the rows concatenated side by side and no headers.
    left = Dataset([[1, 2], [3, 4]])
    right = Dataset([[5, 6], [7, 8]])

    merged = self.transformer.merge_datasets([left, right])

    self.assertEqual(2, merged.num_rows)
    self.assertEqual(4, merged.num_cols)
    expected_rows = [[1, 2, 5, 6], [3, 4, 7, 8]]
    for row_index, expected_row in enumerate(expected_rows):
        self.assertListEqual(expected_row, merged.data_matrix[row_index])
    self.assertEqual(None, merged.headers)
def test_dataset_merging_with_headers(self):
    # Two headered 2x2 datasets are merged; rows and headers are both
    # expected to be concatenated in input order.
    left = Dataset([[1, 2], [3, 4]], ["h0", "h1"])
    right = Dataset([[5, 6], [7, 8]], ["h2", "h3"])

    merged = self.transformer.merge_datasets([left, right])

    self.assertEqual(2, merged.num_rows)
    self.assertEqual(4, merged.num_cols)
    expected_rows = [[1, 2, 5, 6], [3, 4, 7, 8]]
    for row_index, expected_row in enumerate(expected_rows):
        self.assertListEqual(expected_row, merged.data_matrix[row_index])
    self.assertListEqual(["h0", "h1", "h2", "h3"], merged.headers)
def setUp(self):
    # Fixture: a 5-row dataset with three numeric columns and one string
    # column. Column 0 is the dependent variable; columns 1-3 are the
    # independent variables handed to the SklearnModel under test.
    rows = [
        [1, 2, 3, "a"],
        [2, 3, 2, "b"],
        [3, 2, 1, "a"],
        [5, 5, 1, "c"],
        [2, 2, 2, "a"],
    ]
    self.dataset = Dataset(rows)
    self.settings = AbstractSettings({})
    self.parameter_set = ParameterSet({})
    self.dependent_variable = DatasetVariable(0)
    self.independent_variables = [
        DatasetVariable(column) for column in (1, 2, 3)
    ]
    self.sklearn_model = SklearnModel(
        self.settings,
        self.parameter_set,
        self.dependent_variable,
        self.independent_variables,
    )
def test_header_transformation_for_dataset_without_headers(self):
    # For a dataset created without headers, the transformed header of a
    # column is expected to be None.
    headerless = Dataset([[1, 2, 3], [4, 5, 6]])
    first_column = DatasetVariable(0)

    transformed_header = self.dataset_transformation.get_transformed_header(
        headerless, first_column)

    self.assertEqual(None, transformed_header)
def test_identity_transform(self):
    # Transforming a header-less 2x3 dataset is expected to give back the
    # same rows and no headers.
    source = Dataset([[1, 2, 3], [4, 5, 6]])

    result = self.dataset_transformation.transform(source)
    result_rows = result.data_matrix
    result_headers = result.headers

    self.assertEqual(2, len(result_rows))
    self.assertListEqual([1, 2, 3], result_rows[0])
    self.assertListEqual([4, 5, 6], result_rows[1])
    self.assertEqual(None, result_headers)
def setUp(self):
    # Fixture: run population initialisation of an OptimizationAlgorithm
    # (population size 5, FakePredictiveModel as the only model type) on a
    # 4x4 dataset and keep the resulting model population for the tests.
    config = AbstractSettings(
        {"optimization_algorithm.population_size": 5})
    generator = PredictiveModelGenerator(config)
    generator.add_model_type(FakePredictiveModel)

    rows = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [0, 1, 3, 4]]
    algorithm = OptimizationAlgorithm(
        Dataset(rows), DatasetVariable(0), config, generator)
    algorithm.initialize_population()

    self.model_population = algorithm.model_population
def test_training_decision_tree_on_simple_dataset(self):
    # Train on four (y, x) rows and check that predicting on the same rows
    # yields one prediction per row.
    # NOTE(review): the test name mentions a decision tree, but the model
    # exercised here is RandomForestRegression - confirm the intended name.
    config = AbstractSettings({})
    target = DatasetVariable(0)
    predictors = [DatasetVariable(1)]
    model = RandomForestRegression(config, self.parameter_set, target,
                                   predictors)

    samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
    fitted = model.train(samples)
    predictions = fitted.predict(samples)

    self.assertEqual(4, len(predictions))
def test_using_transformer_on_single_transformation(self):
    # With IdentityTransformation as the only transformation, the result is
    # expected to hold the original three columns followed by their three
    # transformed copies, and to have no headers.
    source = Dataset([[1, 2, "hi"], [2, 3, "bye"]])
    transformer = DatasetTransformer(self.settings,
                                     [IdentityTransformation])

    result = transformer.transform(source)
    rows = result.data_matrix

    self.assertEqual(2, result.num_rows)
    self.assertEqual(6, result.num_cols)
    self.assertListEqual([1, 2, "hi", 1, 2, "hi"], rows[0])
    self.assertListEqual([2, 3, "bye", 2, 3, "bye"], rows[1])
    self.assertEqual(None, result.headers)
def setUp(self):
    # Fixture: a 5-row dataset with three numeric columns and one string
    # column. Column 0 is the dependent variable; columns 1-3 are the
    # independent variables handed to the SklearnModel under test.
    rows = [
        [1, 2, 3, "a"],
        [2, 3, 2, "b"],
        [3, 2, 1, "a"],
        [5, 5, 1, "c"],
        [2, 2, 2, "a"],
    ]
    self.dataset = Dataset(rows)
    self.settings = AbstractSettings({})
    self.parameter_set = ParameterSet({})
    self.dependent_variable = DatasetVariable(0)
    self.independent_variables = [
        DatasetVariable(column) for column in (1, 2, 3)
    ]
    self.sklearn_model = SklearnModel(
        self.settings,
        self.parameter_set,
        self.dependent_variable,
        self.independent_variables,
    )
def test_header_transformation_for_dataset_with_headers(self):
    # A column can be addressed by index or by header name; in both cases
    # the transformed header is expected to be the original header prefixed
    # with "identitytransformation_".
    headered = Dataset([[1, 2, 3], [4, 5, 6]], ["h0", "h1", "h2"])

    cases = (
        (0, "identitytransformation_h0"),
        ("h2", "identitytransformation_h2"),
    )
    for column_key, expected_header in cases:
        variable = DatasetVariable(column_key)
        actual_header = self.dataset_transformation.get_transformed_header(
            headered, variable)
        self.assertEqual(expected_header, actual_header)
def test_identity_transform_with_headers(self):
    # Transforming a headered 2x3 dataset is expected to keep the rows
    # unchanged and to prefix every header with "identitytransformation_".
    source = Dataset([[1, 2, 3], [4, 5, 6]], ["h0", "h1", "h2"])

    result = self.dataset_transformation.transform(source)
    result_rows = result.data_matrix
    result_headers = result.headers

    self.assertEqual(2, len(result_rows))
    self.assertListEqual([1, 2, 3], result_rows[0])
    self.assertListEqual([4, 5, 6], result_rows[1])

    self.assertEqual(3, len(result_headers))
    expected_headers = [
        "identitytransformation_h0",
        "identitytransformation_h1",
        "identitytransformation_h2",
    ]
    for position, expected_header in enumerate(expected_headers):
        self.assertEqual(expected_header, result_headers[position])
def test_training_ridge_on_simple_dataset(self):
    # Train on four (y, x) rows where y == x and compare the predictions for
    # the same rows against fixed expected values.
    # NOTE(review): the test name mentions ridge, but the model exercised
    # here is LassoRegression - confirm the intended name.
    config = {}
    target = DatasetVariable(0)
    predictors = [DatasetVariable(1)]
    model = LassoRegression(config, self.parameter_set, target, predictors)

    samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
    fitted = model.train(samples)
    predictions = fitted.predict(samples)

    expected_values = [1.6, 2.2, 2.8, 3.4]
    for position, expected in enumerate(expected_values):
        self.assertAlmostEqual(expected, predictions[position])
def transform(self, dataset, variables=None):
    """Build a new dataset from the transformed columns of ``dataset``.

    Args:
        dataset: Source dataset. It must provide ``num_cols``,
            ``get_filtered_matrix`` and ``get_filtered_data_types``.
        variables: Variables selecting the columns to consider. When
            omitted, one ``DatasetVariable`` per column index is used.

    Returns:
        A ``Dataset`` made of the transformed columns and the transformed
        headers, or ``None`` when no selected column has a data type listed
        by ``self.valid_data_types()``.
    """
    if variables is None:
        # ``range`` works on both Python 2 and 3, unlike ``xrange``.
        variables = [DatasetVariable(i) for i in range(dataset.num_cols)]

    filtered_matrix = dataset.get_filtered_matrix(variables)
    filtered_data_types = dataset.get_filtered_data_types(variables)
    num_cols = len(filtered_matrix[0])

    # NOTE(review): assumes valid_data_types() returns the same value on
    # every call, so it is fetched once instead of once per column.
    valid_types = self.valid_data_types()
    transformed_columns = []
    for j in range(num_cols):
        if filtered_data_types[j].data_type in valid_types:
            self.transform_and_append_column(j, filtered_matrix,
                                             transformed_columns)

    # Nothing was transformed: say so explicitly rather than falling off the
    # end of the function or touching names that were never assigned.
    if not transformed_columns:
        return None

    data_matrix = self.rotate_matrix(transformed_columns)
    headers = self.get_transformed_headers(dataset, variables)
    return Dataset(data_matrix, headers)
def test_using_transformer_on_data_matrix_with_headers(self):
    # With IdentityTransformation as the only transformation on a headered
    # dataset, the result is expected to hold the original columns followed
    # by their transformed copies, with headers extended accordingly.
    source = Dataset([[1, 2, "hi"], [2, 3, "bye"]], ["h0", "h1", "h2"])
    transformer = DatasetTransformer(self.settings,
                                     [IdentityTransformation])

    result = transformer.transform(source)
    rows = result.data_matrix

    self.assertEqual(2, result.num_rows)
    self.assertEqual(6, result.num_cols)
    self.assertListEqual([1, 2, "hi", 1, 2, "hi"], rows[0])
    self.assertListEqual([2, 3, "bye", 2, 3, "bye"], rows[1])

    expected_headers = [
        "h0",
        "h1",
        "h2",
        "identitytransformation_h0",
        "identitytransformation_h1",
        "identitytransformation_h2",
    ]
    self.assertEqual(6, len(result.headers))
    self.assertListEqual(expected_headers, result.headers)
class SklearnModelTest(TestCase):
    """Tests for SklearnModel data extraction and OLS train/predict."""

    def setUp(self):
        # Fixture: a 5-row dataset with three numeric columns and one string
        # column. Column 0 is the dependent variable; columns 1-3 are the
        # independent variables.
        rows = [
            [1, 2, 3, "a"],
            [2, 3, 2, "b"],
            [3, 2, 1, "a"],
            [5, 5, 1, "c"],
            [2, 2, 2, "a"],
        ]
        self.dataset = Dataset(rows)
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})
        self.dependent_variable = DatasetVariable(0)
        self.independent_variables = [
            DatasetVariable(column) for column in (1, 2, 3)
        ]
        self.sklearn_model = SklearnModel(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables,
        )

    def test_categorical_independent_variables(self):
        # Five rows are expected, each with five values.
        # NOTE(review): presumably 2 numeric columns plus one column per
        # distinct string category ("a", "b", "c") - confirm against
        # SklearnModel.
        extracted = self.sklearn_model.get_independent_variable_data(
            self.dataset)
        self.assertEqual(5, len(extracted))
        first_row_width = len(extracted[0])
        self.assertEqual(5, first_row_width)

    def test_dependent_variable_data(self):
        # The dependent data is expected to be column 0 of the fixture.
        extracted = self.sklearn_model.get_dependent_variable_data(
            self.dataset)
        self.assertEqual(5, len(extracted))
        self.assertListEqual([1, 2, 3, 5, 2], extracted)

    def test_training_and_predicting_using_ols_regression(self):
        # For each of the 2 cross-fold partitions, train on one part and
        # expect one prediction per row of the other part.
        model = OLSLinearRegression(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables,
        )
        partitions = self.dataset.crossfold_partitions(2)
        for training_part, test_part in partitions:
            fitted = model.train(training_part)
            predictions = fitted.predict(test_part)
            self.assertEqual(test_part.num_rows, len(predictions))
def test_training_ols_on_simple_dataset(self):
    # Train OLS on four (y, x) rows where y == x; the predictions for the
    # same rows are expected to reproduce y.
    config = {}
    params = {}
    target = DatasetVariable(0)
    predictors = [DatasetVariable(1)]
    model = OLSLinearRegression(config, params, target, predictors)

    samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
    fitted = model.train(samples)
    predictions = fitted.predict(samples)

    for position, expected in enumerate([1, 2, 3, 4]):
        self.assertEqual(expected, predictions[position])
def read(self, maximum_size=None, delimiter=",", quoting=csv.QUOTE_NONE):
    """Read the CSV file at ``self.dataset_filename`` into a ``Dataset``.

    Args:
        maximum_size: Size limit handed to the line readers. When omitted,
            the ``dataset.maximum_dataset_size`` setting is used.
        delimiter: Field delimiter passed to ``csv.reader``.
        quoting: Quoting mode passed to ``csv.reader``.

    Returns:
        A ``Dataset`` built from the cleaned data matrix, the detected
        headers and the classified data types.
    """
    if maximum_size is None:
        maximum_size = self.settings.get("dataset.maximum_dataset_size")

    # NOTE(review): binary mode is what the Python 2 csv module expects;
    # Python 3 would need text mode with newline="" - confirm the target
    # interpreter before changing it.
    with open(self.dataset_filename, 'rb') as f:
        reader = csv.reader(f, delimiter=delimiter, quoting=quoting)
        if self.settings.get("dataset.randomize_file_reader"):
            # Pass the reader created above; the previous code referenced an
            # undefined name here and failed whenever this setting was on.
            data_matrix = self.randomized_read_lines(reader, maximum_size)
        else:
            data_matrix = self.greedy_read_lines(reader, maximum_size)

        # NOTE(review): the remaining steps stay inside the ``with`` block
        # in case the line readers consume the file lazily - confirm.
        data_matrix, headers = self.detect_headers(data_matrix)
        # Data types are classified before cleaning, as in the original.
        data_types = DataTypeClassification.classify_data_matrix(data_matrix)
        data_matrix = DatasetCleaner(self.settings, data_matrix,
                                     headers).clean()

        self.logger.info("Read dataset from file: '%s'",
                         self.dataset_filename)
        self.logger.info("Headers: %s", headers)
        self.logger.info("Dataset Size: %s", len(data_matrix))
        return Dataset(data_matrix, headers=headers, data_types=data_types)
def setUp(self):
    # Fixture: an OptimizationAlgorithm over a 4x4 dataset with column 0 as
    # the dependent variable and FakePredictiveModel as the only model type.
    config = AbstractSettings({
        "differential_evolution.crossover_probability": 1.0,
        "differential_evolution.differential_weight": 1.0,
        "optimization_algorithm.population_size": 5,
        "independent_variable_selection.initial_independent_variables_percentage": 1.0
    })
    rows = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [0, 1, 3, 4]]
    samples = Dataset(rows)
    target = DatasetVariable(0)

    generator = PredictiveModelGenerator(config)
    generator.add_model_type(FakePredictiveModel)

    self.optimization_algorithm = OptimizationAlgorithm(
        samples, target, config, generator)
def merge_datasets(self, datasets):
    """Merge several datasets into one new ``Dataset``.

    The first dataset seeds the result: its rows and headers are copied,
    then every following dataset is folded in through
    ``self._merge_headers`` and ``self._merge_data_matrix``.

    Args:
        datasets: Non-empty sequence of datasets to merge.

    Returns:
        A new ``Dataset`` built from the merged data matrix and headers.

    Raises:
        ValueError: If ``datasets`` is empty. The merge helpers may raise
            further errors; they are not visible from here.
    """
    if len(datasets) < 1:
        raise ValueError("Must specify at least one dataset to merge.")

    first_dataset = datasets[0]
    num_rows = first_dataset.num_rows
    # Each row of the first dataset is copied into a fresh list.
    data_matrix = [list(first_dataset.get_row(i)) for i in range(num_rows)]

    if first_dataset.headers is None:
        headers = None
    else:
        headers = list(first_dataset.headers)

    for dataset in datasets[1:]:
        headers = self._merge_headers(headers, dataset)
        data_matrix = self._merge_data_matrix(data_matrix, dataset,
                                              num_rows)
    return Dataset(data_matrix, headers)
class SklearnModelTest(TestCase):
    """Tests for SklearnModel data extraction and OLS train/predict."""

    def setUp(self):
        # Fixture: a 5-row dataset with three numeric columns and one string
        # column. Column 0 is the dependent variable; columns 1-3 are the
        # independent variables.
        rows = [
            [1, 2, 3, "a"],
            [2, 3, 2, "b"],
            [3, 2, 1, "a"],
            [5, 5, 1, "c"],
            [2, 2, 2, "a"],
        ]
        self.dataset = Dataset(rows)
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})
        self.dependent_variable = DatasetVariable(0)
        self.independent_variables = [
            DatasetVariable(column) for column in (1, 2, 3)
        ]
        self.sklearn_model = SklearnModel(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables,
        )

    def test_categorical_independent_variables(self):
        # Five rows are expected, each with five values.
        # NOTE(review): presumably 2 numeric columns plus one column per
        # distinct string category ("a", "b", "c") - confirm against
        # SklearnModel.
        extracted = self.sklearn_model.get_independent_variable_data(
            self.dataset)
        self.assertEqual(5, len(extracted))
        first_row_width = len(extracted[0])
        self.assertEqual(5, first_row_width)

    def test_dependent_variable_data(self):
        # The dependent data is expected to be column 0 of the fixture.
        extracted = self.sklearn_model.get_dependent_variable_data(
            self.dataset)
        self.assertEqual(5, len(extracted))
        self.assertListEqual([1, 2, 3, 5, 2], extracted)

    def test_training_and_predicting_using_ols_regression(self):
        # For each of the 2 cross-fold partitions, train on one part and
        # expect one prediction per row of the other part.
        model = OLSLinearRegression(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables,
        )
        partitions = self.dataset.crossfold_partitions(2)
        for training_part, test_part in partitions:
            fitted = model.train(training_part)
            predictions = fitted.predict(test_part)
            self.assertEqual(test_part.num_rows, len(predictions))
def setUp(self):
    # Fixture: four rows holding one integer column and two string columns,
    # with a header for each column.
    column_names = ["number", "string", "another_string"]
    rows = [
        [0, "detective", "book"],
        [1, "pablo", "escobar"],
        [2, "african", "queen"],
        [3, "applause", "movie"],
    ]
    self.dataset = Dataset(rows, column_names)
class IndependentVariableSelectionTest(TestCase):
    """Tests for IndependentVariableSelection on a small 3-column dataset."""

    def setUp(self):
        # The same 3x3 matrix is exposed with and without headers, together
        # with a dependent variable addressing column 0 in each form.
        rows = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
        column_names = ["column_0", "column_1", "column_2"]
        self.header_dataset = Dataset(rows, column_names)
        self.nonheader_dataset = Dataset(rows)
        self.headered_dependent_variable = DatasetVariable("column_0")
        self.nonheadered_dependent_variable = DatasetVariable(0)

    def _make_selection(self, dataset, dependent_variable):
        # Build a selection over every variable of `dataset` except the
        # dependent one, using default settings.
        settings = AbstractSettings()
        candidates = dataset.get_independent_variables(dependent_variable)
        return IndependentVariableSelection(settings, dependent_variable,
                                            candidates)

    def test_initializing_small_dataset_with_header(self):
        # Asking for 2 variables is expected to yield both remaining headers.
        selection = self._make_selection(self.header_dataset,
                                         self.headered_dependent_variable)
        chosen = selection.initialize_independent_variables(2)
        chosen_keys = [item.variable for item in chosen]
        self.assertIn("column_1", chosen_keys)
        self.assertIn("column_2", chosen_keys)

    def test_initializaing_small_dataset_without_header(self):
        # Asking for 2 variables is expected to yield both remaining indices.
        selection = self._make_selection(
            self.nonheader_dataset, self.nonheadered_dependent_variable)
        chosen = selection.initialize_independent_variables(2)
        chosen_keys = [item.variable for item in chosen]
        self.assertIn(1, chosen_keys)
        self.assertIn(2, chosen_keys)

    def test_getting_probability_of_variables(self):
        # Both candidates are expected to start at probability 0.5, whether
        # looked up through a DatasetVariable or through the raw header.
        selection = self._make_selection(self.header_dataset,
                                         self.headered_dependent_variable)
        for name in ("column_1", "column_2"):
            self.assertAlmostEqual(
                0.5, selection.get_probability(DatasetVariable(name)))
        for name in ("column_1", "column_2"):
            self.assertAlmostEqual(0.5, selection.get_probability(name))

    def test_increasing_probability_of_variables(self):
        # Raising column_1 is expected to push it above 0.5 and to push
        # column_2 below 0.5.
        selection = self._make_selection(self.header_dataset,
                                         self.headered_dependent_variable)
        selection.increase_probability(DatasetVariable("column_1"))
        raised = selection.get_probability(DatasetVariable("column_1"))
        self.assertLess(0.5, raised)
        lowered = selection.get_probability(DatasetVariable("column_2"))
        self.assertGreater(0.5, lowered)
class DatasetTest(TestCase):
    """Tests for Dataset: data types, variables, partitions and filtering."""

    def setUp(self):
        # Fixture: four rows holding one integer column and two string
        # columns, with a header for each column.
        column_names = ["number", "string", "another_string"]
        rows = [
            [0, "detective", "book"],
            [1, "pablo", "escobar"],
            [2, "african", "queen"],
            [3, "applause", "movie"],
        ]
        self.dataset = Dataset(rows, column_names)

    def test_data_types_and_categories_when_initializing_dataset(self):
        # Column 0 is expected to be classified as integer and the other two
        # as string, each string column carrying its four distinct values as
        # categories.
        data_types = self.dataset.data_types
        self.assertEqual(3, len(data_types))
        self.assertEqual("integer", data_types[0].data_type)
        self.assertEqual("string", data_types[1].data_type)
        self.assertEqual("string", data_types[2].data_type)

        self.assertEqual(4, len(data_types[1].categories))
        for category in ("detective", "pablo", "african", "applause"):
            self.assertTrue(data_types[1].in_categories(category))

        self.assertEqual(4, len(data_types[2].categories))
        for category in ("book", "escobar", "queen", "movie"):
            self.assertTrue(data_types[2].in_categories(category))

    def test_getting_independent_variables(self):
        # Whether the dependent variable is given by index or by header, the
        # two other columns are expected back, identified by header.
        dependent_variables = (DatasetVariable(0), DatasetVariable("number"))
        for dependent_variable in dependent_variables:
            independent_variables = self.dataset.get_independent_variables(
                dependent_variable)
            self.assertEqual(2, len(independent_variables))
            for variable in independent_variables:
                self.assertIn(variable.variable,
                              ["string", "another_string"])

    def test_crossfold_partitions(self):
        expected_headers = ["number", "string", "another_string"]

        # 4 folds over 4 rows: each test part holds a single row, and no
        # first-column value may appear in two different test parts.
        seen_test_values = []
        for training_part, test_part in self.dataset.crossfold_partitions(4):
            self.assertEqual(3, training_part.num_rows)
            self.assertEqual(1, test_part.num_rows)
            self.assertNotIn(test_part.get(0, 0), seen_test_values)
            self.assertListEqual(expected_headers, training_part.headers)
            self.assertListEqual(expected_headers, test_part.headers)
            seen_test_values.append(test_part.get(0, 0))

        # 2 folds over 4 rows: both parts hold two rows.
        for training_part, test_part in self.dataset.crossfold_partitions(2):
            self.assertEqual(2, training_part.num_rows)
            self.assertEqual(2, test_part.num_rows)
            self.assertListEqual(expected_headers, training_part.headers)
            self.assertListEqual(expected_headers, test_part.headers)

    def test_get_filtered_matrix(self):
        # Columns may be selected by index and by header in the same call.
        variables = [DatasetVariable(0), DatasetVariable("string")]
        filtered_matrix = self.dataset.get_filtered_matrix(variables)
        self.assertEqual(4, len(filtered_matrix))
        # Iterate the rows directly; no index (or Python-2-only xrange) is
        # needed to check each row's width.
        for row in filtered_matrix:
            self.assertEqual(2, len(row))
        self.assertListEqual([0, "detective"], filtered_matrix[0])
        self.assertListEqual([1, "pablo"], filtered_matrix[1])
        self.assertListEqual([2, "african"], filtered_matrix[2])
        self.assertListEqual([3, "applause"], filtered_matrix[3])

    def test_getting_filtered_matrix_with_no_variables(self):
        # An empty variable list is expected to raise ValueError.
        with self.assertRaises(ValueError):
            # Only the raised error matters, so the result is not bound.
            self.dataset.get_filtered_matrix([])