Пример #1
0
 def test_dataset_merging_with_different_types_of_headers(self):
     """Merging a headered dataset with a headerless one must raise ValueError."""
     headered = Dataset([[1, 2], [3, 4]], ["h0", "h1"])
     headerless = Dataset([[5, 6], [7, 8]], None)
     # Only the exception matters, so the return value is not captured.
     with self.assertRaises(ValueError):
         self.transformer.merge_datasets([headered, headerless])
Пример #2
0
 def test_dataset_merging_with_different_number_of_rows(self):
     """Merging a three-row dataset with a two-row dataset must raise ValueError."""
     three_rows = Dataset([[1, 2], [3, 4], [10, 11]], ["h0", "h1"])
     two_rows = Dataset([[5, 6], [7, 8]], ["h2", "h3"])
     # Only the exception matters, so the return value is not captured.
     with self.assertRaises(ValueError):
         self.transformer.merge_datasets([three_rows, two_rows])
Пример #3
0
    def setUp(self):
        """Build a headered and a headerless dataset from the same 3x3 matrix."""
        rows = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
        self.header_dataset = Dataset(
            rows, ["column_0", "column_1", "column_2"])
        self.nonheader_dataset = Dataset(rows)

        # One dependent variable is keyed by header name, the other by an
        # integer (presumably the column index -- confirm).
        self.headered_dependent_variable = DatasetVariable("column_0")
        self.nonheadered_dependent_variable = DatasetVariable(0)
Пример #4
0
    def test_dataset_merging_without_headers(self):
        """Headerless datasets merge column-wise and the result has no headers."""
        left = Dataset([[1, 2], [3, 4]])
        right = Dataset([[5, 6], [7, 8]])
        merged = self.transformer.merge_datasets([left, right])

        self.assertEqual(2, merged.num_rows)
        self.assertEqual(4, merged.num_cols)
        # Each merged row is the left row followed by the right row.
        expected_rows = [[1, 2, 5, 6], [3, 4, 7, 8]]
        for row_index, expected_row in enumerate(expected_rows):
            self.assertListEqual(expected_row, merged.data_matrix[row_index])
        self.assertEqual(None, merged.headers)
Пример #5
0
    def test_dataset_merging_with_headers(self):
        """Headered datasets merge column-wise and their headers are concatenated."""
        left = Dataset([[1, 2], [3, 4]], ["h0", "h1"])
        right = Dataset([[5, 6], [7, 8]], ["h2", "h3"])
        merged = self.transformer.merge_datasets([left, right])

        self.assertEqual(2, merged.num_rows)
        self.assertEqual(4, merged.num_cols)
        # Each merged row is the left row followed by the right row.
        expected_rows = [[1, 2, 5, 6], [3, 4, 7, 8]]
        for row_index, expected_row in enumerate(expected_rows):
            self.assertListEqual(expected_row, merged.data_matrix[row_index])
        self.assertListEqual(["h0", "h1", "h2", "h3"], merged.headers)
Пример #6
0
    def setUp(self):
        """Create a five-row dataset and a SklearnModel configured over it."""
        rows = [
            [1, 2, 3, "a"],
            [2, 3, 2, "b"],
            [3, 2, 1, "a"],
            [5, 5, 1, "c"],
            [2, 2, 2, "a"],
        ]
        self.dataset = Dataset(rows)
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})
        # Variable 0 is the dependent one; variables 1-3 are independent.
        self.dependent_variable = DatasetVariable(0)
        self.independent_variables = [
            DatasetVariable(column) for column in (1, 2, 3)
        ]

        self.sklearn_model = SklearnModel(
            self.settings, self.parameter_set, self.dependent_variable,
            self.independent_variables)
Пример #7
0
    def test_header_transformation_for_dataset_without_headers(self):
        """A dataset without headers yields None as the transformed header."""
        headerless = Dataset([[1, 2, 3], [4, 5, 6]])
        first_variable = DatasetVariable(0)

        transformed = self.dataset_transformation.get_transformed_header(
            headerless, first_variable)
        self.assertEqual(None, transformed)
Пример #8
0
    def test_identity_transform(self):
        """Transforming a headerless dataset keeps its rows and leaves headers unset."""
        source = Dataset([[1, 2, 3], [4, 5, 6]])

        result = self.dataset_transformation.transform(source)
        rows = result.data_matrix
        result_headers = result.headers

        self.assertEqual(2, len(rows))
        self.assertListEqual([1, 2, 3], rows[0])
        self.assertListEqual([4, 5, 6], rows[1])
        self.assertEqual(None, result_headers)
    def setUp(self):
        """Initialize a model population (population_size set to 5) and keep it."""
        config = AbstractSettings(
            {"optimization_algorithm.population_size": 5})
        generator = PredictiveModelGenerator(config)
        generator.add_model_type(FakePredictiveModel)

        rows = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [0, 1, 3, 4]]
        algorithm = OptimizationAlgorithm(
            Dataset(rows), DatasetVariable(0), config, generator)
        algorithm.initialize_population()
        # Tests work on the population produced by the initialization above.
        self.model_population = algorithm.model_population
Пример #10
0
    def test_training_decision_tree_on_simple_dataset(self):
        """Train on four rows and check that four predictions come back.

        NOTE(review): the test name says "decision tree" but the model under
        test is RandomForestRegression -- confirm which one was intended.
        """
        model = RandomForestRegression(
            AbstractSettings({}), self.parameter_set,
            DatasetVariable(0), [DatasetVariable(1)])

        # The same dataset is used for training and for prediction.
        samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
        fitted = model.train(samples)
        predictions = fitted.predict(samples)

        self.assertEqual(4, len(predictions))
Пример #11
0
    def test_using_transformer_on_single_transformation(self):
        """With one identity transformation every row holds its values twice."""
        source = Dataset([[1, 2, "hi"], [2, 3, "bye"]])
        transformer = DatasetTransformer(self.settings,
                                         [IdentityTransformation])

        result = transformer.transform(source)
        rows = result.data_matrix

        self.assertEqual(2, result.num_rows)
        self.assertEqual(6, result.num_cols)
        # Expected layout: the three source values followed by the same three again.
        self.assertListEqual([1, 2, "hi", 1, 2, "hi"], rows[0])
        self.assertListEqual([2, 3, "bye", 2, 3, "bye"], rows[1])
        self.assertEqual(None, result.headers)
Пример #12
0
    def setUp(self):
        """Prepare the shared dataset, settings, variables and SklearnModel."""
        self.dataset = Dataset([[1, 2, 3, "a"], [2, 3, 2, "b"],
                                [3, 2, 1, "a"], [5, 5, 1, "c"],
                                [2, 2, 2, "a"]])
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})

        # Variable 0 is the dependent one; variables 1-3 are independent.
        self.dependent_variable = DatasetVariable(0)
        independent_keys = [1, 2, 3]
        self.independent_variables = [
            DatasetVariable(key) for key in independent_keys
        ]

        self.sklearn_model = SklearnModel(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables)
Пример #13
0
    def test_header_transformation_for_dataset_with_headers(self):
        """Transformed headers carry the prefix for integer and name lookups alike."""
        headered = Dataset([[1, 2, 3], [4, 5, 6]], ["h0", "h1", "h2"])

        # (variable key, expected transformed header)
        cases = [
            (0, "identitytransformation_h0"),
            ("h2", "identitytransformation_h2"),
        ]
        for key, expected in cases:
            actual = self.dataset_transformation.get_transformed_header(
                headered, DatasetVariable(key))
            self.assertEqual(expected, actual)
Пример #14
0
    def test_identity_transform_with_headers(self):
        """Transforming a headered dataset keeps the rows and prefixes every header."""
        source = Dataset([[1, 2, 3], [4, 5, 6]], ["h0", "h1", "h2"])

        result = self.dataset_transformation.transform(source)
        rows = result.data_matrix
        result_headers = result.headers

        self.assertEqual(2, len(rows))
        self.assertListEqual([1, 2, 3], rows[0])
        self.assertListEqual([4, 5, 6], rows[1])

        expected_headers = [
            "identitytransformation_h0",
            "identitytransformation_h1",
            "identitytransformation_h2",
        ]
        self.assertEqual(3, len(result_headers))
        for position, expected in enumerate(expected_headers):
            self.assertEqual(expected, result_headers[position])
Пример #15
0
    def test_training_ridge_on_simple_dataset(self):
        """Train on the points (1,1)..(4,4) and compare the four predictions.

        NOTE(review): the test name says "ridge" but the model under test is
        LassoRegression -- confirm which one was intended.
        """
        model = LassoRegression({}, self.parameter_set,
                                DatasetVariable(0), [DatasetVariable(1)])

        # The same dataset is used for training and for prediction.
        samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
        fitted = model.train(samples)
        predictions = fitted.predict(samples)

        expected_predictions = [1.6, 2.2, 2.8, 3.4]
        for position, expected in enumerate(expected_predictions):
            self.assertAlmostEqual(expected, predictions[position])
Пример #16
0
    def transform(self, dataset, variables=None):
        """Apply this transformation to the selected columns of a dataset.

        Args:
            dataset: source dataset; must provide ``num_cols``,
                ``get_filtered_matrix()`` and ``get_filtered_data_types()``.
            variables: DatasetVariable instances selecting the columns to
                consider. When None, one variable per column index
                (0 .. num_cols - 1) is used.

        Returns:
            A new Dataset built from the transformed columns, or None when no
            selected column has a data type listed by ``valid_data_types()``.
        """
        if variables is None:
            variables = [DatasetVariable(i) for i in range(dataset.num_cols)]

        filtered_matrix = dataset.get_filtered_matrix(variables)
        filtered_data_types = dataset.get_filtered_data_types(variables)
        # The column count is read from the first row, so an empty filtered
        # matrix raises IndexError here.
        num_cols = len(filtered_matrix[0])
        transformed_columns = []
        for j in range(num_cols):
            # Columns whose data type this transformation does not accept
            # are skipped.
            if filtered_data_types[j].data_type in self.valid_data_types():
                self.transform_and_append_column(j, filtered_matrix,
                                                 transformed_columns)

        if not transformed_columns:
            return None

        # presumably rotate_matrix turns the column list back into rows --
        # confirm against its definition.
        data_matrix = self.rotate_matrix(transformed_columns)
        # NOTE(review): headers are requested for all variables, including
        # those whose columns were skipped above -- confirm the helper
        # filters them the same way.
        headers = self.get_transformed_headers(dataset, variables)
        return Dataset(data_matrix, headers)
Пример #17
0
    def test_using_transformer_on_data_matrix_with_headers(self):
        """Original headers are followed by their identity-transformed versions."""
        source = Dataset([[1, 2, "hi"], [2, 3, "bye"]], ["h0", "h1", "h2"])
        transformer = DatasetTransformer(self.settings,
                                         [IdentityTransformation])

        result = transformer.transform(source)
        rows = result.data_matrix

        self.assertEqual(2, result.num_rows)
        self.assertEqual(6, result.num_cols)
        # Expected layout: the three source values followed by the same three again.
        self.assertListEqual([1, 2, "hi", 1, 2, "hi"], rows[0])
        self.assertListEqual([2, 3, "bye", 2, 3, "bye"], rows[1])

        expected_headers = [
            "h0",
            "h1",
            "h2",
            "identitytransformation_h0",
            "identitytransformation_h1",
            "identitytransformation_h2",
        ]
        self.assertEqual(6, len(result.headers))
        self.assertListEqual(expected_headers, result.headers)
Пример #18
0
class SklearnModelTest(TestCase):
    """Tests for SklearnModel data extraction and OLS training/prediction."""

    def setUp(self):
        """Create the shared dataset, settings, variables and model."""
        rows = [
            [1, 2, 3, "a"],
            [2, 3, 2, "b"],
            [3, 2, 1, "a"],
            [5, 5, 1, "c"],
            [2, 2, 2, "a"],
        ]
        self.dataset = Dataset(rows)
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})
        # Variable 0 is the dependent one; variables 1-3 are independent.
        self.dependent_variable = DatasetVariable(0)
        self.independent_variables = [
            DatasetVariable(column) for column in (1, 2, 3)
        ]

        self.sklearn_model = SklearnModel(
            self.settings, self.parameter_set, self.dependent_variable,
            self.independent_variables)

    def test_categorical_independent_variables(self):
        """Independent data has five entries, the first holding five values."""
        encoded = self.sklearn_model.get_independent_variable_data(
            self.dataset)

        self.assertEqual(5, len(encoded))
        # NOTE(review): 5 presumably is two numeric columns plus three
        # encoded categories ("a", "b", "c") -- confirm against SklearnModel.
        first_entry_width = len(encoded[0])
        self.assertEqual(5, first_entry_width)

    def test_dependent_variable_data(self):
        """Dependent data equals the first value of every row, in order."""
        target_values = self.sklearn_model.get_dependent_variable_data(
            self.dataset)

        self.assertEqual(5, len(target_values))
        self.assertListEqual([1, 2, 3, 5, 2], target_values)

    def test_training_and_predicting_using_ols_regression(self):
        """Every two-fold partition yields one prediction per test row."""
        regression = OLSLinearRegression(
            self.settings, self.parameter_set, self.dependent_variable,
            self.independent_variables)

        partitions = self.dataset.crossfold_partitions(2)
        for train_part, test_part in partitions:
            fitted = regression.train(train_part)
            predictions = fitted.predict(test_part)

            self.assertEqual(test_part.num_rows, len(predictions))
    def test_training_ols_on_simple_dataset(self):
        """OLS trained on the points (1,1)..(4,4) predicts 1, 2, 3 and 4."""
        # Plain dicts are passed for both settings and parameter set here.
        model = OLSLinearRegression({}, {}, DatasetVariable(0),
                                    [DatasetVariable(1)])

        # The same dataset is used for training and for prediction.
        samples = Dataset([[1, 1], [2, 2], [3, 3], [4, 4]])
        fitted = model.train(samples)
        predictions = fitted.predict(samples)

        for position, expected in enumerate([1, 2, 3, 4]):
            self.assertEqual(expected, predictions[position])
Пример #20
0
    def read(self, maximum_size=None, delimiter=",", quoting=csv.QUOTE_NONE):
        """Read the CSV file at ``self.dataset_filename`` into a Dataset.

        Args:
            maximum_size: size limit forwarded to the line-reading helper.
                When None, the "dataset.maximum_dataset_size" setting is used.
            delimiter: field delimiter passed to ``csv.reader``.
            quoting: quoting mode passed to ``csv.reader``.

        Returns:
            A Dataset holding the cleaned data matrix, the detected headers
            and the data types classified from the matrix before cleaning.
        """
        if maximum_size is None:
            maximum_size = self.settings.get("dataset.maximum_dataset_size")

        # NOTE(review): binary mode is what the Python 2 csv module expects;
        # Python 3 needs text mode with newline="" -- confirm the target
        # interpreter before changing it.
        with open(self.dataset_filename, 'rb') as f:
            reader = csv.reader(f, delimiter=delimiter, quoting=quoting)
            # Both reading strategies consume the same csv reader.
            if self.settings.get("dataset.randomize_file_reader"):
                data_matrix = self.randomized_read_lines(reader, maximum_size)
            else:
                data_matrix = self.greedy_read_lines(reader, maximum_size)

        data_matrix, headers = self.detect_headers(data_matrix)
        # Data types are classified on the matrix as read, before cleaning.
        data_types = DataTypeClassification.classify_data_matrix(data_matrix)
        data_matrix = DatasetCleaner(self.settings, data_matrix,
                                     headers).clean()
        self.logger.info("Read dataset from file: '%s'", self.dataset_filename)
        self.logger.info("Headers: %s", headers)
        self.logger.info("Dataset Size: %s", len(data_matrix))
        return Dataset(data_matrix, headers=headers, data_types=data_types)
Пример #21
0
    def setUp(self):
        """Build an OptimizationAlgorithm over a 4x4 dataset with a fake model type."""
        rows = [[1, 2, 3, 4], [2, 3, 4, 5], [3, 4, 5, 6], [0, 1, 3, 4]]
        dataset = Dataset(rows)
        dependent_variable = DatasetVariable(0)

        option_values = {
            "differential_evolution.crossover_probability": 1.0,
            "differential_evolution.differential_weight": 1.0,
            "optimization_algorithm.population_size": 5,
            "independent_variable_selection.initial_independent_variables_percentage": 1.0,
        }
        settings = AbstractSettings(option_values)

        # FakePredictiveModel is the only model type registered.
        generator = PredictiveModelGenerator(settings)
        generator.add_model_type(FakePredictiveModel)

        self.optimization_algorithm = OptimizationAlgorithm(
            dataset, dependent_variable, settings, generator)
Пример #22
0
    def merge_datasets(self, datasets):
        """Merge several datasets into a single Dataset.

        The first dataset supplies the initial rows, headers and row count;
        every further dataset is folded in through ``_merge_headers`` and
        ``_merge_data_matrix``.

        Args:
            datasets: non-empty sequence of datasets to merge.

        Returns:
            A new Dataset holding the merged data matrix and headers.

        Raises:
            ValueError: if ``datasets`` is empty.
        """
        if len(datasets) < 1:
            raise ValueError("Must specify at least one dataset to merge.")

        first_dataset = datasets[0]
        num_rows = first_dataset.num_rows
        # Rows and headers are copied with list() rather than reused.
        data_matrix = [
            list(first_dataset.get_row(i)) for i in range(num_rows)
        ]

        if first_dataset.headers is None:
            headers = None
        else:
            headers = list(first_dataset.headers)

        for dataset in datasets[1:]:
            headers = self._merge_headers(headers, dataset)
            data_matrix = self._merge_data_matrix(data_matrix, dataset,
                                                  num_rows)
        return Dataset(data_matrix, headers)
Пример #23
0
class SklearnModelTest(TestCase):
    """Tests for SklearnModel data extraction and OLS training/prediction."""

    def setUp(self):
        """Prepare the shared dataset, settings, variables and SklearnModel."""
        self.dataset = Dataset([[1, 2, 3, "a"], [2, 3, 2, "b"],
                                [3, 2, 1, "a"], [5, 5, 1, "c"],
                                [2, 2, 2, "a"]])
        self.settings = AbstractSettings({})
        self.parameter_set = ParameterSet({})

        # Variable 0 is the dependent one; variables 1-3 are independent.
        self.dependent_variable = DatasetVariable(0)
        independent_keys = [1, 2, 3]
        self.independent_variables = [
            DatasetVariable(key) for key in independent_keys
        ]

        self.sklearn_model = SklearnModel(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables)

    def test_categorical_independent_variables(self):
        """Independent data has five entries, the first holding five values."""
        extracted = self.sklearn_model.get_independent_variable_data(
            self.dataset)

        self.assertEqual(5, len(extracted))
        # NOTE(review): 5 presumably is two numeric columns plus three
        # encoded categories ("a", "b", "c") -- confirm against SklearnModel.
        width_of_first_entry = len(extracted[0])
        self.assertEqual(5, width_of_first_entry)

    def test_dependent_variable_data(self):
        """Dependent data equals the first value of every row, in order."""
        extracted = self.sklearn_model.get_dependent_variable_data(
            self.dataset)

        self.assertEqual(5, len(extracted))
        self.assertListEqual([1, 2, 3, 5, 2], extracted)

    def test_training_and_predicting_using_ols_regression(self):
        """Every two-fold partition yields one prediction per test row."""
        regression = OLSLinearRegression(
            self.settings,
            self.parameter_set,
            self.dependent_variable,
            self.independent_variables)

        folds = self.dataset.crossfold_partitions(2)
        for training_fold, test_fold in folds:
            fitted = regression.train(training_fold)
            predictions = fitted.predict(test_fold)

            self.assertEqual(test_fold.num_rows, len(predictions))
Пример #24
0
 def setUp(self):
     """Create a four-row dataset with headers "number", "string", "another_string"."""
     rows = [
         [0, "detective", "book"],
         [1, "pablo", "escobar"],
         [2, "african", "queen"],
         [3, "applause", "movie"],
     ]
     column_names = ["number", "string", "another_string"]
     self.dataset = Dataset(rows, column_names)
Пример #25
0
class IndependentVariableSelectionTest(TestCase):
    """Tests for IndependentVariableSelection on headered and headerless data."""

    def setUp(self):
        """Build a headered and a headerless dataset from the same 3x3 matrix."""
        rows = [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
        self.header_dataset = Dataset(
            rows, ["column_0", "column_1", "column_2"])
        self.nonheader_dataset = Dataset(rows)

        self.headered_dependent_variable = DatasetVariable("column_0")
        self.nonheadered_dependent_variable = DatasetVariable(0)

    def _build_selection(self, dataset, dependent_variable):
        """Create a selection over the dataset's independent variables."""
        default_settings = AbstractSettings()
        candidates = dataset.get_independent_variables(dependent_variable)
        return IndependentVariableSelection(
            default_settings, dependent_variable, candidates)

    def test_initializing_small_dataset_with_header(self):
        """Initializing two variables on headered data yields both other columns."""
        selection = self._build_selection(self.header_dataset,
                                          self.headered_dependent_variable)

        chosen = selection.initialize_independent_variables(2)
        chosen_keys = [item.variable for item in chosen]
        for name in ("column_1", "column_2"):
            self.assertIn(name, chosen_keys)

    def test_initializaing_small_dataset_without_header(self):
        """Initializing two variables on headerless data yields keys 1 and 2."""
        selection = self._build_selection(self.nonheader_dataset,
                                          self.nonheadered_dependent_variable)

        chosen = selection.initialize_independent_variables(2)
        chosen_keys = [item.variable for item in chosen]
        for index in (1, 2):
            self.assertIn(index, chosen_keys)

    def test_getting_probability_of_variables(self):
        """Both independent variables start at 0.5, by variable and by raw name."""
        selection = self._build_selection(self.header_dataset,
                                          self.headered_dependent_variable)
        names = ("column_1", "column_2")

        # Lookup through DatasetVariable wrappers first ...
        for name in names:
            self.assertAlmostEqual(
                0.5, selection.get_probability(DatasetVariable(name)))

        # ... then through the bare header strings.
        for name in names:
            self.assertAlmostEqual(0.5, selection.get_probability(name))

    def test_increasing_probability_of_variables(self):
        """Raising column_1 puts it above 0.5 and column_2 below 0.5."""
        selection = self._build_selection(self.header_dataset,
                                          self.headered_dependent_variable)
        selection.increase_probability(DatasetVariable("column_1"))

        raised = selection.get_probability(DatasetVariable("column_1"))
        self.assertLess(0.5, raised)
        lowered = selection.get_probability(DatasetVariable("column_2"))
        self.assertGreater(0.5, lowered)
Пример #26
0
class DatasetTest(TestCase):
    """Tests for Dataset data types, variable selection, partitioning and filtering."""

    def setUp(self):
        """Create a four-row dataset with one integer and two string columns."""
        data_matrix = [[0, "detective", "book"], [1, "pablo", "escobar"],
                       [2, "african", "queen"], [3, "applause", "movie"]]
        headers = ["number", "string", "another_string"]
        self.dataset = Dataset(data_matrix, headers)

    def test_data_types_and_categories_when_initializing_dataset(self):
        """Data types and string categories are available right after construction."""
        self.assertEqual(3, len(self.dataset.data_types))
        expected_kinds = ["integer", "string", "string"]
        for column, kind in enumerate(expected_kinds):
            self.assertEqual(kind, self.dataset.data_types[column].data_type)

        # Every string column has four distinct values, each a category.
        expected_categories = [
            (1, ["detective", "pablo", "african", "applause"]),
            (2, ["book", "escobar", "queen", "movie"]),
        ]
        for column, values in expected_categories:
            column_type = self.dataset.data_types[column]
            self.assertEqual(4, len(column_type.categories))
            for value in values:
                self.assertTrue(column_type.in_categories(value))

    def test_getting_independent_variables(self):
        """The dependent column is excluded whether given as 0 or as "number"."""
        expected_names = ["string", "another_string"]
        dependent_variables = (DatasetVariable(0), DatasetVariable("number"))

        for dependent_variable in dependent_variables:
            independent_variables = self.dataset.get_independent_variables(
                dependent_variable)
            self.assertEqual(2, len(independent_variables))
            for variable in independent_variables:
                self.assertIn(variable.variable, expected_names)

    def test_crossfold_partitions(self):
        """Partitions have the expected sizes and keep the dataset headers."""
        expected_headers = ["number", "string", "another_string"]

        past_test_datasets = []
        for training_dataset, test_dataset in self.dataset.crossfold_partitions(
                4):
            self.assertEqual(3, training_dataset.num_rows)
            self.assertEqual(1, test_dataset.num_rows)
            # The first cell of each single-row test fold must not repeat.
            self.assertNotIn(test_dataset.get(0, 0), past_test_datasets)
            self.assertListEqual(expected_headers, training_dataset.headers)
            self.assertListEqual(expected_headers, test_dataset.headers)

            past_test_datasets.append(test_dataset.get(0, 0))

        for training_dataset, test_dataset in self.dataset.crossfold_partitions(
                2):
            self.assertEqual(2, training_dataset.num_rows)
            self.assertEqual(2, test_dataset.num_rows)
            self.assertListEqual(expected_headers, training_dataset.headers)
            self.assertListEqual(expected_headers, test_dataset.headers)

    def test_get_filtered_matrix(self):
        """Filtering by variables 0 and "string" keeps two values per row."""
        variables = [DatasetVariable(0), DatasetVariable("string")]

        filtered_matrix = self.dataset.get_filtered_matrix(variables)
        self.assertEqual(4, len(filtered_matrix))
        for row in filtered_matrix:
            self.assertEqual(2, len(row))

        expected_rows = [[0, "detective"], [1, "pablo"], [2, "african"],
                         [3, "applause"]]
        for index, expected_row in enumerate(expected_rows):
            self.assertListEqual(expected_row, filtered_matrix[index])

    def test_getting_filtered_matrix_with_no_variables(self):
        """Filtering with an empty variable list must raise ValueError."""
        variables = []

        # Only the exception matters, so the return value is not captured.
        with self.assertRaises(ValueError):
            self.dataset.get_filtered_matrix(variables)