示例#1
0
 def test_split_labelled(self):
     """Splitting a labelled dataset keeps the matching labels on each half."""
     labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                        labels=["b", "b", "b", "a"])
     left, right = labelled.split(0.5)
     for half, expected in ((left, {0: "b", 1: "b"}),
                            (right, {2: "b", 3: "a"})):
         self.assertTrue(half.is_labelled())
         assert_that(half.get_labels(), equals_series(expected))
示例#2
0
 def test_copy(self):
     """A copy is fully independent: mutating the copy's data must leave
     the original's data and labels untouched."""
     dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
     dataset2 = dataset1.copy()
     dataset2.set_column(1, pd.Series([4, 5]))
     
     # The copy reflects the mutation and keeps its labels.
     assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
     assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
     # The original is unchanged.  (Fix: the final assertion previously
     # re-checked dataset2's labels instead of dataset1's.)
     assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
     assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))
示例#3
0
 def test_get_feature_value_counts(self):
     """Value counts are computed per feature column."""
     people = pd.DataFrame([["Jim", 19, 180], ["John", 18, 177],
                            ["Jack", 19, 185]],
                           columns=["name", "age", "height"])
     dataset = DataSet(people)
     
     # Duplicate values are aggregated ...
     assert_that(dataset.get_feature_value_counts("age"),
                 equals_series({19: 2, 18: 1}))
     # ... and unique values each count once.
     assert_that(dataset.get_feature_value_counts("name"),
                 equals_series({"Jim": 1, "John": 1, "Jack": 1}))
示例#4
0
 def test_compute_iteration(self):
     """A single k-means iteration recomputes centroids and assignments."""
     points = DataSet([[1, 5], [2, 1], [6, 5]])
     seeds = [pd.Series([4, 5]), pd.Series([6, 2])]
     
     centroids, assignments = clustering._compute_iteration(points, seeds,
                                                            euclidean)
     expected = [{0: 3.5, 1: 5}, {0: 2, 1: 1}]
     
     self.assertEqual(len(centroids), len(expected))
     for got, want in zip(centroids, expected):
         assert_that(got, equals_series(want))
     
     assert_that(assignments, equals_series({0: 0, 1: 1, 2: 0}))
示例#5
0
    def test_gradient_descent_3_parameters(self):
        """
        Test based on Assignment 1 of the free online
        Stanford Machine Learning online course.
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                              has_ids=False, has_header=False,
                              has_labels=True, delimiter=",")
        dataset.normalize_features()
        # Constant bias column so the intercept is learned as a weight.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        start = pd.Series({0: 0, 1: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset, start, 1.0, iterations=50)

        expected = {0: 110631.050279, 1: -6649.474271, "bias": 340412.659574}
        assert_that(theta, equals_series(expected, places=6))
示例#6
0
 def test_get_label_value_counts(self):
     """Label counts aggregate duplicates; index is ordered by frequency."""
     labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
                        labels=["a", "b", "b", "c", "a", "b"])
     counts = labelled.get_label_value_counts()
     assert_that(counts, equals_series({"a": 2, "b": 3, "c": 1}))
     assert_that(counts.index, contains("b", "a", "c"))
示例#7
0
    def test_gradient_descent_2_parameters(self):
        """
        Test based on Assignment 1 of the free online
        Stanford Machine Learning online course.

        For population = 35,000, we predict a profit of 4519.767868
        For population = 70,000, we predict a profit of 45342.450129

        Final cost: 4.483388
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                              has_ids=False, has_header=False,
                              has_labels=True, delimiter=",")
        # Constant bias column so the intercept is learned as a weight.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        start = pd.Series({0: 0, "bias": 0})
        theta = optimize.gradient_descent(dataset, start, 0.01,
                                          iterations=100)

        expected = {0: 0.859582, "bias": -0.576556}
        assert_that(theta, equals_series(expected, places=6))
示例#8
0
    def test_gradient_descent_2_parameters(self):
        """
        Test based on Assignment 1 of the free online
        Stanford Machine Learning online course.

        For population = 35,000, we predict a profit of 4519.767868
        For population = 70,000, we predict a profit of 45342.450129

        Final cost: 4.483388
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                              has_ids=False, has_header=False,
                              has_labels=True, delimiter=",")
        # Bias column of ones so the intercept is fit like any other weight.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        theta = optimize.gradient_descent(dataset,
                                          pd.Series({0: 0, "bias": 0}),
                                          0.01,
                                          iterations=100)

        assert_that(theta, equals_series({0: 0.859582, "bias": -0.576556},
                                         places=6))
示例#9
0
 def test_filter_by_feature_value_with_labels(self):
     """Filtering rows by a feature value keeps those rows' labels."""
     columns = ["name", "hair colour"]
     people = pd.DataFrame([["Bill", "brown"], ["Bob", "black"],
                            ["Jim", "brown"]], columns=columns)
     dataset = DataSet(people, labels=["SENG", "SENG", "CENG"])
     brown_haired = dataset.value_filter("hair colour", "brown")
     assert_that(brown_haired.get_labels(),
                 equals_series({0: "SENG", 2: "CENG"}))
示例#10
0
 def test_classify_all(self):
     """Classifying a labelled two-sample set reports 50% accuracy when
     one of the two predictions matches its label."""
     training_set, sample_0 = self.load_car_data()
     sample_1 = {"color": "yellow", "type": "sports", "origin": "domestic"}
     to_classify = DataSet(pd.DataFrame([sample_0, sample_1]),
                           labels=["no", "yes"])
     results = NaiveBayes(training_set).classify_all(to_classify)
     assert_that(results.get_classifications(),
                 equals_series({0: "no", 1: "no"}))
     self.assertEqual(results.compute_accuracy(), 0.5)
示例#11
0
 def test_get_labelled_rows(self):
     """Selecting rows by index keeps their data and their original labels."""
     labelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                        labels=["a", "a", "b", "b"])
     picked = labelled.get_rows([1, 3])
     
     self.assertEqual(picked.num_samples(), 2)
     self.assertTrue(picked.is_labelled())
     # TODO incorporate labels equals_series into DataSet matcher?
     assert_that(picked, equals_dataset([[3, 4], [7, 8]]))
     assert_that(picked.get_labels(), equals_series({1: "a", 3: "b"}))
 def test_classify_all_weekends(self):
     """A decision tree trained on the weekends data classifies two
     unseen samples."""
     training = load(self.relative_to_base("/datasets/weekends.data"))
     classifier = DecisionTree(training)
     columns = ['weather', 'parents', 'money']
     samples = [pd.Series(["windy", "no", "rich"], index=columns),
                pd.Series(["sunny", "yes", "rich"], index=columns)]
     results = classifier.classify_all(DataSet(pd.DataFrame(samples)))
     assert_that(results.get_classifications(),
                 equals_series({0: "shopping", 1: "cinema"}))
示例#13
0
    def test_kmeans_k_3(self):
        """k-means with k=3 and preset centroids splits the points into
        the three separated groups (5, 4 and 4 points respectively)."""
        dataset = DataSet([[3, 13], [5, 13], [2, 11], [4, 11], [6, 11],
                           [8, 5], [5, 3], [6, 2], [9, 2], [16, 14],
                           [18, 13], [16, 11], [19, 10]])
        seeds = [pd.Series([4, 9]), pd.Series([10, 6]), pd.Series([17, 9])]

        clustered = clustering.kmeans(dataset, k=3, centroids=seeds)
        # Samples 0-4 -> cluster 0, 5-8 -> cluster 1, 9-12 -> cluster 2.
        expected = {i: 0 for i in range(5)}
        expected.update({i: 1 for i in range(5, 9)})
        expected.update({i: 2 for i in range(9, 13)})
        assert_that(clustered.get_cluster_assignments(),
                    equals_series(expected))
示例#14
0
 def test_filter_by_multiple_labels(self):
     """label_filter keeps exactly the samples whose label is listed."""
     columns = ["name", "hair colour"]
     people = pd.DataFrame([["Will", "black"], ["Rob", "blonde"],
                            ["Bill", "brown"], ["Bob", "black"],
                            ["Jim", "brown"]], columns=columns)
     dataset = DataSet(people, labels=["ELEC", "SENG", "ELEC", "CENG", "SENG"])
     kept = dataset.label_filter(["SENG", "CENG"])
     assert_that(kept, equals_dataset([["Rob", "blonde"],
                                       ["Bob", "black"],
                                       ["Jim", "brown"]]))
     assert_that(kept.get_labels(),
                 equals_series({1: "SENG", 3: "CENG", 4: "SENG"}))
示例#15
0
    def test_gradient_descent_3_parameters(self):
        """
        Test based on Assignment 1 of the free online
        Stanford Machine Learning online course.
        """
        dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                              has_ids=False, has_header=False,
                              has_labels=True, delimiter=",")
        dataset.normalize_features()
        # Bias column of ones so the intercept is fit like any other weight.
        dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))

        theta = optimize.gradient_descent(dataset,
                                          pd.Series({0: 0, 1: 0, "bias": 0}),
                                          1.0,
                                          iterations=50)

        assert_that(theta, equals_series({0: 110631.050279,
                                          1: -6649.474271,
                                          "bias": 340412.659574},
                                         places=6))
示例#16
0
 def test_load_labelled(self):
     """Loading a labelled CSV keeps labels keyed by the sample ids."""
     dataset = load(self.relative_to_base("datasets/3f_ids_header.csv"))
     self.assertTrue(dataset.is_labelled())
     assert_that(dataset.get_labels(),
                 equals_series({"V01": "c", "V02": "b",
                                "V03": "b", "V04": "a"}))
示例#17
0
 def test_get_label_value_counts_no_labels(self):
     """An unlabelled dataset yields an empty label-count series."""
     unlabelled = DataSet([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
     assert_that(unlabelled.get_label_value_counts(), equals_series({}))
示例#18
0
 def test_combine_labels(self):
     """combine_labels merges several label values into one new value."""
     animals = DataSet([[1, 2], [3, 4], [5, 6]],
                       labels=pd.Series(["cat", "crow", "pidgeon"]))
     animals.combine_labels(["crow", "pidgeon"], "bird")
     assert_that(animals.get_labels(),
                 equals_series({0: "cat", 1: "bird", 2: "bird"}))