def test_split_labelled(self):
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                      labels=["b", "b", "b", "a"])
    first, second = dataset.split(0.5)
    self.assertTrue(first.is_labelled())
    assert_that(first.get_labels(), equals_series({0: "b", 1: "b"}))
    self.assertTrue(second.is_labelled())
    assert_that(second.get_labels(), equals_series({2: "b", 3: "a"}))

def test_copy(self):
    dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
    dataset2 = dataset1.copy()
    dataset2.set_column(1, pd.Series([4, 5]))
    assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
    assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
    # The original dataset and its labels must be unaffected by the change.
    assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
    assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))

def test_get_feature_value_counts(self):
    df = pd.DataFrame([["Jim", 19, 180], ["John", 18, 177], ["Jack", 19, 185]],
                      columns=["name", "age", "height"])
    dataset = DataSet(df)
    age_value_counts = dataset.get_feature_value_counts("age")
    assert_that(age_value_counts, equals_series({19: 2, 18: 1}))
    name_value_counts = dataset.get_feature_value_counts("name")
    assert_that(name_value_counts, equals_series({"Jim": 1, "John": 1, "Jack": 1}))

def test_compute_iteration(self):
    dataset = DataSet([[1, 5], [2, 1], [6, 5]])
    centroids = [pd.Series([4, 5]), pd.Series([6, 2])]
    new_cent, clusters = clustering._compute_iteration(dataset, centroids, euclidean)
    expected_cent = [{0: 3.5, 1: 5}, {0: 2, 1: 1}]
    self.assertEqual(len(new_cent), len(expected_cent))
    for i, cent in enumerate(new_cent):
        assert_that(cent, equals_series(expected_cent[i]))
    assert_that(clusters, equals_series({0: 0, 1: 1, 2: 0}))

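# The expected centroids above can be checked by hand: one k-means iteration
# assigns each point to its nearest centroid, then averages each cluster.
# The helper below is only an illustrative sketch of that step (not the
# clustering._compute_iteration implementation); `distance` is assumed to be
# any two-argument distance function, such as the euclidean used in the test.
def naive_compute_iteration(points, centroids, distance):
    assignments = [min(range(len(centroids)),
                       key=lambda c: distance(point, centroids[c]))
                   for point in points]
    new_centroids = [
        pd.DataFrame([p for p, a in zip(points, assignments) if a == c]).mean()
        for c in range(len(centroids))]
    return new_centroids, pd.Series(assignments)
# For points [[1, 5], [2, 1], [6, 5]] and centroids [4, 5] and [6, 2], points 0
# and 2 join cluster 0 (mean [3.5, 5]) and point 1 joins cluster 1 (mean [2, 1]),
# which matches expected_cent and the cluster assignments asserted above.
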
def test_gradient_descent_3_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.
    """
    dataset = loader.load(self.relative_to_base("datasets/ex1data2.txt"),
                          has_ids=False, has_header=False, has_labels=True,
                          delimiter=",")
    dataset.normalize_features()
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 1.0
    iterations = 50
    initial_theta = pd.Series({0: 0, 1: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=iterations)
    assert_that(theta, equals_series({0: 110631.050279, 1: -6649.474271,
                                      "bias": 340412.659574}, places=6))

def test_get_label_value_counts(self):
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
                      labels=["a", "b", "b", "c", "a", "b"])
    expected = {"a": 2, "b": 3, "c": 1}
    value_counts = dataset.get_label_value_counts()
    assert_that(value_counts, equals_series(expected))
    assert_that(value_counts.index, contains("b", "a", "c"))

def test_gradient_descent_2_parameters(self):
    """
    Test based on Assignment 1 of the free online Stanford Machine
    Learning course.

    For population = 35,000, we predict a profit of 4519.767868.
    For population = 70,000, we predict a profit of 45342.450129.
    Final cost: 4.483388
    """
    dataset = loader.load(self.relative_to_base("datasets/ex1data1.txt"),
                          has_ids=False, has_header=False, has_labels=True,
                          delimiter=",")
    dataset.set_column("bias", pd.Series([1] * dataset.num_samples()))
    learning_rate = 0.01
    iterations = 100
    initial_theta = pd.Series({0: 0, "bias": 0})
    theta = optimize.gradient_descent(dataset, initial_theta, learning_rate,
                                      iterations=iterations)
    # assert_that(theta.tolist(), contains(-0.576556, 0.859582))
    assert_that(theta, equals_series({0: 0.859582, "bias": -0.576556},
                                     places=6))

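# The theta values asserted in the gradient descent tests come from batch
# gradient descent on the squared-error cost. As a rough sketch of that update
# (an illustration, not the optimize.gradient_descent implementation): given a
# feature matrix X that already includes the bias column, targets y, and step
# size alpha, each iteration applies theta <- theta - (alpha / m) * X^T (X theta - y).
def batch_gradient_descent_sketch(X, y, theta, alpha, iterations):
    import numpy as np  # local import; only this sketch needs numpy
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)
    m = len(y)
    for _ in range(iterations):
        theta = theta - (alpha / m) * X.T.dot(X.dot(theta) - y)
    return theta
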
def test_filter_by_feature_value_with_labels(self):
    features = ["name", "hair colour"]
    df = pd.DataFrame([["Bill", "brown"], ["Bob", "black"], ["Jim", "brown"]],
                      columns=features)
    dataset = DataSet(df, labels=["SENG", "SENG", "CENG"])
    filtered = dataset.value_filter("hair colour", "brown")
    assert_that(filtered.get_labels(), equals_series({0: "SENG", 2: "CENG"}))

def test_classify_all(self):
    training_set, sample_0 = self.load_car_data()
    sample_1 = {"color": "yellow", "type": "sports", "origin": "domestic"}
    dataset = DataSet(pd.DataFrame([sample_0, sample_1]), labels=["no", "yes"])
    classifier = NaiveBayes(training_set)
    results = classifier.classify_all(dataset)
    assert_that(results.get_classifications(), equals_series({0: "no", 1: "no"}))
    self.assertEqual(results.compute_accuracy(), 0.5)

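# A Naive Bayes classifier like the one under test scores each class by
# prior * product of per-feature conditional probabilities and returns the
# highest-scoring class. The helper below is a minimal, unsmoothed sketch of
# that rule (the library's classifier may estimate or smooth the probabilities
# differently); `train` is assumed to be a DataFrame of categorical features,
# `labels` an aligned label sequence, and `sample` a dict or Series.
def naive_bayes_classify_sketch(train, labels, sample):
    label_series = pd.Series(list(labels), index=train.index)
    scores = {}
    for cls, group in train.groupby(label_series):
        score = len(group) / len(train)  # prior P(class)
        for feature, value in dict(sample).items():
            score *= (group[feature] == value).mean()  # P(feature = value | class)
        scores[cls] = score
    return max(scores, key=scores.get)
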
def test_get_labelled_rows(self):
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                      labels=["a", "a", "b", "b"])
    selection = dataset.get_rows([1, 3])
    self.assertEqual(selection.num_samples(), 2)
    self.assertTrue(selection.is_labelled())
    # TODO incorporate labels equals_series into DataSet matcher?
    assert_that(selection, equals_dataset([[3, 4], [7, 8]]))
    assert_that(selection.get_labels(), equals_series({1: "a", 3: "b"}))

def test_classify_all_weekends(self):
    training = load(self.relative_to_base("/datasets/weekends.data"))
    classifier = DecisionTree(training)
    index = ["weather", "parents", "money"]
    sample_0 = pd.Series(["windy", "no", "rich"], index=index)
    sample_1 = pd.Series(["sunny", "yes", "rich"], index=index)
    results = classifier.classify_all(
        DataSet(pd.DataFrame([sample_0, sample_1])))
    assert_that(results.get_classifications(),
                equals_series({0: "shopping", 1: "cinema"}))

def test_kmeans_k_3(self):
    dataset = DataSet([[3, 13], [5, 13], [2, 11], [4, 11], [6, 11], [8, 5],
                       [5, 3], [6, 2], [9, 2], [16, 14], [18, 13], [16, 11],
                       [19, 10]])
    preset_centroids = [pd.Series([4, 9]), pd.Series([10, 6]),
                        pd.Series([17, 9])]
    clustered = clustering.kmeans(dataset, k=3, centroids=preset_centroids)
    assert_that(clustered.get_cluster_assignments(),
                equals_series({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 1, 6: 1,
                               7: 1, 8: 1, 9: 2, 10: 2, 11: 2, 12: 2}))

def test_filter_by_multiple_labels(self):
    features = ["name", "hair colour"]
    df = pd.DataFrame([["Will", "black"], ["Rob", "blonde"], ["Bill", "brown"],
                       ["Bob", "black"], ["Jim", "brown"]],
                      columns=features)
    dataset = DataSet(df, labels=["ELEC", "SENG", "ELEC", "CENG", "SENG"])
    filtered = dataset.label_filter(["SENG", "CENG"])
    assert_that(filtered, equals_dataset([["Rob", "blonde"], ["Bob", "black"],
                                          ["Jim", "brown"]]))
    assert_that(filtered.get_labels(),
                equals_series({1: "SENG", 3: "CENG", 4: "SENG"}))

def test_load_labelled(self):
    dataset = load(self.relative_to_base("datasets/3f_ids_header.csv"))
    self.assertTrue(dataset.is_labelled())
    labels = dataset.get_labels()
    assert_that(labels, equals_series({"V01": "c", "V02": "b", "V03": "b",
                                       "V04": "a"}))

def test_get_label_value_counts_no_labels(self):
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    assert_that(dataset.get_label_value_counts(), equals_series({}))

def test_combine_labels(self):
    dataset = DataSet([[1, 2], [3, 4], [5, 6]],
                      labels=pd.Series(["cat", "crow", "pigeon"]))
    dataset.combine_labels(["crow", "pigeon"], "bird")
    labels = dataset.get_labels()
    assert_that(labels, equals_series({0: "cat", 1: "bird", 2: "bird"}))