def test_has_missing_values(self):
    """A DataSet reports missing values iff it contains NaN entries."""
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0, so the old spelling breaks on modern NumPy.
    dataset1 = DataSet([[4.2, np.nan, 3.1],
                        [2.5, 1.9, np.nan],
                        [1.1, 1.2, 1.7]])
    self.assertTrue(dataset1.has_missing_values())
    dataset2 = DataSet([[4.2, 3.9, 3.1],
                        [2.5, 1.9, 2.2],
                        [1.1, 1.2, 1.7]])
    self.assertFalse(dataset2.has_missing_values())
def test_split_0(self):
    """Splitting at ratio 0 puts every sample in the second partition."""
    rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
    first, second = DataSet(rows).split(0)
    self.assertEqual(first.num_samples(), 0)
    assert_that(first, equals_dataset([]))
    self.assertEqual(second.num_samples(), 4)
    assert_that(second, equals_dataset([[1, 2], [3, 4], [5, 6], [7, 8]]))
def test_contruct_dataset_from_dataset(self):
    """Constructing a DataSet from another DataSet copies the data.

    NOTE(review): the method name has a typo ("contruct"); kept as-is so
    test discovery and any external references remain stable.
    """
    original = DataSet([[1, 2], [3, 4], [5, 6]])
    new = DataSet(original)
    self.assertFalse(new is original)
    # .ix was removed in pandas 1.0; .iloc is the positional equivalent.
    new._dataframe.iloc[1] = 1
    assert_that(new, equals_dataset([[1, 2], [1, 1], [5, 6]]))
    # The original must be unaffected by mutations of the copy.
    assert_that(original, equals_dataset([[1, 2], [3, 4], [5, 6]]))
def test_get_row(self):
    """get_row returns a live view of the underlying sample."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    selected = dataset.get_row(1)
    assert_that(selected.values, contains(3, 4))
    # Check that changes made to the selected row show up in the original.
    selected[:] = 1
    assert_that(dataset.get_row(1), contains(1, 1))
def test_split_random(self):
    """A random 50/50 split yields two equal-sized partitions."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    first, second = dataset.split(0.5, random=True)
    # Since the split is random we can't assert which rows land where,
    # only how many rows each partition receives.
    self.assertEqual(first.num_samples(), 2)
    self.assertEqual(second.num_samples(), 2)
def test_get_label_value_counts(self):
    """Label counts match, and are indexed by descending frequency."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
                      labels=["a", "b", "b", "c", "a", "b"])
    counts = dataset.get_label_value_counts()
    assert_that(counts, equals_series({"a": 2, "b": 3, "c": 1}))
    assert_that(counts.index, contains("b", "a", "c"))
def test_unequal_split(self):
    """A 0.3 split of four samples rounds down to a 1-vs-3 partition."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]])
    first, second = dataset.split(0.3)
    self.assertEqual(first.num_samples(), 1)
    assert_that(first, equals_dataset([[1, 2]]))
    self.assertEqual(second.num_samples(), 3)
    assert_that(second, equals_dataset([[3, 4], [5, 6], [7, 8]]))
def test_split_labelled(self):
    """Splitting a labelled DataSet carries labels into both halves."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                      labels=["b", "b", "b", "a"])
    first, second = dataset.split(0.5)
    self.assertTrue(first.is_labelled())
    assert_that(first.get_labels(), equals_series({0: "b", 1: "b"}))
    self.assertTrue(second.is_labelled())
    assert_that(second.get_labels(), equals_series({2: "b", 3: "a"}))
def test_filter_by_feature_value_with_labels(self):
    """value_filter keeps the labels of the matching samples."""
    columns = ["name", "hair colour"]
    frame = pd.DataFrame([["Bill", "brown"],
                          ["Bob", "black"],
                          ["Jim", "brown"]], columns=columns)
    dataset = DataSet(frame, labels=["SENG", "SENG", "CENG"])
    filtered = dataset.value_filter("hair colour", "brown")
    assert_that(filtered.get_labels(), equals_series({0: "SENG", 2: "CENG"}))
def test_bin_feature(self):
    """Binning one feature replaces its values with bin indices."""
    frame = pd.DataFrame([[0, 1], [7, 2], [6, 3]],
                         columns=["MATH100", "PHYS125"])
    dataset = DataSet(frame)
    dataset.bin("MATH100", [4, 7])
    assert_that(dataset, equals_dataset([[0, 1], [2, 2], [1, 3]]))
def test_slice_features_list_indices(self):
    """slice_features accepts positional column indices."""
    labels = ["m", "f", "m"]
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    dataset = DataSet(frame, labels=labels)
    sliced = dataset.slice_features([1, 2])
    assert_that(sliced, equals_dataset([[2, 3], [5, 6], [8, 9]]))
    assert_that(sliced.feature_list(), contains(1, 2))
    assert_that(sliced.get_labels(), contains(*labels))
def test_get_row_by_id(self):
    """Rows are retrievable by string id as well as by position."""
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         index=["V01", "V02", "V03"])
    dataset = DataSet(frame)
    assert_that(dataset.get_row("V02"), contains(4, 5, 6))
    # Make sure a position-based index is still usable.
    assert_that(dataset.get_row(1), contains(4, 5, 6))
def test_bin_all(self):
    """Binning with "*" applies the boundaries to every feature."""
    frame = pd.DataFrame([[0, 6], [9, 2], [6, 4]],
                         columns=["MATH100", "PHYS125"])
    dataset = DataSet(frame)
    dataset.bin("*", [4, 7], bin_names=["low", "mid", "high"])
    assert_that(dataset, equals_dataset([["low", "mid"],
                                         ["high", "low"],
                                         ["mid", "mid"]]))
def test_get_labelled_data_frame(self):
    """get_labelled_data_frame appends the labels as a final column."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                      labels=pd.Series(["b", "b", "b", "a"]))
    df = dataset.get_labelled_data_frame()
    # TODO: non-numeric values in DataFrame matcher
    expected = [[1, 2, "b"], [3, 4, "b"], [5, 6, "b"], [7, 8, "a"]]
    for i, expected_row in enumerate(expected):
        # Was assertTrue(actual, expected), which treats the second
        # argument as a failure message and never compares the two;
        # assertEqual performs the intended row comparison.  .ix was
        # removed in pandas 1.0, so use the positional .iloc indexer.
        self.assertEqual(df.iloc[i].tolist(), expected_row)
def test_copy(self):
    """copy() duplicates data and labels; mutating the copy leaves the
    original untouched."""
    dataset1 = DataSet([[1, 2], [3, 4]], labels=pd.Series(["a", "b"]))
    dataset2 = dataset1.copy()
    dataset2.set_column(1, pd.Series([4, 5]))
    assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
    assert_that(dataset2.get_labels(), equals_series({0: "a", 1: "b"}))
    assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
    # Was a duplicate check of dataset2's labels; the intent is clearly
    # to verify the ORIGINAL dataset's labels survived the mutation.
    assert_that(dataset1.get_labels(), equals_series({0: "a", 1: "b"}))
def test_bin_feature_1_boundary(self):
    """A single boundary produces exactly two bins."""
    frame = pd.DataFrame([[0, 1], [9, 2], [6, 3]],
                         columns=["MATH100", "PHYS125"])
    dataset = DataSet(frame)
    dataset.bin("MATH100", [3], bin_names=["low", "high"])
    assert_that(dataset,
                equals_dataset([["low", 1], ["high", 2], ["high", 3]]))
def test_bin_feature_floats(self):
    """Binning works with float boundaries and float feature values."""
    frame = pd.DataFrame([[3.5, 1], [9.1, 2], [6.2, 3]],
                         columns=["MATH100", "PHYS125"])
    dataset = DataSet(frame)
    dataset.bin("MATH100", [3.9, 7], bin_names=["low", "mid", "high"])
    assert_that(dataset,
                equals_dataset([["low", 1], ["high", 2], ["mid", 3]]))
def test_copy_no_labels(self):
    """Copying an unlabelled DataSet yields an independent unlabelled copy."""
    dataset1 = DataSet([[1, 2], [3, 4]])
    dataset2 = dataset1.copy()
    dataset2.set_column(1, pd.Series([4, 5]))
    assert_that(dataset2, equals_dataset([[1, 4], [3, 5]]))
    self.assertFalse(dataset2.is_labelled())
    # The original is unchanged and still unlabelled.
    assert_that(dataset1, equals_dataset([[1, 2], [3, 4]]))
    self.assertFalse(dataset1.is_labelled())
def test_drop_empty_samples(self):
    """Samples whose every feature is NaN are dropped, with their labels."""
    # np.nan is the canonical spelling; the np.NAN alias was removed in
    # NumPy 2.0, so the old spelling breaks on modern NumPy.
    df = pd.DataFrame([[1, 2, np.nan],
                       [np.nan, np.nan, np.nan],
                       [7, 8, 9]])
    original = DataSet(df, labels=["a", "b", "c"])
    filtered = original.drop_empty_samples()
    assert_that(filtered.feature_list(), has_length(3))
    assert_that(filtered.num_samples(), equal_to(2))
    assert_that(filtered, equals_dataset([[1, 2, np.nan], [7, 8, 9]]))
    assert_that(filtered.get_labels(), contains("a", "c"))
def test_slice_features_list_string(self):
    """slice_features accepts column names."""
    labels = ["m", "f", "m"]
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["weight", "height", "age"])
    dataset = DataSet(frame, labels=labels)
    sliced = dataset.slice_features(["weight", "height"])
    assert_that(sliced, equals_dataset([[1, 2], [4, 5], [7, 8]]))
    assert_that(sliced.feature_list(), contains("weight", "height"))
    assert_that(sliced.get_labels(), contains(*labels))
def test_drop_column(self):
    """drop_column returns a new DataSet and leaves the original intact."""
    original = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    self.assertEqual(original.num_features(), 3)
    filtered = original.drop_column(1)
    self.assertEqual(filtered.num_features(), 2)
    assert_that(filtered, equals_dataset([[1, 3], [4, 6], [7, 9]]))
    # Make sure the original is unchanged.
    self.assertEqual(original.num_features(), 3)
    assert_that(original, equals_dataset([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
def test_get_labelled_rows(self):
    """get_rows on a labelled DataSet keeps the selected rows' labels."""
    dataset = DataSet([[1, 2], [3, 4], [5, 6], [7, 8]],
                      labels=["a", "a", "b", "b"])
    selection = dataset.get_rows([1, 3])
    self.assertEqual(selection.num_samples(), 2)
    self.assertTrue(selection.is_labelled())
    # TODO incorporate labels equals_series into DataSet matcher?
    assert_that(selection, equals_dataset([[3, 4], [7, 8]]))
    assert_that(selection.get_labels(), equals_series({1: "a", 3: "b"}))
def test_filter_by_feature_value(self):
    """value_filter keeps only the samples matching the given value."""
    columns = ["name", "hair colour"]
    frame = pd.DataFrame([["Bill", "brown"],
                          ["Bob", "black"],
                          ["Jim", "brown"]], columns=columns)
    filtered = DataSet(frame).value_filter("hair colour", "brown")
    self.assertEqual(filtered.feature_list(), columns)
    assert_that(filtered.get_sample_ids(), contains(0, 2))
    assert_that(filtered,
                equals_dataset([["Bill", "brown"], ["Jim", "brown"]]))
def test_to_string(self):
    """__repr__ summarises features, sample count, missing values, labels."""
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0, so the old spelling breaks on modern NumPy.
    df = pd.DataFrame([[4.2, np.nan, 3.1],
                       [2.5, 1.9, np.nan],
                       [1.1, 1.2, 1.7]],
                      columns=["weight", "height", "length"])
    dataset = DataSet(df, labels=["cat", "bird", "bat"])
    expected = "\n".join(("Features: ['weight', 'height', 'length']",
                          "Samples: 3",
                          "Missing values? yes",
                          "Labelled? yes"))
    # repr() is the idiomatic way to invoke __repr__.
    self.assertEqual(expected, repr(dataset))
def test_drop_empty_samples_original_unchanged(self):
    """Mutating the filtered copy must not touch the original DataSet."""
    # np.nan is the canonical spelling; the np.NAN alias was removed in
    # NumPy 2.0, so the old spelling breaks on modern NumPy.
    data_list = [[1, 2, np.nan], [np.nan, np.nan, np.nan], [7, 8, 9]]
    label_list = ["a", "b", "c"]
    original = DataSet(pd.DataFrame(data_list), labels=label_list)
    filtered = original.drop_empty_samples()
    filtered.set_column(0, [-1, -1])
    filtered.labels[0] = "z"
    assert_that(original, equals_dataset(data_list))
    assert_that(original.get_labels(), contains(*label_list))
def test_get_feature_value_counts(self):
    """Value counts are computed per feature column."""
    frame = pd.DataFrame([["Jim", 19, 180],
                          ["John", 18, 177],
                          ["Jack", 19, 185]],
                         columns=["name", "age", "height"])
    dataset = DataSet(frame)
    assert_that(dataset.get_feature_value_counts("age"),
                equals_series({19: 2, 18: 1}))
    assert_that(dataset.get_feature_value_counts("name"),
                equals_series({"Jim": 1, "John": 1, "Jack": 1}))
def test_filter_by_multiple_labels(self):
    """label_filter accepts a list of labels and keeps matching samples."""
    frame = pd.DataFrame([["Will", "black"],
                          ["Rob", "blonde"],
                          ["Bill", "brown"],
                          ["Bob", "black"],
                          ["Jim", "brown"]],
                         columns=["name", "hair colour"])
    dataset = DataSet(frame, labels=["ELEC", "SENG", "ELEC", "CENG", "SENG"])
    filtered = dataset.label_filter(["SENG", "CENG"])
    assert_that(filtered, equals_dataset([["Rob", "blonde"],
                                          ["Bob", "black"],
                                          ["Jim", "brown"]]))
    assert_that(filtered.get_labels(),
                equals_series({1: "SENG", 3: "CENG", 4: "SENG"}))
def test_get_values(self):
    """get_feature_values returns the distinct values of a feature."""
    frame = pd.DataFrame([["Jim", 19, 180],
                          ["John", 18, 177],
                          ["Jack", 19, 185]],
                         columns=["name", "age", "height"])
    dataset = DataSet(frame)
    ages = dataset.get_feature_values("age")
    # The duplicate 19 collapses to a single distinct value.
    self.assertEqual(len(ages), 2)
    # assertIn reports a useful message on failure, unlike
    # assertTrue(x in y) which only says "False is not true".
    self.assertIn(19, ages)
    self.assertIn(18, ages)
    heights = dataset.get_feature_values("height")
    self.assertIn(180, heights)
    self.assertIn(185, heights)
    self.assertIn(177, heights)
def test_as_dataset(self):
    """as_dataset returns the very same object when given a DataSet."""
    original = DataSet([[1, 2], [3, 4], [5, 6]])
    dataset = as_dataset(original)
    self.assertTrue(dataset is original)
    # .ix was removed in pandas 1.0; .iloc is the positional equivalent.
    dataset._dataframe.iloc[1] = 1
    # Because no copy was made, the mutation is visible through both names.
    assert_that(dataset, equals_dataset([[1, 2], [1, 1], [5, 6]]))
    assert_that(original, equals_dataset([[1, 2], [1, 1], [5, 6]]))
def test_kmeans_k_3(self):
    """With preset centroids, k-means assigns samples to the expected
    one of three clusters."""
    dataset = DataSet([[3, 13], [5, 13], [2, 11], [4, 11], [6, 11],
                       [8, 5], [5, 3], [6, 2], [9, 2],
                       [16, 14], [18, 13], [16, 11], [19, 10]])
    preset_centroids = [pd.Series([4, 9]),
                        pd.Series([10, 6]),
                        pd.Series([17, 9])]
    clustered = clustering.kmeans(dataset, k=3, centroids=preset_centroids)
    expected_assignments = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
                            5: 1, 6: 1, 7: 1, 8: 1,
                            9: 2, 10: 2, 11: 2, 12: 2}
    assert_that(clustered.get_cluster_assignments(),
                equals_series(expected_assignments))
def test_classify_all(self):
    """classify_all labels each unseen sample via its nearest neighbours."""
    training_set = DataSet([[1, 1], [2, 2], [11, 11], [12, 12]],
                           labels=["a", "a", "b", "b"])
    classifier = Knn(training_set, k=3)
    results = classifier.classify_all([[1.5, 1.3], [12.2, 12.9]])
    assert_that(results.get_classifications(), contains("a", "b"))
def test_slice_features_original_unchanged(self):
    """Mutating a feature slice must not affect the source DataSet."""
    labels = ["m", "f", "m"]
    frame = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                         columns=["weight", "height", "age"])
    dataset = DataSet(frame, labels=labels)
    sliced = dataset.slice_features(["weight", "height"])
    # Modify the sliced data...
    sliced.set_column("weight", [0, 0, 0])
    sliced.labels[0] = "x"
    # ...confirm the change took effect in the slice...
    assert_that(sliced.get_column("weight"), contains(0, 0, 0))
    assert_that(sliced.get_labels(), contains("x", "f", "m"))
    # ...but not in the original dataset.
    assert_that(dataset.get_column("weight"), contains(1, 4, 7))
    assert_that(dataset.get_labels(), contains(*labels))
def test_get_column(self):
    """get_column returns the values of a single feature column."""
    dataset = DataSet([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    middle_column = dataset.get_column(1)
    assert_that(middle_column.values, contains(2, 5, 8))
def test_reduce_features(self):
    """reduce_features folds each column with the supplied function."""
    dataset = DataSet([[4, 9, 8], [2, 1, 7], [5, 6, 1]])
    minima = dataset.reduce_features(min)
    assert_that(minima.values, contains(2, 1, 1))