def test8_reponse():
    """Check that Node construction raises ValueError for four datasets.

    Each array is passed to ``classification_tree.Node`` together with a
    Gini impurity function and an empty list; every call is expected to
    raise ``ValueError``.  Presumably the last column is the response and
    these cases cover invalid response values -- confirm against
    ``classification_tree``.
    """
    def Gini_index(p):
        return p*(1-p)

    rejected_inputs = [
        np.array([[2, 1, 3], [3, 1, 4], [2, 1, 5], [2, 1, 6], [2, 1, 17]]),
        np.array([[2, 1, 0], [3, 1, 4], [2, 1, 0], [2, 1, 0], [2, 1, 0]]),
        np.array([[2, 1, 1], [3, 1, 4], [2, 1, 0], [2, 1, 0], [2, 1, 0]]),
        np.array([[2, 1, 3], [3, 1, 3], [2, 1, 3], [2, 1, 3], [2, 1, 3]]),
    ]
    # Cases run in the same order as before; the first one that fails to
    # raise stops the test.
    for rejected in rejected_inputs:
        with pytest.raises(ValueError):
            classification_tree.Node(rejected, Gini_index, [])
def test_same_respond():
    """
    Check that a constant Y column yields a tree with no children,
    first with every response set to 0 and then with every response
    set to 1.
    """
    random_features = np.random.randint(0, 100, size=(100, 4))
    test_case = pd.DataFrame(random_features, columns=list('ABCD'))
    # The same frame is reused for both passes; only the Y column changes.
    for constant_response in (0, 1):
        test_case["Y"] = [constant_response] * 100
        tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)
        assert tree.left_child is None and tree.right_child is None,\
            "The tree should only have one node."
def import_format_tests_x():
    """Expect ``ct.Node`` to raise TypeError when a feature cell is a string.

    NOTE(review): this function's name does not start with ``test``, so
    default pytest discovery presumably will not collect it -- confirm
    whether that is intentional.
    """
    features = pd.DataFrame([[0, "the"], [0, 1], [2, 3]],
                            columns=['var1', 'var2'])
    labels = pd.DataFrame([0, 1, 0], columns=['y'])
    with pytest.raises(TypeError):
        ct.Node(features, labels)
def test_potential_splits():
    """Check the candidate split values reported for column ``v1``.

    The expected list is compared with ``==``, so both the values and
    their order must match.
    """
    feature_rows = [[1, 2, 3], [4, 5, 6], [6, 7, 8],
                    [1, 1, 2], [-1, 4, 5], [0, 0, 1]]
    features = pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3'])
    labels = pd.DataFrame([0, 1, 1, 0, 1, 1], columns=['y'])
    node = ct.Node(features, labels)
    expected_splits = [0, 1, 4, 6, -1]
    assert node.potential_splits('v1') == expected_splits
def test_best_split():
    """Check that ``best_split`` returns ``(2, 'v2')`` for the sample data."""
    feature_rows = [[1, 2, 3], [4, 5, 6], [6, 7, 8],
                    [1, 1, 2], [-1, 4, 5], [0, 0, 1]]
    features = pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3'])
    labels = pd.DataFrame([0, 1, 1, 0, 1, 1], columns=['y'])
    node = ct.Node(features, labels)
    expected = (2, 'v2')
    assert node.best_split() == expected
def test9():
    """Check that the array-backed and SQL-backed trees record the same splits.

    Builds one tree from an in-memory array and one through
    ``sql_tree.SQLTree`` on the table ``'test_data2'``, then compares
    their ``past_split`` attributes.
    """
    def Gini_index(p):
        return p*(1-p)

    rows = np.array([[5, 2, 34, 0],
                     [38, 20, 21, 0],
                     [72, 20, 10, 0],
                     [52, 20, 15, 1]])
    feature_columns = ['column_1', 'column_2', 'column_3']

    array_tree = classification_tree.Node(rows, Gini_index, [])
    # `cur` is not defined in this function; presumably a module-level
    # database cursor whose 'test_data2' table mirrors `rows` -- confirm.
    sql_backed_tree = sql_tree.SQLTree(cur, 'test_data2', feature_columns,
                                       'response', Gini_index, [])
    assert array_tree.past_split == sql_backed_tree.past_split
def test3():
    """Expect ValueError when the input array contains a missing value.

    The second cell of the first row is ``None``.
    """
    def Gini_index(p):
        return p*(1-p)

    rows_with_gap = np.array([[2, None, 0],
                              [3, 1, 0],
                              [2, 1, 0],
                              [2, 1, 0],
                              [2, 1, 1]])
    with pytest.raises(ValueError):
        classification_tree.Node(rows_with_gap, Gini_index, [])
def test_impurity_reduction():
    """Check ``impurity_reduction('v2', 3)`` equals 0.17 after rounding to 2 places."""
    feature_rows = [[1, 2, 3], [4, 5, 6], [6, 7, 8],
                    [1, 1, 2], [-1, 4, 5], [0, 0, 1]]
    features = pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3'])
    labels = pd.DataFrame([0, 1, 1, 0, 1, 1], columns=['y'])
    node = ct.Node(features, labels)
    reduction = node.impurity_reduction('v2', 3)
    rounded = float(round(reduction, 2))
    assert rounded == 0.17
def test6_R2():
    """Check the root split of a two-feature dataset and that its children are leaves.

    Builds a ``classification_tree.Node`` from a 4x3 array with a Gini
    impurity function, then asserts that the root splits on variable 0
    at point 72 and that neither child has a split variable.
    """
    test_data = np.array([[5, 34, 1], [38, 7, 1], [72, 15, 0], [52, 29, 1]])

    def Gini_index(p):
        return p*(1-p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 0
    assert node_test.split_point == 72
    # Compare with None by identity; `==` can be overridden by the operand's
    # __eq__ and is not the idiomatic singleton check.
    assert node_test.left.split_variable is None
    assert node_test.right.split_variable is None
def test_same_value():
    """
    Check that the root split is on 'Ticket Fee' when every row has
    the same 'Age' value (10).
    """
    rows = [[1, 10, 30, 1],
            [2, 10, 100, 0],
            [3, 10, 10, 1],
            [4, 10, 15, 1]]
    column_names = ['Name', 'Age', 'Ticket Fee', 'Y']
    test_case = pd.DataFrame(rows, columns=column_names)
    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)
    chosen_variable = tree.split_variable
    assert chosen_variable == "Ticket Fee",\
        " The Age column shoud be ignored"
def test_wrong_input_2():
    """
    Expect an AssertionError when the Y column holds a value other
    than 0 or 1 (the second row has Y == 3).
    """
    rows = [[1, 10, 30, 1],
            [2, 10, 100, 3],
            [3, 10, 10, 1],
            [4, 10, 15, 1]]
    column_names = ['Name', 'Age', 'Ticket Fee', 'Y']
    test_case = pd.DataFrame(rows, columns=column_names)
    with pytest.raises(AssertionError):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test_wrong_input_1():
    """
    Expect some exception when the 'Name' column holds strings
    instead of numbers.
    """
    rows = [["Alice", 10, 30, 1],
            ["Ben", 10, 100, 0],
            ["Charles", 10, 10, 1],
            ["Don", 10, 15, 1]]
    column_names = ['Name', 'Age', 'Ticket Fee', 'Y']
    test_case = pd.DataFrame(rows, columns=column_names)
    # Any Exception subclass satisfies this check.
    with pytest.raises(Exception):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test():
    """Check that the tree stops at the root when every response is the same.

    All three rows end in 0, and the node is expected to have neither a
    right nor a left child.
    """
    def Gini_index(p):
        return p*(1-p)

    uniform_rows = np.array([[25, 36, 78, 0],
                             [10, 20, 15, 0],
                             [99, 50, 4, 0]])
    root = classification_tree.Node(uniform_rows, Gini_index, [])
    assert root.right is None
    assert root.left is None
def test_same_response():
    """Build a node from a response column whose values are all 1.

    NOTE(review): an AssertionError from ``ct.Node`` is caught and
    discarded, so this test passes whether or not the error is raised;
    only a different exception type makes it fail.  If the error is
    required, ``pytest.raises(AssertionError)`` would enforce it --
    confirm the intended contract.
    """
    features = pd.DataFrame([[0, 2, 4], [1, 2, 3], [4, 3, 1]],
                            columns=['Gender', 'Age', 'Income'])
    labels = pd.DataFrame([1, 1, 1], columns=['Pass'])
    try:
        ct.Node(features, labels)
    except AssertionError:
        # Bare string expression: evaluated and thrown away, nothing is checked.
        "Y dataframe must have 0's and 1's"
def test_impurity_R1():
    """
    Single-feature case: check the root's split value and split
    variable when the tree is built with ``bayers_error``.
    """
    failure_message = "The splitting point and variables are wrong"
    rows = [[1, 1], [2, 1], [3, 1], [4, 0]]
    test_case = pd.DataFrame(rows, columns=['Age', 'Y'])
    tree = classification_tree.Node(test_case, 'Y', classification_tree.bayers_error)
    # Checked one at a time, in the same order as the original `and` chain.
    assert tree.split_value == 4, failure_message
    assert tree.split_variable == 'Age', failure_message
def test_missing_values():
    """Build a node from features that contain a ``None`` cell.

    NOTE(review): an AssertionError from ``ct.Node`` is caught and
    discarded, so this test passes whether or not the error is raised;
    only a different exception type makes it fail.  If the error is
    required, ``pytest.raises(AssertionError)`` would enforce it --
    confirm the intended contract.
    """
    features = pd.DataFrame([[0, 2, 4], [1, None, 3], [4, 2, 1]],
                            columns=['Gender', 'Age', 'Income'])
    labels = pd.DataFrame([1, 0, 1], columns=['y'])
    try:
        ct.Node(features, labels)
    except AssertionError:
        # Bare string expression: evaluated and thrown away, nothing is checked.
        "X dataframe must have no missing data"
def test_impurity_R2():
    """
    Two-feature case: check the split value and split variable of the
    root and of its left child when built with ``cross_entropy``.
    """
    failure_message = "The splitting point and variables are wrong"
    rows = [[1, 2, 1], [2, 3, 0], [3, 4, 1], [4, 5, 1]]
    test_case = pd.DataFrame(rows, columns=['Age', 'Ticket Fee', 'Y'])
    tree = classification_tree.Node(test_case, 'Y', classification_tree.cross_entropy)

    # Root node.
    assert tree.split_value == 4, failure_message
    assert tree.split_variable == 'Ticket Fee', failure_message

    # Left child of the root.
    assert tree.left_child.split_value == 3, failure_message
    assert tree.left_child.split_variable == 'Ticket Fee', failure_message
def test_valid_tree():
    """
    Check that a tree built from a small fixed dataset reports itself
    as valid through its ``is_valid`` method.
    """
    data = [[1, 2, 3, 1], [2, 3, 4, 1], [3, 4, 5, 1],
            [4, 5, 6, 0], [5, 6, 7, 1], [6, 7, 8, 0]]
    test_case = pd.DataFrame(data, columns=['Age', 'Tciket Fee', 'Shirt Size', 'Y'])
    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)
    # Assert on truthiness rather than comparing with `== True`.
    assert tree.is_valid(test_case), "The tree should be valid"
def test4():
    """Check the impurity at the boundary probabilities 0 and 1.

    Uses a cross-entropy function that returns 0 at p == 0 and p == 1,
    and asserts that ``Node.impurity`` gives 0 for both inputs.
    """
    def Cross_entropy(p):
        # Guard the endpoints, where log(0) would otherwise be evaluated.
        if p == 0 or p == 1:
            return 0
        return -p * np.log(p) - (1-p) * np.log(1-p)

    rows = np.array([[5, 2, 34, 0],
                     [38, 20, 21, 0],
                     [72, 20, 10, 0],
                     [52, 20, 15, 1]])
    root = classification_tree.Node(rows, Cross_entropy, [])
    impurity_at_one = classification_tree.Node.impurity(root, 1, Cross_entropy)
    assert impurity_at_one == 0
    impurity_at_zero = classification_tree.Node.impurity(root, 0, Cross_entropy)
    assert impurity_at_zero == 0
def test2():
    """Check which variables the tree splits on for a four-feature-column array.

    The test intends to make sure a feature with a single value for all
    rows is never chosen as the split variable.  It asserts that the
    root and its left child do not split on ``'b'``, and that the left
    grandchild and the right child have no split variable.

    NOTE(review): other array-based tests in this file compare
    ``split_variable`` with integer column indices; if that holds here,
    the ``!= 'b'`` checks can never fail -- confirm the intended column.
    """
    test_data = np.array([[5, 2, 34, 0], [38, 20, 21, 0], [72, 20, 10, 0], [52, 20, 15, 1]])

    def Gini_index(p):
        return p*(1-p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    # Expected shape: two successive splits down the left side, none on the right.
    assert node_test.split_variable != 'b'
    assert node_test.left.split_variable != 'b'
    # Compare with None by identity rather than with `==`.
    assert node_test.left.left.split_variable is None
    assert node_test.right.split_variable is None
def test_impurity_function():
    """
    Check that ``get_impurity`` returns 0 for a dataset whose Y column
    is all zeros.
    """
    rows = [[1, 2, 3, 4, 0],
            [2, 3, 4, 6, 0],
            [3, 5, 6, 7, 0]]
    column_names = ['Name', 'Age', 'Ticket Fee', 'Shirt Size', 'Y']
    test_case = pd.DataFrame(rows, columns=column_names)
    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)
    impurity_score = tree.get_impurity(test_case, classification_tree.gini)
    assert impurity_score == 0, "The impurity score should be zero"
def test5_R1():
    """Check the split variable and split point for a single-feature dataset.

    Asserts that the root splits on variable 0 at 72, its left child
    splits on variable 0 at 52, and its right child has neither a split
    variable nor a split point.
    """
    test_data = np.array([[5, 0], [38, 0], [72, 0], [52, 1]])

    def Gini_index(p):
        return p*(1-p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 0
    assert node_test.split_point == 72
    assert node_test.left.split_variable == 0
    assert node_test.left.split_point == 52
    # Compare with None by identity rather than with `==`.
    assert node_test.right.split_variable is None
    assert node_test.right.split_point is None
def test_impurity_R3():
    """
    Three-feature case: check the split value and split variable of
    the root, of its right child, and of that child's left child.
    """
    failure_message = "The splitting point and variables are wrong"
    rows = [[1, 2, 3, 1], [2, 3, 4, 1], [3, 4, 5, 1],
            [4, 5, 6, 0], [5, 6, 7, 1], [6, 7, 8, 0]]
    test_case = pd.DataFrame(rows, columns=['Age', 'Tciket Fee', 'Shirt Size', 'Y'])
    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)

    # Root node.
    assert tree.split_value == 6, failure_message
    assert tree.split_variable == 'Shirt Size', failure_message

    # Right child of the root.
    assert tree.right_child.split_value == 8, failure_message
    assert tree.right_child.split_variable == 'Shirt Size', failure_message

    # Left child of the root's right child.
    assert tree.right_child.left_child.split_value == 7, failure_message
    assert tree.right_child.left_child.split_variable == 'Shirt Size', failure_message
def test_missing_value():
    """
    Check how the tree handles missing values: building a tree from
    data that contains missing feature cells is expected to raise an
    AssertionError.
    """
    # The missing cells are written straight into the literal.  Assigning
    # `test_case[2, 4] = None` on a DataFrame does not blank a single cell;
    # it adds a whole new column labelled (2, 4).
    data = [[1, 2, 3, 1], [2, 3, 4, 1], [3, 4, None, 1],
            [4, None, 6, 0], [5, 6, 7, 1], [6, 7, 8, 0]]
    test_case = pd.DataFrame(data, columns=['Age', 'Tciket Fee', 'Shirt Size', 'Y'])
    with pytest.raises(AssertionError):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test7_R3():
    """Check every split of the tree built from a three-feature dataset.

    Asserts the split variable and split point at the root (1, 59), at
    the root's left child (0, 34) and at that child's right child
    (0, 38), and that all remaining nodes checked have no split.
    """
    test_data = np.array([[16, 24, 34, 1], [38, 20, 21, 0], [14, 20, 10, 1],
                          [52, 20, 15, 1], [32, 59, 1, 0], [34, 21, 69, 1]])

    def Gini_index(p):
        return p*(1-p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 1
    assert node_test.split_point == 59
    assert node_test.left.split_variable == 0
    assert node_test.left.split_point == 34
    # Nodes without a split are compared with None by identity, not `==`.
    assert node_test.left.left.split_variable is None
    assert node_test.left.left.split_point is None
    assert node_test.left.right.split_variable == 0
    assert node_test.left.right.split_point == 38
    assert node_test.left.right.right.split_variable is None
    assert node_test.left.right.left.split_variable is None
    assert node_test.right.split_variable is None
def test_same_value_x():
    """Expect a UserWarning when a feature column (``var1``) is constant."""
    feature_rows = [[1, 2], [1, 4], [1, 6], [1, 7]]
    features = pd.DataFrame(feature_rows, columns=['var1', 'var2'])
    labels = pd.DataFrame([0, 0, 0, 1], columns=['y'])
    with pytest.warns(UserWarning):
        ct.Node(features, labels)