def test8_reponse():
    """Check that Node construction raises ValueError for each sample array.

    Four arrays are tried in turn. In every one of them the last column
    holds at least one value other than 0 or 1 (presumably the response
    column -- TODO confirm against classification_tree.Node).
    """
    def Gini_index(p):
        return p * (1 - p)

    invalid_datasets = (
        np.array([[2, 1, 3], [3, 1, 4], [2, 1, 5], [2, 1, 6], [2, 1, 17]]),
        np.array([[2, 1, 0], [3, 1, 4], [2, 1, 0], [2, 1, 0], [2, 1, 0]]),
        np.array([[2, 1, 1], [3, 1, 4], [2, 1, 0], [2, 1, 0], [2, 1, 0]]),
        np.array([[2, 1, 3], [3, 1, 3], [2, 1, 3], [2, 1, 3], [2, 1, 3]]),
    )
    # Each dataset must be rejected on its own; the first one that is
    # accepted fails the test.
    for dataset in invalid_datasets:
        with pytest.raises(ValueError):
            classification_tree.Node(dataset, Gini_index, [])
def test_same_respond():
    """
    Check that a tree built on a constant Y column has no children.

    One random 100x4 feature frame is reused twice: first with every Y
    equal to 0, then with every Y equal to 1.
    """
    features = np.random.randint(0, 100, size=(100, 4))
    test_case = pd.DataFrame(features, columns=list('ABCD'))

    for constant_label in (0, 1):
        test_case["Y"] = [constant_label] * 100
        tree = classification_tree.Node(test_case, 'Y',
                                        classification_tree.gini)
        has_no_children = (tree.left_child is None
                           and tree.right_child is None)
        assert has_no_children, "The tree should only have one node."
示例#3
0
def import_format_tests_x():
    """Expect ct.Node to raise TypeError when an X column mixes str and int.

    NOTE(review): the name does not start with ``test``, so pytest's
    default discovery rules will not collect this function -- confirm
    whether that is intentional.
    """
    # 'var2' holds the string "the" alongside integers.
    x_df = pd.DataFrame([[0, "the"], [0, 1], [2, 3]],
                        columns=['var1', 'var2'])
    y_df = pd.DataFrame([0, 1, 0], columns=['y'])

    with pytest.raises(TypeError):
        ct.Node(x_df, y_df)
示例#4
0
def test_potential_splits():
    """Check the list returned by Node.potential_splits for column 'v1'."""
    feature_rows = [
        [1, 2, 3],
        [4, 5, 6],
        [6, 7, 8],
        [1, 1, 2],
        [-1, 4, 5],
        [0, 0, 1],
    ]
    labels = [0, 1, 1, 0, 1, 1]

    n = ct.Node(
        pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3']),
        pd.DataFrame(labels, columns=['y']),
    )

    # The comparison is order-sensitive: the result must equal this exact list.
    expected_splits = [0, 1, 4, 6, -1]
    assert n.potential_splits('v1') == expected_splits
示例#5
0
def test_best_split():
    """Check that Node.best_split returns the tuple (2, 'v2') for the sample."""
    feature_rows = [
        [1, 2, 3],
        [4, 5, 6],
        [6, 7, 8],
        [1, 1, 2],
        [-1, 4, 5],
        [0, 0, 1],
    ]
    labels = [0, 1, 1, 0, 1, 1]

    n = ct.Node(
        pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3']),
        pd.DataFrame(labels, columns=['y']),
    )

    chosen = n.best_split()
    assert chosen == (2, 'v2')
def test9():
    """Check that the array-based tree and the SQL-based tree agree.

    Compares the ``past_split`` attribute of a classification_tree.Node
    built from a NumPy array with that of a sql_tree.SQLTree. The SQLTree
    receives the outer-scope name ``cur`` and the string 'test_data2'
    (presumably a database cursor and a table holding the same rows --
    TODO confirm).
    """
    def Gini_index(p):
        return p * (1 - p)

    rows = [[5, 2, 34, 0], [38, 20, 21, 0], [72, 20, 10, 0], [52, 20, 15, 1]]
    array_tree = classification_tree.Node(np.array(rows), Gini_index, [])

    feature_columns = ['column_1', 'column_2', 'column_3']
    sql_based_tree = sql_tree.SQLTree(cur, 'test_data2', feature_columns,
                                      'response', Gini_index, [])

    assert array_tree.past_split == sql_based_tree.past_split
def test3():
    """Expect Node to raise ValueError when the input array contains None."""
    def Gini_index(p):
        return p * (1 - p)

    # The first row carries a None in its second position.
    rows_with_gap = [[2, None, 0], [3, 1, 0], [2, 1, 0], [2, 1, 0], [2, 1, 1]]
    test_data = np.array(rows_with_gap)

    with pytest.raises(ValueError):
        classification_tree.Node(test_data, Gini_index, [])
示例#8
0
def test_impurity_reduction():
    """Check Node.impurity_reduction('v2', 3), rounded to 2 places, is 0.17."""
    feature_rows = [
        [1, 2, 3],
        [4, 5, 6],
        [6, 7, 8],
        [1, 1, 2],
        [-1, 4, 5],
        [0, 0, 1],
    ]
    labels = [0, 1, 1, 0, 1, 1]

    n = ct.Node(
        pd.DataFrame(feature_rows, columns=['v1', 'v2', 'v3']),
        pd.DataFrame(labels, columns=['y']),
    )

    reduction = n.impurity_reduction('v2', 3)
    # Round first, then convert to a plain float before comparing.
    rounded = float(round(reduction, 2))
    assert rounded == 0.17
def test6_R2():
    """Check the root split and the children of a tree built on a 4x3 array.

    Asserts that the root has split_variable 0 and split_point 72, and
    that neither child has a split variable.
    """
    test_data = np.array([[5, 34, 1], [38, 7, 1], [72, 15, 0], [52, 29, 1]])

    def Gini_index(p):
        return p * (1 - p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 0
    assert node_test.split_point == 72
    # Identity comparison is the idiomatic check for None and cannot be
    # affected by a custom __eq__.
    assert node_test.left.split_variable is None
    assert node_test.right.split_variable is None
def test_same_value():
    """
    Check that the tree splits on 'Ticket Fee' when every 'Age' value
    in the frame is the same (10).
    """
    columns = ['Name', 'Age', 'Ticket Fee', 'Y']
    rows = [
        [1, 10, 30, 1],
        [2, 10, 100, 0],
        [3, 10, 10, 1],
        [4, 10, 15, 1],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)

    chosen_variable = tree.split_variable
    assert chosen_variable == "Ticket Fee", " The Age column shoud be ignored"
def test_wrong_input_2():
    """
    Expect Node to raise AssertionError when the Y column is not binary:
    the second row carries a Y value of 3.
    """
    columns = ['Name', 'Age', 'Ticket Fee', 'Y']
    rows = [
        [1, 10, 30, 1],
        [2, 10, 100, 3],
        [3, 10, 10, 1],
        [4, 10, 15, 1],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    with pytest.raises(AssertionError):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test_wrong_input_1():
    """
    Expect Node to raise some Exception when the frame has a column of
    strings (the 'Name' column here).
    """
    columns = ['Name', 'Age', 'Ticket Fee', 'Y']
    rows = [
        ["Alice", 10, 30, 1],
        ["Ben", 10, 100, 0],
        ["Charles", 10, 10, 1],
        ["Don", 10, 15, 1],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    # Any Exception subclass satisfies this check.
    with pytest.raises(Exception):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test():
    """Check that a tree built on rows whose last column is all 0 has no children."""
    def Gini_index(p):
        return p * (1 - p)

    rows = [[25, 36, 78, 0], [10, 20, 15, 0], [99, 50, 4, 0]]
    node_test = classification_tree.Node(np.array(rows), Gini_index, [])

    # The right child is checked first, then the left.
    for child in (node_test.right, node_test.left):
        assert child is None
示例#14
0
def test_same_response():
    """Expect ct.Node to raise AssertionError when every response value is 1.

    pytest.raises is used instead of a try/except so that the test fails
    if no AssertionError is raised; a try/except with a no-op handler
    passes either way.
    """
    x_data = [[0, 2, 4], [1, 2, 3], [4, 3, 1]]
    x_df = pd.DataFrame(x_data, columns=['Gender', 'Age', 'Income'])
    y_data = [1, 1, 1]
    y_df = pd.DataFrame(y_data, columns=['Pass'])

    # Expected failure reason: "Y dataframe must have 0's and 1's"
    with pytest.raises(AssertionError):
        ct.Node(x_df, y_df)
def test_impurity_R1():
    """
    Build a tree on a frame with a single feature column ('Age') using
    classification_tree.bayers_error, then check the root's split value
    and split variable.
    """
    rows = [[1, 1], [2, 1], [3, 1], [4, 0]]
    test_case = pd.DataFrame(rows, columns=['Age', 'Y'])
    impurity_fn = classification_tree.bayers_error

    tree = classification_tree.Node(test_case, 'Y', impurity_fn)

    assert (tree.split_value == 4
            and tree.split_variable == 'Age'), \
        "The splitting point and variables are wrong"
示例#16
0
def test_missing_values():
    """Expect ct.Node to raise AssertionError when X contains a None entry.

    pytest.raises is used instead of a try/except so that the test fails
    if no AssertionError is raised; a try/except with a no-op handler
    passes either way.
    """
    # The second row has None in the 'Age' position.
    x_data = [[0, 2, 4], [1, None, 3], [4, 2, 1]]
    x_df = pd.DataFrame(x_data, columns=['Gender', 'Age', 'Income'])
    y_data = [1, 0, 1]
    y_df = pd.DataFrame(y_data, columns=['y'])

    # Expected failure reason: "X dataframe must have no missing data"
    with pytest.raises(AssertionError):
        ct.Node(x_df, y_df)
def test_impurity_R2():
    """
    Build a tree on a frame with two feature columns using
    classification_tree.cross_entropy, then check the split value and
    split variable of the root and of its left child.
    """
    rows = [[1, 2, 1], [2, 3, 0], [3, 4, 1], [4, 5, 1]]
    test_case = pd.DataFrame(rows, columns=['Age', 'Ticket Fee', 'Y'])
    impurity_fn = classification_tree.cross_entropy

    tree = classification_tree.Node(test_case, 'Y', impurity_fn)

    # Root node.
    assert (tree.split_value == 4
            and tree.split_variable == 'Ticket Fee'), \
        "The splitting point and variables are wrong"
    # Left child of the root.
    assert (tree.left_child.split_value == 3
            and tree.left_child.split_variable == 'Ticket Fee'), \
        "The splitting point and variables are wrong"
def test_valid_tree():
    """
    Build a tree on a small fixed six-row frame and check that the tree's
    own is_valid method, called with that same frame, returns a value
    equal to True.
    """
    columns = ['Age', 'Tciket Fee', 'Shirt Size', 'Y']
    rows = [
        [1, 2, 3, 1],
        [2, 3, 4, 1],
        [3, 4, 5, 1],
        [4, 5, 6, 0],
        [5, 6, 7, 1],
        [6, 7, 8, 0],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)

    validity = tree.is_valid(test_case)
    assert validity == True, "The tree should be valid"
def test4():
    """Check Node.impurity at the boundary arguments 1 and 0.

    A local cross-entropy function that returns 0 at p == 0 and p == 1 is
    passed in; the test asserts that Node.impurity yields 0 when called
    with 1 and with 0 as its second argument.
    """
    def Cross_entropy(p):
        # Guard the endpoints, where the log terms are not evaluated.
        if p == 0 or p == 1:
            return 0
        return -p * np.log(p) - (1-p) * np.log(1-p)

    rows = [[5, 2, 34, 0], [38, 20, 21, 0], [72, 20, 10, 0], [52, 20, 15, 1]]
    node_test = classification_tree.Node(np.array(rows), Cross_entropy, [])

    impurity = classification_tree.Node.impurity
    for boundary in (1, 0):
        assert impurity(node_test, boundary, Cross_entropy) == 0
def test2():
    """Check which nodes of a tree built on a 4x4 array carry a split variable.

    The original author's stated intent is to make sure the tree does not
    choose a split variable that has only one value for all rows.

    NOTE(review): the input is a plain NumPy array; if split_variable is
    an integer column index, the ``!= 'b'`` comparisons below would always
    hold -- confirm what value they are meant to exclude.
    """
    test_data = np.array([[5, 2, 34, 0], [38, 20, 21, 0], [72, 20, 10, 0], [52, 20, 15, 1]])

    def Gini_index(p):
        return p * (1 - p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    # Expected shape: the root and its left child each have a split
    # variable; the left-left grandchild and the right child have none.
    assert node_test.split_variable != 'b'
    assert node_test.left.split_variable != 'b'
    # Identity comparison is the idiomatic check for None.
    assert node_test.left.left.split_variable is None
    assert node_test.right.split_variable is None
def test_impurity_function():
    """
    Check that get_impurity, called with the gini function on a frame
    whose Y values are all 0, returns 0.
    """
    columns = ['Name', 'Age', 'Ticket Fee', 'Shirt Size', 'Y']
    rows = [[1, 2, 3, 4, 0], [2, 3, 4, 6, 0], [3, 5, 6, 7, 0]]
    test_case = pd.DataFrame(rows, columns=columns)

    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)

    score = tree.get_impurity(test_case, classification_tree.gini)
    assert score == 0, "The impurity score should be zero"
def test5_R1():
    """Check the splits of a tree built on an array with one feature column.

    Asserts that the root has split_variable 0 and split_point 72, that
    the left child has split_variable 0 and split_point 52, and that the
    right child has neither a split variable nor a split point.
    """
    test_data = np.array([[5, 0], [38, 0], [72, 0], [52, 1]])

    def Gini_index(p):
        return p * (1 - p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 0
    assert node_test.split_point == 72
    assert node_test.left.split_variable == 0
    assert node_test.left.split_point == 52
    # Identity comparison is the idiomatic check for None.
    assert node_test.right.split_variable is None
    assert node_test.right.split_point is None
def test_impurity_R3():
    """
    Build a tree on a frame with three feature columns using the gini
    function, then check the split value and split variable of the root,
    its right child, and that child's left child.
    """
    columns = ['Age', 'Tciket Fee', 'Shirt Size', 'Y']
    rows = [
        [1, 2, 3, 1],
        [2, 3, 4, 1],
        [3, 4, 5, 1],
        [4, 5, 6, 0],
        [5, 6, 7, 1],
        [6, 7, 8, 0],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    tree = classification_tree.Node(test_case, 'Y', classification_tree.gini)

    # Root node.
    assert (tree.split_value == 6
            and tree.split_variable == 'Shirt Size'), \
        "The splitting point and variables are wrong"
    # Right child of the root.
    assert (tree.right_child.split_value == 8
            and tree.right_child.split_variable == 'Shirt Size'), \
        "The splitting point and variables are wrong"
    # Left child of the root's right child.
    assert (tree.right_child.left_child.split_value == 7
            and tree.right_child.left_child.split_variable == 'Shirt Size'), \
        "The splitting point and variables are wrong"
def test_missing_value():
    """
    Check that Node raises AssertionError after None values have been
    assigned into the frame.

    NOTE(review): assigning through ``test_case[(row, col)]`` on a
    DataFrame looks like it creates a column keyed by the tuple rather
    than setting a single cell -- confirm this is the intended way to
    inject missing data.
    """
    columns = ['Age', 'Tciket Fee', 'Shirt Size', 'Y']
    rows = [
        [1, 2, 3, 1],
        [2, 3, 4, 1],
        [3, 4, 5, 1],
        [4, 5, 6, 0],
        [5, 6, 7, 1],
        [6, 7, 8, 0],
    ]
    test_case = pd.DataFrame(rows, columns=columns)

    for key in ((2, 4), (3, 3)):
        test_case[key] = None

    with pytest.raises(AssertionError):
        classification_tree.Node(test_case, 'Y', classification_tree.gini)
def test7_R3():
    """Check the splits of a tree built on a 6x4 array, node by node.

    Expected structure, as asserted below:
      root             -> variable 1, point 59
      left             -> variable 0, point 34
      left.left        -> no split variable, no split point
      left.right       -> variable 0, point 38
      left.right.right -> no split variable
      left.right.left  -> no split variable
      right            -> no split variable
    """
    test_data = np.array([[16, 24, 34, 1], [38, 20, 21, 0], [14, 20, 10, 1], [52, 20, 15, 1], [32, 59, 1, 0], [34, 21, 69, 1]])

    def Gini_index(p):
        return p * (1 - p)

    node_test = classification_tree.Node(test_data, Gini_index, [])
    assert node_test.split_variable == 1
    assert node_test.split_point == 59
    assert node_test.left.split_variable == 0
    assert node_test.left.split_point == 34
    # Identity comparison is the idiomatic check for None.
    assert node_test.left.left.split_variable is None
    assert node_test.left.left.split_point is None
    assert node_test.left.right.split_variable == 0
    assert node_test.left.right.split_point == 38
    assert node_test.left.right.right.split_variable is None
    assert node_test.left.right.left.split_variable is None
    assert node_test.right.split_variable is None
示例#26
0
def test_same_value_x():
    """Expect ct.Node to emit a UserWarning when column 'var1' is constant."""
    # Every row has var1 == 1; only var2 varies.
    feature_rows = [[1, 2], [1, 4], [1, 6], [1, 7]]
    x_df = pd.DataFrame(feature_rows, columns=['var1', 'var2'])
    y_df = pd.DataFrame([0, 0, 0, 1], columns=['y'])

    with pytest.warns(UserWarning):
        ct.Node(x_df, y_df)