예제 #1
0
def test_two_leaves():
    """Check leaf membership and per-leaf stratification output for two leaves.

    The fixture has two distinct x1 values, and the expected leaves pair up
    the samples that share an x1 value. In the expected matrices each row
    corresponds to an x2 value (0 through 4) and each column to a leaf.
    """
    np.random.seed(999)
    records = [
        # x1, x2, y         stratify x1, consider y ~ x2
        [1, 3, 5],
        [2, 4, 6],
        [1, 4, 5],
        [2, 2, 4],
    ]
    frame = pd.DataFrame(records, columns=['x1', 'x2', 'y'])
    y = frame['y']
    X = frame.drop('y', axis=1)

    # Sample indexes grouped by the leaf they fall into.
    want_leaves = [
        np.array([0, 2]),  # leaf 0
        np.array([1, 3]),  # leaf 1
    ]
    got_leaves = get_leaves(X, y, 'x2')
    np.testing.assert_array_equal(got_leaves, want_leaves)

    want_deltas = np.array([
        [nan, nan],  # x2 == 0
        [nan, nan],  # x2 == 1
        [nan, 0],    # x2 == 2
        [0, nan],    # x2 == 3
        [0, 2],      # x2 == 4
    ])
    want_counts = np.array([
        [0, 0],
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1],
    ])
    want_refcats = np.array([4, 2])  # one reference category per leaf

    leaf_deltas, leaf_counts, refcats, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=2)

    np.testing.assert_array_almost_equal(leaf_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, want_counts)
    np.testing.assert_array_equal(refcats, want_refcats)
    assert ignored == 0
예제 #2
0
def test_single_leaf():
    """Check stratification output when every sample lands in one leaf.

    x1 is constant in the fixture and the expected result is a single leaf
    holding all four samples. The expected matrices therefore have a single
    column, with one row per x2 value (0 through 4).
    """
    np.random.seed(999)
    records = [
        # x1, x2, y         stratify x1, consider y ~ x2
        [1, 3, 5],
        [1, 4, 6],
        [1, 4, 5],
        [1, 2, 4],
    ]
    frame = pd.DataFrame(records, columns=['x1', 'x2', 'y'])
    y = frame['y']
    X = frame.drop('y', axis=1)

    # All four sample indexes are expected in leaf 0.
    want_leaves = [np.array([0, 1, 2, 3])]
    got_leaves = get_leaves(X, y, 'x2')
    np.testing.assert_array_equal(got_leaves, want_leaves)

    # Column vectors of shape (5, 1): one row per x2 value, one leaf.
    want_deltas = np.array([[nan], [nan], [-1], [0], [.5]])
    want_counts = np.array([[0], [0], [1], [1], [2]])
    want_refcats = np.array([3])

    leaf_deltas, leaf_counts, refcats, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=4)

    np.testing.assert_array_almost_equal(leaf_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, want_counts)
    np.testing.assert_array_equal(refcats, want_refcats)
    assert ignored == 0
예제 #3
0
def test_three_leaves_no_overlap():
    """Check stratification output for three leaves that share no x2 values.

    Each x1 value in the fixture owns two samples whose x2 values appear in
    no other group, so every row of the expected matrices has an entry in at
    most one leaf column. Rows correspond to x2 values 0 through 7.
    """
    np.random.seed(999)
    records = [
        # x1, x2, y         stratify x1, consider y ~ x2
        [1, 2, 9],
        [1, 3, 7],
        [3, 4, 6],
        [3, 5, 5],
        [4, 6, 4],
        [4, 7, 3],
    ]
    frame = pd.DataFrame(records, columns=['x1', 'x2', 'y'])
    y = frame['y']
    X = frame.drop('y', axis=1)

    # Sample indexes grouped by the leaf they fall into.
    want_leaves = [
        np.array([0, 1]),  # leaf 0
        np.array([2, 3]),  # leaf 1
        np.array([4, 5]),  # leaf 2
    ]
    got_leaves = get_leaves(X, y, 'x2', min_samples_leaf=2)
    np.testing.assert_array_equal(got_leaves, want_leaves)

    leaf_deltas, leaf_counts, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=2)
    print(leaf_deltas, leaf_counts)

    want_deltas = np.array([
        [nan, nan, nan],  # x2 == 0
        [nan, nan, nan],  # x2 == 1
        [2, nan, nan],    # x2 == 2
        [0, nan, nan],    # x2 == 3
        [nan, 1, nan],    # x2 == 4
        [nan, 0, nan],    # x2 == 5
        [nan, nan, 1],    # x2 == 6
        [nan, nan, 0],    # x2 == 7
    ])
    want_counts = np.array([
        [0, 0, 0],
        [0, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 1],
    ])

    np.testing.assert_array_almost_equal(leaf_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, want_counts)
    assert ignored == 0
예제 #4
0
def test_two_leaves_with_2nd_ignored():
    """Check stratification output when the second leaf has a single x2 value.

    Samples 2 and 3 form the second expected leaf and both have x2 == 4.
    Rows of the expected matrices correspond to x2 values 0 through 4 and
    columns to leaves.
    """
    np.random.seed(999)
    records = [
        # x1, x2, y         stratify x1, consider y ~ x2
        [1, 3, 5],
        [1, 4, 6],
        [2, 4, 7],
        [2, 4, 8],
    ]
    frame = pd.DataFrame(records, columns=['x1', 'x2', 'y'])
    y = frame['y']
    X = frame.drop('y', axis=1)

    # NOTE(review): an earlier note in this test said the second leaf
    # (indexes 2,3, which share the same x2 value) must be ignored, leaving
    # one leaf with cats 3 and 4 and a single-column leaf_deltas of
    # [[nan], [nan], [nan], [0.], [1.]]. The assertions below instead expect
    # two columns and ignored == 0 -- confirm which behavior is intended.

    # Sample indexes grouped by the leaf they fall into.
    want_leaves = [
        np.array([0, 1]),  # leaf 0
        np.array([2, 3]),  # leaf 1
    ]
    got_leaves = get_leaves(X, y, 'x2')
    np.testing.assert_array_equal(got_leaves, want_leaves)

    leaf_deltas, leaf_counts, ignored = stratify_cats(
        X, y, colname="x2", min_samples_leaf=2)

    want_deltas = np.array([
        [nan, nan],  # x2 == 0
        [nan, nan],  # x2 == 1
        [nan, nan],  # x2 == 2
        [0, nan],    # x2 == 3
        [1, 0],      # x2 == 4
    ])
    want_counts = np.array([
        [0, 0],
        [0, 0],
        [0, 0],
        [1, 0],
        [1, 2],
    ])

    np.testing.assert_array_almost_equal(leaf_deltas, want_deltas, decimal=1)
    np.testing.assert_array_equal(leaf_counts, want_counts)
    assert ignored == 0
예제 #5
0
def speed_ModelID():
    """Time ``avg_values_at_cat`` on data stratified by the ``ModelID`` column.

    Loads ``n`` records via ``load_bulldozer``, stratifies them with
    ``stratify_cats`` and prints how long one ``avg_values_at_cat`` call takes.

    Original author's note: I believe none of this is in the JIT path;
    repeated runs are same speed.
    """
    np.random.seed(1)

    min_samples_leaf = 5
    n = 20_000
    X, y = load_bulldozer(n=n)

    leaf_deltas, leaf_counts, ignored = stratify_cats(
        X, y, colname="ModelID", min_samples_leaf=min_samples_leaf)

    # Only the averaging step sits inside the timed region; loading and
    # stratification above are deliberately excluded.
    start = timer()
    avg_values_at_cat(leaf_deltas, leaf_counts, max_iter=10)
    stop = timer()

    # np.unique returns a flat array, so its size is the distinct-value count.
    nunique = np.unique(X['ModelID']).size
    print(f"n={n}, unique cats {nunique}, min_samples_leaf={min_samples_leaf}: avg_values_at_cat {stop - start:.3f}s")