def test_apply_until_unbalanced(unbalanced_tree):
    X, _, tree = unbalanced_tree

    # all samples are in the root
    expected = np.ones((9, 1))
    root = tree_utils.apply_until(tree, X, depth=0).toarray()
    np.testing.assert_allclose(root, expected)

    # three samples go left and the rest go right
    expected = np.array([[1, 0],
                         [1, 0],
                         [1, 0],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1],
                         [0, 1]])
    depth_one = tree_utils.apply_until(tree, X, depth=1).toarray()
    np.testing.assert_allclose(depth_one, expected)

    # the remaining six are split at the next level
    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    depth_two = tree_utils.apply_until(tree, X, depth=2).toarray()
    np.testing.assert_allclose(depth_two, expected)
def test_node_similarity_XY_long_similarity(unbalanced_tree):
    X, _, tree = unbalanced_tree
    Y = X[:2]

    nodes_X = tree_utils.apply_until(tree, X, depth=2)
    nodes_Y = tree_utils.apply_until(tree, Y, depth=2)

    S_expected = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0, 0, 0, 0]]).T
    S = tree_utils.node_similarity(nodes_X, nodes_Y)

    assert S.shape == (len(X), len(Y))
    np.testing.assert_allclose(S, S_expected)
def test_apply_until_balanced(balanced_tree):
    X, _, tree = balanced_tree

    # all samples are in the root
    expected = np.ones((6, 1))
    root = tree_utils.apply_until(tree, X, depth=0).toarray()
    np.testing.assert_allclose(root, expected)

    # three samples go left and three samples go right
    expected = np.array([[1, 0],
                         [1, 0],
                         [1, 0],
                         [0, 1],
                         [0, 1],
                         [0, 1]])
    depth_one = tree_utils.apply_until(tree, X, depth=1).toarray()
    np.testing.assert_allclose(depth_one, expected)
def test_apply_until_does_not_drop_columns(unbalanced_tree):
    """Test that if X does not occupy all nodes the indicator matrix
    still includes them."""
    X, _, tree = unbalanced_tree

    expected = np.array([[1, 0, 0],
                         [1, 0, 0]])
    depth_two = tree_utils.apply_until(tree, X[:2], depth=2).toarray()
    np.testing.assert_allclose(depth_two, expected)
def test_node_similarity_balanced(balanced_tree):
    X, _, tree = balanced_tree

    # this is a balanced tree, so targets are grouped together
    S_expected = np.array([[1, 1, 1, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0],
                           [1, 1, 1, 0, 0, 0],
                           [0, 0, 0, 1, 1, 1],
                           [0, 0, 0, 1, 1, 1],
                           [0, 0, 0, 1, 1, 1]])
    node_indicators = tree_utils.apply_until(tree, X, depth=-1)
    S = tree_utils.node_similarity(node_indicators)
    np.testing.assert_allclose(S, S_expected)
def test_apply_until_negative_depth(unbalanced_tree):
    """Test that depth == -1 returns the leaf nodes."""
    X, _, tree = unbalanced_tree

    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    leaves = tree_utils.apply_until(tree, X, depth=-1).toarray()
    np.testing.assert_allclose(leaves, expected)
def test_apply_until_too_high_depth(unbalanced_tree):
    """Test that a depth larger than max_depth returns the leaf nodes."""
    X, _, tree = unbalanced_tree

    # a depth beyond the tree's maximum falls back to the leaf partition
    expected = np.array([[1, 0, 0],
                         [1, 0, 0],
                         [1, 0, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 1, 0],
                         [0, 0, 1],
                         [0, 0, 1],
                         [0, 0, 1]])
    leaves = tree_utils.apply_until(tree, X, depth=100).toarray()
    np.testing.assert_allclose(leaves, expected)
def random_partitions_kernel(forest, X, Y=None, tree_depths='random',
                             random_state=123):
    """Random partition kernel induced by an ensemble of decision trees.

    A random partition kernel is a kernel function induced by a
    distribution over partitions of a dataset (a random partition). Since
    an ensemble of trees, such as a random forest, partitions a dataset
    into groups (the tree nodes), these models can be thought of as random
    partition generators and so induce a kernel function. By repeatedly
    cutting a dataset into random partitions, we would expect data points
    that are similar to each other to be grouped together more often than
    dissimilar ones. Likewise, nodes in a decision tree should contain
    similar data points. In order to sample the whole hierarchical
    structure of the forest, a depth is chosen at random for each tree and
    the resulting partitions are accumulated. The kernel is as follows:

                  Number of times x_i and x_j occur in the same node
        K(i, j) = --------------------------------------------------
                        Total number of trees in the ensemble

    Parameters
    ----------
    forest : A class instance derived from `sklearn.ensemble.BaseForest`.
        The forest from which the kernel is calculated.

    X : array-like, shape = [n_samples_x, n_features]
        The data to train the kernel on.

    Y : array-like, shape = [n_samples_y, n_features], optional (default=None)
        A second dataset. If None, the kernel is computed between X and
        itself.

    tree_depths : list or str, optional (default='random')
        A list of depths to use for each tree. If `tree_depths='random'`,
        then the depths are randomly sampled from a discrete uniform
        distribution between 1 and max_depth.

    random_state : int, RandomState instance or None, optional (default=123)
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    K : array-like, shape = [n_samples_x, n_samples_y]
        A kernel matrix K such that K_{i, j} is the similarity between the
        ith vector of X and the jth vector of Y (or of X if Y is None) [1].

    References
    ----------
    .. [1] A. Davies, Z. Ghahramani, "The Random Forest Kernel and other
           kernels for big data from random partitions", CoRR, 2014.
    """
    X = check_array(X, accept_sparse='csc')
    if Y is not None:
        Y = check_array(Y, accept_sparse='csr')

    if tree_depths == 'random':
        tree_depths = sample_depths(forest, random_state=random_state)

    n_samples_x = X.shape[0]
    n_samples_y = Y.shape[0] if Y is not None else n_samples_x
    kernel = np.zeros(shape=(n_samples_x, n_samples_y))
    for tree_idx, tree in enumerate(forest.estimators_):
        node_indicator_X = tree_utils.apply_until(
            tree, X, depth=tree_depths[tree_idx])

        if Y is not None:
            node_indicator_Y = tree_utils.apply_until(
                tree, Y, depth=tree_depths[tree_idx])
        else:
            node_indicator_Y = node_indicator_X

        kernel += tree_utils.node_similarity(node_indicator_X,
                                             node_indicator_Y)

    return kernel / len(forest.estimators_)
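# A minimal usage sketch, not part of the module's API: it assumes this file
# is run directly and that `random_partitions_kernel` is the function defined
# above. It fits a small forest on toy data and checks two properties that
# follow from the kernel definition: the shape is (n_samples, n_samples) and
# the diagonal is 1, since every sample shares a node with itself in every
# tree at any depth.
if __name__ == '__main__':
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier

    X_demo, y_demo = make_classification(n_samples=20, n_features=4,
                                         random_state=0)
    forest = RandomForestClassifier(n_estimators=10,
                                    random_state=0).fit(X_demo, y_demo)

    K = random_partitions_kernel(forest, X_demo, random_state=123)
    assert K.shape == (20, 20)
    # each sample always co-occurs with itself, so K(i, i) == 1
    assert np.allclose(np.diag(K), 1.0)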