def test_serialization_fit_model(self):
    """A fitted tree serialized with ``to_dict`` is rebuilt identically by ``from_dict``."""
    # Setup
    frame = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    instance = Tree(TreeTypes.REGULAR)
    kendall_tau = frame.corr(method='kendall').values

    # One column of CDF values per input column.
    cdf_values = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_values[:, position] = kde.cumulative_distribution(frame[name])

    instance.fit(0, frame.shape[1], kendall_tau, cdf_values)

    # Run
    serialized = instance.to_dict()
    result = Tree.from_dict(serialized)

    # Check
    assert result.to_dict() == instance.to_dict()
def test_second_tree_likelihood(self):
    """Check the likelihood of a second CENTER tree built on top of a fitted first tree."""
    # Setup: fit the first tree on the dataset read from disk.
    iris = pd.read_csv('data/iris.data.csv')
    kendall_tau = iris.corr(method='kendall').values

    cdf_values = np.empty(iris.shape)
    for position, name in enumerate(iris):
        kde = GaussianKDE()
        kde.fit(iris[name])
        cdf_values[:, position] = kde.cumulative_distribution(iris[name])

    first_tree = Tree(TreeTypes.CENTER)
    first_tree.fit(0, 4, kendall_tau, cdf_values)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])
    # Only the second element of the returned pair feeds the second tree.
    _first_likelihood, conditional_uni_first = first_tree.get_likelihood(sample)
    first_tree_tau = first_tree.get_tau_matrix()

    # Setup: fit the second tree from the first one.
    second_tree = Tree(TreeTypes.CENTER)
    second_tree.fit(1, 3, first_tree_tau, first_tree)

    expected_likelihood_second_tree = 0.4888802429313932

    # Run
    second_output = second_tree.get_likelihood(conditional_uni_first)
    likelihood_second_tree, _unused_u = second_output

    # Check
    assert compare_values_epsilon(likelihood_second_tree, expected_likelihood_second_tree)
def test_second_tree_likelihood(self):
    """Compute the likelihood of a second REGULAR tree fitted from ``self.tree``.

    NOTE(review): nothing is asserted here; the test only passes or fails on
    whether the calls below raise.
    """
    first_tree = self.tree
    first_tree_tau = first_tree.get_tau_matrix()

    second_tree = Tree(TreeTypes.REGULAR)
    second_tree.fit(1, 3, first_tree_tau, first_tree)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])
    _first_value, conditional_u = self.tree.get_likelihood(sample)
    _second_value, _out_u = second_tree.get_likelihood(conditional_u)
class TestDirectTree(TestCase):
    """Tests for a ``TreeTypes.DIRECT`` tree fitted on ``data/iris.data.csv``."""

    def setUp(self):
        """Load the dataset and fit a first-level DIRECT tree before each test."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        # enumerate replaces the hand-maintained column counter.
        for count, col in enumerate(self.data):
            uni = KDEUnivariate()
            uni.fit(self.data[col])
            # The CDF is evaluated one value at a time for this distribution.
            self.u_matrix[:, count] = [uni.cumulative_distribution(x) for x in self.data[col]]

        self.tree = Tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """Assert the first edge has node 0 as its left end."""
        assert self.tree.edges[0].L == 0

    def test_first_tree_likelihood(self):
        """Assert the first tree likelihood matches the expected value."""
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        value, new_u = self.tree.get_likelihood(uni_matrix)

        expected = -0.1207611551427385
        # 10E-3 is 0.01.
        assert abs(value - expected) < 10E-3

    def test_get_constraints(self):
        """Assert ``_get_constraints`` assigns the expected neighbors to the first two edges."""
        self.tree._get_constraints()

        assert self.tree.edges[0].neighbors == [1]
        assert self.tree.edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """Assert the tau matrix is not made up entirely of NaN values.

        NOTE(review): ``.all()`` only rejects a matrix where every entry is NaN;
        individual NaN entries still pass. Confirm whether ``.any()`` was intended.
        """
        self.tau = self.tree.get_tau_matrix()

        test = np.isnan(self.tau)

        self.assertFalse(test.all())

    def test_second_tree_likelihood(self):
        """Assert the second tree likelihood matches the expected value."""
        tau = self.tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.DIRECT)
        second_tree.fit(1, 3, tau, self.tree)

        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        # The second element of the first tree's result is the second tree's input.
        first_value, new_u = self.tree.get_likelihood(uni_matrix)
        second_value, out_u = second_tree.get_likelihood(new_u)

        expected = 0.7184205492690413
        assert abs(second_value - expected) < 10E-3
def train_vine(self, tree_type):
    """Build the vine.

    1. For the construction of the first tree :math:`T_1`, assign one node to each variable
       and then couple them by maximizing the measure of association considered.
       Different vines impose different constraints on this construction. When those are
       applied different trees are achieved at this level.

    2. Select the copula that best fits the pair of variables coupled by each edge in
       :math:`T_1`.

    3. Let :math:`C_{ij}(u_i , u_j )` be the copula for a given edge :math:`(u_i, u_j)`
       in :math:`T_1`. Then for every edge in :math:`T_1`, compute either

       .. math:: {v^1}_{j|i} = \\frac{\\partial C_{ij}(u_i, u_j)}{\\partial u_j}

       or similarly :math:`{v^1}_{i|j}`, which are conditional cdfs. When finished with
       all the edges, construct the new matrix with :math:`v^1` that has one less column
       than u.

    4. Set k = 2.

    5. Assign one node of :math:`T_k` to each edge of :math:`T_{k-1}`. The structure of
       :math:`T_{k-1}` imposes a set of constraints on which edges of :math:`T_k` are
       realizable. Hence the next step is to get a linked list of the accessible nodes for
       every node in :math:`T_k`.

    6. As in step 1, nodes of :math:`T_k` are coupled maximizing the measure of association
       considered and satisfying the constraints imposed by the kind of vine employed plus
       the set of constraints imposed by tree :math:`T_{k-1}`.

    7. Select the copula that best fits each edge created in :math:`T_k`.

    8. Recompute matrix :math:`v_k` as in step 3, but taking :math:`T_k` and
       :math:`v_{k-1}` instead of :math:`T_1` and u.

    9. Set :math:`k = k + 1` and repeat from (5) until all the trees are constructed.

    Args:
        tree_type: Value handed to ``Tree(...)`` for every tree that is built
            (presumably a ``TreeTypes`` member; confirm against callers).

    The method reads ``self.n_var``, ``self.tau_mat``, ``self.u_matrix`` and
    ``self.truncated``, and appends each fitted tree to ``self.trees``.
    """
    LOGGER.debug('start building tree : 0')
    # Step 1: the first tree is fitted directly on the stored tau and u matrices.
    tree_1 = Tree(tree_type)
    tree_1.fit(0, self.n_var, self.tau_mat, self.u_matrix)
    self.trees.append(tree_1)
    LOGGER.debug('finish building tree : 0')

    for k in range(1, min(self.n_var - 1, self.truncated)):
        # get constraints from previous tree
        self.trees[k - 1]._get_constraints()
        tau = self.trees[k - 1].get_tau_matrix()

        # Lazy %-style arguments: the message is only formatted if DEBUG is enabled.
        LOGGER.debug('start building tree: %s', k)
        tree_k = Tree(tree_type)
        # Tree k has one node fewer than tree k - 1 and is fitted from it.
        tree_k.fit(k, self.n_var - k, tau, self.trees[k - 1])
        self.trees.append(tree_k)
        LOGGER.debug('finish building tree: %s', k)
def test_second_tree_likelihood(self):
    """Check the likelihood of a second CENTER tree fitted from ``self.tree``."""
    first_tree_tau = self.tree.get_tau_matrix()

    second_tree = Tree(TreeTypes.CENTER)
    second_tree.fit(1, 3, first_tree_tau, self.tree)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])

    # The second element of the first tree's result is fed to the second tree.
    _first_value, conditional_u = self.tree.get_likelihood(sample)
    second_value, _out_u = second_tree.get_likelihood(conditional_u)

    expected = 0.540089320412914
    tolerance = 10E-3  # i.e. 0.01
    assert abs(second_value - expected) < tolerance
def test_second_tree_likelihood(self):
    """Check the likelihood of a second DIRECT tree fitted from ``self.tree``."""
    first_tree_tau = self.tree.get_tau_matrix()

    second_tree = Tree(TreeTypes.DIRECT)
    second_tree.fit(1, 3, first_tree_tau, self.tree)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])

    # The second element of the first tree's result is fed to the second tree.
    _first_value, conditional_u = self.tree.get_likelihood(sample)
    second_value, _out_u = second_tree.get_likelihood(conditional_u)

    expected = 0.24428294700258632
    tolerance = 10E-3  # i.e. 0.01
    assert abs(second_value - expected) < tolerance
def train_vine(self, tree_type):
    """Fit the sequence of trees and append each one to ``self.trees``.

    The first tree is fitted on ``self.tau_mat`` and ``self.u_matrix``. Every
    following tree is fitted from the tree before it, up to
    ``min(self.n_var - 1, self.truncated)`` trees in total.

    Args:
        tree_type: Value handed to ``Tree(...)`` for every tree that is built.
    """
    LOGGER.debug('start building tree : 0')
    root_tree = Tree(tree_type)
    root_tree.fit(0, self.n_var, self.tau_mat, self.u_matrix)
    self.trees.append(root_tree)
    LOGGER.debug('finish building tree : 0')

    tree_limit = min(self.n_var - 1, self.truncated)
    for k in range(1, tree_limit):
        previous_tree = self.trees[k - 1]

        # The previous tree's constraints must be computed before its tau matrix is read.
        previous_tree._get_constraints()
        previous_tau = previous_tree.get_tau_matrix()

        LOGGER.debug('start building tree: {0}'.format(k))
        next_tree = Tree(tree_type)
        next_tree.fit(k, self.n_var - k, previous_tau, previous_tree)
        self.trees.append(next_tree)
        LOGGER.debug('finish building tree: {0}'.format(k))
class TestDirectTree(TestCase):
    """Tests for a ``TreeTypes.DIRECT`` tree fitted on ``data/iris.data.csv``."""

    def setUp(self):
        """Load the dataset and fit a first-level DIRECT tree before each test."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        # enumerate replaces the hand-maintained column counter.
        for count, col in enumerate(self.data):
            uni = GaussianKDE()
            uni.fit(self.data[col])
            self.u_matrix[:, count] = uni.cumulative_distribution(self.data[col])

        self.tree = Tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """Assert the first edge has node 0 as its left end."""
        assert self.tree.edges[0].L == 0

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """Assert the first tree likelihood matches the expected value."""
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        value, new_u = self.tree.get_likelihood(uni_matrix)

        expected = -0.1207611551427385
        # 10E-3 is 0.01.
        assert abs(value - expected) < 10E-3

    def test_get_constraints(self):
        """Assert ``_get_constraints`` assigns the expected neighbors to the first two edges."""
        self.tree._get_constraints()

        assert self.tree.edges[0].neighbors == [1]
        assert self.tree.edges[1].neighbors == [0, 2]

    def test_get_tau_matrix_no_edges_empty(self):
        """get_tau_matrix returns an empty array if there are no edges."""
        # Setup
        tree = Tree(TreeTypes.DIRECT)
        tree.edges = []

        # Run
        result = tree.get_tau_matrix()

        # Check
        assert result.shape == (0, 0)

    def test_get_tau_matrix(self):
        """Assert the tau matrix is not made up entirely of NaN values.

        NOTE(review): ``.all()`` only rejects a matrix where every entry is NaN;
        individual NaN entries still pass. Confirm whether ``.any()`` was intended.
        """
        self.tau = self.tree.get_tau_matrix()

        test = np.isnan(self.tau)

        self.assertFalse(test.all())

    @pytest.mark.xfail
    def test_second_tree_likelihood(self):
        """Assert the second tree likelihood matches the expected value."""
        tau = self.tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.DIRECT)
        second_tree.fit(1, 3, tau, self.tree)

        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        # The second element of the first tree's result is the second tree's input.
        first_value, new_u = self.tree.get_likelihood(uni_matrix)
        second_value, out_u = second_tree.get_likelihood(new_u)

        expected = 0.24428294700258632
        assert abs(second_value - expected) < 10E-3
class TestRegularTree(TestCase):
    """Tests for a ``TreeTypes.REGULAR`` tree fitted on ``data/iris.data.csv``."""

    def setUp(self):
        """Load the dataset and fit a first-level REGULAR tree before each test."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        # enumerate replaces the hand-maintained column counter.
        for count, col in enumerate(self.data):
            uni = GaussianKDE()
            uni.fit(self.data[col])
            self.u_matrix[:, count] = uni.cumulative_distribution(self.data[col])

        self.tree = Tree(TreeTypes.REGULAR)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """Assert the construction of the first tree is correct.

        The first tree should be::

               1
               |
            0--2--3
        """
        sorted_edges = Edge.sort_edge(self.tree.edges)

        assert sorted_edges[0].L == 0
        assert sorted_edges[0].R == 2
        assert sorted_edges[1].L == 1
        assert sorted_edges[1].R == 2
        assert sorted_edges[2].L == 2
        assert sorted_edges[2].R == 3

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """Assert the first tree likelihood matches the expected value."""
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        value, new_u = self.tree.get_likelihood(uni_matrix)

        expected = 0.9545348664739628
        # 10E-3 is 0.01.
        assert abs(value - expected) < 10E-3

    def test_get_constraints(self):
        """Assert ``_get_constraints`` assigns the expected neighbors to the first two edges."""
        self.tree._get_constraints()

        assert self.tree.edges[0].neighbors == [1, 2]
        assert self.tree.edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """Assert the tau matrix is not made up entirely of NaN values.

        NOTE(review): ``.all()`` only rejects a matrix where every entry is NaN;
        individual NaN entries still pass. Confirm whether ``.any()`` was intended.
        """
        self.tau = self.tree.get_tau_matrix()

        test = np.isnan(self.tau)

        self.assertFalse(test.all())

    def test_second_tree_likelihood(self):
        """Compute the likelihood of a second REGULAR tree fitted from ``self.tree``.

        NOTE(review): nothing is asserted here; the test only passes or fails on
        whether the calls below raise.
        """
        tau = self.tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.REGULAR)
        second_tree.fit(1, 3, tau, self.tree)

        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        first_value, new_u = self.tree.get_likelihood(uni_matrix)
        second_value, out_u = second_tree.get_likelihood(new_u)
def test_to_dict_fit_model(self):
    """``to_dict`` on a fitted REGULAR tree is compared against the expected structure."""
    # Setup
    frame = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    instance = Tree(TreeTypes.REGULAR)
    kendall_tau = frame.corr(method='kendall').values

    cdf_values = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_values[:, position] = kde.cumulative_distribution(frame[name])

    instance.fit(0, frame.shape[1], kendall_tau, cdf_values)

    # Values that repeat throughout the expected dictionary, named once.
    tau = -0.49999999999999994
    theta = -5.736282443655552
    diag = 0.8230112726144534
    off_diag = 0.3384880496294825
    u_high = 0.7969535322648066
    u_mid = 0.6887525261721343
    u_low = 0.12077958383821545

    first_edge = {
        'index': 0,
        'D': set(),
        'L': 0,
        'R': 1,
        'U': [
            [u_high, u_mid, u_low],
            [u_mid, u_high, u_low]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }
    second_edge = {
        'index': 1,
        'D': set(),
        'L': 1,
        'R': 2,
        'U': [
            [u_low, u_high, u_mid],
            [u_low, u_mid, u_high]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }

    expected_result = {
        'type': 'copulas.multivariate.tree.RegularTree',
        'fitted': True,
        'level': 1,
        'n_nodes': 3,
        'previous_tree': [
            [diag, off_diag, off_diag],
            [off_diag, diag, off_diag],
            [off_diag, off_diag, diag]
        ],
        'tau_matrix': [
            [1.0, tau, tau],
            [tau, 1.0, tau],
            [tau, tau, 1.0]
        ],
        'tree_type': TreeTypes.REGULAR,
        'edges': [first_edge, second_edge],
    }

    # Run
    result = instance.to_dict()

    # Check
    compare_nested_dicts(result, expected_result)
class TestCenterTree(TestCase):
    """Tests for a ``TreeTypes.CENTER`` tree fitted on ``data/iris.data.csv``."""

    def setUp(self):
        """Load the dataset and fit a first-level CENTER tree before each test."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        # enumerate replaces the hand-maintained column counter, matching the
        # loop already used in test_second_tree_likelihood.
        for count, col in enumerate(self.data):
            uni = GaussianKDE()
            uni.fit(self.data[col])
            self.u_matrix[:, count] = uni.cumulative_distribution(self.data[col])

        self.tree = Tree(TreeTypes.CENTER)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """Assert the first edge has node 0 as its left end."""
        assert self.tree.edges[0].L == 0

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """Assert the first tree likelihood matches the expected value."""
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])

        value, new_u = self.tree.get_likelihood(uni_matrix)

        expected = -0.19988720707143634
        # 10E-3 is 0.01.
        assert abs(value - expected) < 10E-3

    def test_get_constraints(self):
        """Assert ``_get_constraints`` assigns the expected neighbors to the first two edges."""
        self.tree._get_constraints()

        assert self.tree.edges[0].neighbors == [1, 2]
        assert self.tree.edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """Assert the tau matrix is not made up entirely of NaN values.

        NOTE(review): ``.all()`` only rejects a matrix where every entry is NaN;
        individual NaN entries still pass. Confirm whether ``.any()`` was intended.
        """
        self.tau = self.tree.get_tau_matrix()

        test = np.isnan(self.tau)

        self.assertFalse(test.all())

    @pytest.mark.xfail
    def test_second_tree_likelihood(self):
        """Assert the second tree likelihood matches the expected value."""
        # Setup
        # Build first tree (rebuilt locally rather than reusing the one from setUp)
        data = pd.read_csv('data/iris.data.csv')
        tau_mat = data.corr(method='kendall').values
        u_matrix = np.empty(data.shape)

        for index, col in enumerate(data):
            uni = GaussianKDE()
            uni.fit(data[col])
            u_matrix[:, index] = uni.cumulative_distribution(data[col])

        first_tree = Tree(TreeTypes.CENTER)
        first_tree.fit(0, 4, tau_mat, u_matrix)
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])
        likelihood_first_tree, conditional_uni_first = first_tree.get_likelihood(uni_matrix)
        tau = first_tree.get_tau_matrix()

        # Build second tree
        second_tree = Tree(TreeTypes.CENTER)
        second_tree.fit(1, 3, tau, first_tree)
        expected_likelihood_second_tree = 0.4888802429313932

        # Run
        likelihood_second_tree, out_u = second_tree.get_likelihood(conditional_uni_first)

        # Check
        assert compare_values_epsilon(likelihood_second_tree, expected_likelihood_second_tree)
def test_to_dict_fit_model(self):
    """``to_dict`` on a fitted REGULAR tree is compared against the expected structure."""
    # Setup
    frame = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    instance = Tree(TreeTypes.REGULAR)
    kendall_tau = frame.corr(method='kendall').values

    cdf_values = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = KDEUnivariate()
        kde.fit(frame[name])
        # The CDF is evaluated one value at a time for this distribution.
        cdf_values[:, position] = [kde.cumulative_distribution(value) for value in frame[name]]

    instance.fit(0, frame.shape[1], kendall_tau, cdf_values)

    # Values that repeat throughout the expected dictionary, named once.
    tau = -0.49999999999999994
    theta = -5.736282443655552
    diag = 0.8230112726144534
    off_diag = 0.3384880496294825
    u_high = 6.533235975920359
    u_mid = 6.425034969827687
    u_low = 5.857062027493768

    first_edge = {
        'D': set(),
        'L': 0,
        'R': 1,
        'U': [
            [u_high, u_mid, u_low],
            [u_mid, u_high, u_low]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }
    second_edge = {
        'D': set(),
        'L': 1,
        'R': 2,
        'U': [
            [u_low, u_high, u_mid],
            [u_low, u_mid, u_high]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }

    expected_result = {
        'type': 'copulas.multivariate.tree.RegularTree',
        'fitted': True,
        'level': 1,
        'n_nodes': 3,
        'previous_tree': [
            [diag, off_diag, off_diag],
            [off_diag, diag, off_diag],
            [off_diag, off_diag, diag]
        ],
        'tau_matrix': [
            [1.0, tau, tau],
            [tau, 1.0, tau],
            [tau, tau, 1.0]
        ],
        'tree_type': TreeTypes.REGULAR,
        'edges': [first_edge, second_edge],
    }

    # Run
    result = instance.to_dict()

    # Check
    compare_nested_dicts(result, expected_result)