def dasgupta_cost(tree, edge_weights, leaf_graph):
    r"""
    Dasgupta's cost is an unsupervised measure of the quality of a hierarchical
    clustering of an edge weighted graph.

    Let :math:`T` be a tree representing a hierarchical clustering of the graph
    :math:`G=(V, E)` and let :math:`w` be a dissimilarity function on the edges
    :math:`E` of the graph. Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} \frac{area(lca_T(x,y))}{w(\{x,y\})}

    :See:

        S. Dasgupta. "`A cost function for similarity-based hierarchical clustering
        <https://arxiv.org/pdf/1510.05043.pdf>`_."
        In Proc. STOC, pages 118–127, Cambridge, MA, USA, 2016

    :Complexity:

        The runtime complexity is :math:`\mathcal{O}(n\log(n) + m)` with :math:`n`
        the number of nodes in :math:`T` and :math:`m` the number of edges in :math:`E`.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph (dissimilarities)
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :return: a real number
    """
    # Area (number of leaves) of every node of the tree.
    node_area = hg.attribute_area(tree, leaf_graph=leaf_graph)
    # Lowest common ancestor of the two extremities of each graph edge.
    lca_fast = hg.make_lca_fast(tree)
    edge_lcas = lca_fast.lca(leaf_graph)
    # Sum over edges of lca-area divided by edge dissimilarity.
    return np.sum(node_area[edge_lcas] / edge_weights)
def dendrogram_purity_naif(tree, leaf_labels):
    """
    Naive (pair-enumerating) computation of the dendrogram purity of the given tree.

    For every pair of leaves sharing the same label, the purity of their lowest
    common ancestor (fraction of that node's leaves carrying the label) is
    accumulated; the result is the mean purity over all such pairs.

    :param tree: input tree
    :param leaf_labels: 1d integer array of non-negative class labels, one per leaf
        of ``tree``
    :return: dendrogram purity as a real number
    :raises ValueError: if no class contains at least two leaves (the purity is
        undefined in that case)
    """
    from itertools import combinations

    lcaf = hg.make_lca_fast(tree)
    area = hg.attribute_area(tree)

    # One-hot encode leaf labels, then accumulate upward: label_histo[n, c] is the
    # number of leaves of class c under node n.
    max_label = np.max(leaf_labels)
    label_histo = np.zeros((tree.num_leaves(), max_label + 1), dtype=np.int64)
    label_histo[np.arange(tree.num_leaves()), leaf_labels] = 1
    label_histo = hg.accumulate_sequential(tree, label_histo, hg.Accumulators.sum)
    # class_purity[n, c]: fraction of the leaves under node n that belong to class c.
    class_purity = label_histo / area[:, None]

    count = 0
    total = 0
    for label in set(leaf_labels):
        same = leaf_labels == label
        same_indices, = same.nonzero()
        if len(same_indices) < 2:
            # Singleton classes contribute no intra-class pair.
            continue
        pairs = list(combinations(same_indices, 2))
        count += len(pairs)
        pairs = np.asarray(pairs, dtype=np.int64)
        lcas = lcaf.lca(pairs[:, 0], pairs[:, 1])
        total += np.sum(class_purity[lcas, label])

    if count == 0:
        # Previously this fell through to total / count and raised an opaque
        # ZeroDivisionError; fail with an explicit message instead.
        raise ValueError(
            "dendrogram purity is undefined: no class contains at least two leaves")
    return total / count
def loss_triplet(graph, edge_weights, ultrametric, hierarchy, triplets, margin):
    r"""
    Triplet loss regularization with triplet set :math:`\mathcal{T}`:

    .. math::

        loss = \sum_{(ref, pos, neg)\in \mathcal{T}} \max(0, ultrametric(ref, pos) - ultrametric(ref, neg) + margin)

    :param graph: input graph (``higra.UndirectedGraph``)
    :param edge_weights: edge weights of the input graph (``torch.Tensor``, autograd is supported)
    :param ultrametric: ultrametric on the input graph (``torch.Tensor``, autograd is supported)
    :param hierarchy: optional, if provided must be a tuple ``(tree, altitudes)`` corresponding to
        the result of ``higra.bpt_canonical`` on the input edge weighted graph
    :param triplets: tuple ``(pairs, (pos, neg))`` where ``pairs`` are the leaf-index pairs whose
        ultrametric distances are compared, and ``pos``/``neg`` index the positive and negative
        pairs of each triplet  # NOTE(review): inferred from the unpacking below — confirm with callers
    :param margin: margin of the triplet loss (scalar)
    :return: loss value as a pytorch scalar
    """
    tree, altitudes = hierarchy
    lcaf = hg.make_lca_fast(tree)

    pairs, (pos, neg) = triplets
    # Ultrametric distance of a pair of leaves = altitude of their lowest common ancestor.
    pairs_distances = altitudes[lcaf.lca(*pairs)]
    # Hinge on (d(ref, pos) - d(ref, neg) + margin), averaged over triplets.
    triplet_loss = tc.relu(pairs_distances[pos] - pairs_distances[neg] + margin)
    return triplet_loss.mean()
def attribute_lca_map(tree, leaf_graph):
    r"""
    Lowest common ancestor of `i` and `j` for each edge :math:`(i, j)` of the leaf
    graph of the given tree.

    Complexity: :math:`\mathcal{O}(n\log(n)) + \mathcal{O}(m)` where :math:`n` is
    the number of nodes in `tree` and :math:`m` is the number of edges in
    :attr:`leaf_graph`.

    :param tree: input tree (Concept :class:`~higra.CptHierarchy`)
    :param leaf_graph: graph on the leaves of the input tree (deduced from
        :class:`~higra.CptHierarchy` on `tree`)
    :return: a 1d array
    """
    # Build the fast LCA structure once, then query it for every edge of the graph.
    return hg.make_lca_fast(tree).lca(leaf_graph)