Exemplo n.º 1
0
 def check_is_valid_linkage_various_size(self, nrow, ncol, valid):
     # Checks is_valid_linkage(Z) on the top-left (nrow, ncol) corner of a
     # fixed 2x5 matrix; `valid` is the verdict expected for that slice.
     template = np.asarray([[0, 1, 3.0, 2, 5], [3, 2, 4.0, 3, 3]],
                           dtype=np.double)
     Z = template[:nrow, :ncol]
     verdict = is_valid_linkage(Z)
     assert_(verdict == valid)
     if valid:
         return
     # A matrix judged invalid must also raise once throw=True is requested.
     assert_raises(ValueError, is_valid_linkage, Z, throw=True)
Exemplo n.º 2
0
 def test_is_valid_linkage_4_and_up(self):
     # Tests is_valid_linkage(Z) on linkage on observation sets between
     # sizes 4 and 15 (step size 3).
     for i in xrange(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         assert_(is_valid_linkage(Z) == True)
Exemplo n.º 3
0
 def test_is_valid_linkage_4_and_up_neg_counts(self):
     # Tests is_valid_linkage(Z) on linkage on observation sets between
     # sizes 4 and 15 (step size 3) with negative counts.
     for i in xrange(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         Z[i//2,3] = -2
         assert_(is_valid_linkage(Z) == False)
         assert_raises(ValueError, is_valid_linkage, Z, throw=True)
Exemplo n.º 4
0
 def test_is_valid_linkage_empty(self):
     # Tests is_valid_linkage(Z) with empty linkage.
     Z = np.zeros((0, 4), dtype=np.double)
     assert_(is_valid_linkage(Z) == False)
     assert_raises(ValueError, is_valid_linkage, Z, throw=True)
Exemplo n.º 5
0
 def test_is_valid_linkage_int_type(self):
     # Tests is_valid_linkage(Z) with integer type.
     Z = np.asarray([[0, 1, 3.0, 2],
                     [3, 2, 4.0, 3]], dtype=np.int)
     assert_(is_valid_linkage(Z) == False)
     assert_raises(TypeError, is_valid_linkage, Z, throw=True)
Exemplo n.º 6
0
def _to_dtw_tree(linkage, hierarchical_clustering_object, prototypes, prototyping_function='mean'):
    """
    Converts a hierarchical clustering linkage matrix `linkage` to hierarchy of `DTWClusterNode`s.
    This is a modification of `scipy.cluster.hierarchy.to_tree` function and the code is mostly taken from it.

    :param linkage: linkage matrix to convert to the DTW Tree
    :param hierarchical_clustering_object: hierarchical clustering object to work with; its ``data``
                                           attribute supplies the leaf labels (``data.items``) and the
                                           leaf prototypes (``data.ix[label]``)
    :param prototypes: container indexed by merged-node id holding precomputed prototypes; when truthy it
                       takes precedence over `prototyping_function`
    :param prototyping_function: "reduce" function for prototype calculation, or "mean" to simply use data mean.
                                 A callable is invoked as
                                 ``f(left_prototype_values, right_prototype_values, left_count, right_count)``
    :return: tuple ``(root, nodes)``: the node created for the last linkage row, and the list of all
             ``2 * n - 1`` nodes (the ``n`` leaves first, then merged nodes in linkage-row order)
    :raises ValueError: if `linkage` is not a valid linkage matrix, or if `prototypes` is falsy and
                        `prototyping_function` is neither callable nor ``'mean'``
    """

    # Validation
    linkage = np.asarray(linkage, order='c')
    hierarchy.is_valid_linkage(linkage, throw=True, name='Z')

    data = hierarchical_clustering_object.data
    labels = data.items
    values = data.ix

    # A linkage with n - 1 rows describes a tree over n original observations.
    n = linkage.shape[0] + 1

    # Create a list full of None's to store the node objects
    d = [None] * (n * 2 - 1)

    # Create the nodes corresponding to the n original objects.
    for i in range(n):
        index = labels[i]
        d[i] = DTWClusterNode(id=index, hierarchical_clustering_object=hierarchical_clustering_object,
                              prototype=values[index])

    nd = None

    for i in range(n - 1):
        fi = int(linkage[i, 0])
        fj = int(linkage[i, 1])

        # Internal sanity checks: a row may only reference nodes that already exist.
        assert(fi <= i + n)
        assert(fj <= i + n)

        node_id = i + n
        left = d[fi]
        right = d[fj]
        dist = linkage[i, 2]

        # Decide where the prototype of the merged node comes from.
        use_data_mean = False
        if prototypes:
            prototype = prototypes[node_id]
        elif callable(prototyping_function):
            prototype = prototyping_function(left.prototype.values, right.prototype.values,
                                             left.count, right.count)
        elif prototyping_function == 'mean':
            # The mean can only be computed once the node exists (it needs nd.data).
            prototype = None
            use_data_mean = True
        else:
            # Previously this fell through with a stale/None node and crashed on `nd.count`.
            raise ValueError("Unsupported prototyping_function {!r}: expected a callable or 'mean'"
                             .format(prototyping_function))

        nd = DTWClusterNode(id=node_id, hierarchical_clustering_object=hierarchical_clustering_object,
                            prototype=prototype,
                            left=left, right=right,
                            dist=dist)

        if use_data_mean:
            # A bit hacky, but does job. Doing this as to get to use nd.data
            nd._prototype = nd.data.mean()

        # The size recorded in the linkage must agree with the size of the node just built.
        assert(linkage[i, 3] == nd.count)
        d[n + i] = nd

    return nd, d
Exemplo n.º 7
0
 def test_is_valid_linkage_int_type(self):
     # Tests is_valid_linkage(Z) with integer type.
     Z = np.asarray([[0, 1, 3.0, 2],
                     [3, 2, 4.0, 3]], dtype=np.int)
     assert_(is_valid_linkage(Z) == False)
     assert_raises(TypeError, is_valid_linkage, Z, throw=True)
Exemplo n.º 8
0
 def test_is_valid_linkage_empty(self):
     # Tests is_valid_linkage(Z) with empty linkage.
     Z = np.zeros((0, 4), dtype=np.double)
     assert_(is_valid_linkage(Z) == False)
     assert_raises(ValueError, is_valid_linkage, Z, throw=True)
def assign_domain_cluster_to_compartments(coordinates,
                                          domain_starts,
                                          compartment_dict,
                                          domain_linkage=None,
                                          linkage_method='complete',
                                          distance_metric='median',
                                          normalization=None,
                                          min_cluster_size_ratio=0.1,
                                          min_cluster_dist_ratio=0.08,
                                          assign_method='binary',
                                          return_boundary=True,
                                          verbose=True):
    """Function to assign domain clusters to given compartments in compartment_dict
    Idea: 1. find normalized overlap ratio between domain_cluster and reference_compartment, 
          2. assign bestmatch for each cluster
    ------------------------------------------------------------------------------------------
    Inputs:
        coordinates: distance map or zxy coordinates for a chromosome, np.ndarray (or like)
            Note: a square 2d input is treated as a distance map, an Nx3 input as coordinates
        domain_starts: indices of domain start regions in this chromosome, np.ndarray(1d)
        compartment_dict: dictionary for compartment annotation, dict
            Note: this compartment_dict has to be exclusive (no region id in two compartments)
        domain_linkage: linkage matrix generated from scipy.cluster.hierarchy.linkage, np.ndarray
            (linkage result, default:None, generate from scratch)
        linkage_method: method for linkage if domain_linkage is not given, str (default: 'complete')
        distance_metric: metric for domain distance calculation, str (default: 'median')
        normalization: normalization matrix passed to domain_pdists when the linkage is
            generated from scratch (default: None)
        min_cluster_size_ratio: minimal size of cluster ratio to chromosome size, float (default: 0.1)
            Note: currently not used by this function
        min_cluster_dist_ratio: minimal distance of cluster ratio to number of domains, float (default: 0.08)
        assign_method: method for assigning compartments, str {'binary'|'continuous'}
        return_boundary: whether also return the cluster boundaries, bool (default: True)
        verbose: whether say something!, bool (default: True)
    Output:
        _assigned_dict: compartment label -> per-region assignment vector (length = number of
            regions; 0/1 for 'binary', normalized overlap for 'continuous'), dict
        _cluster_bds: (only if return_boundary) sorted unique region indices at which any
            assignment vector increases, np.ndarray
        If the linkage cannot be built, None (or (None, None) with return_boundary) is returned.
    Raises:
        ValueError: for malformed coordinates, domain_starts, compartment_dict,
            domain_linkage or assign_method
    """
    ## check inputs
    # coordinate
    coordinates = np.array(coordinates)
    if verbose:
        print(f"-- assign domain-clusters to compartments with", end=' ')
    if len(np.shape(coordinates)) != 2:
        raise ValueError(
            f"Wrong input shape for coordinates, should be 2d but {len(np.shape(coordinates))} is given"
        )
    elif np.shape(coordinates)[0] == np.shape(coordinates)[1]:
        if verbose:
            print(f"distance map")
        _mat = coordinates
    elif np.shape(coordinates)[1] == 3:
        if verbose:
            print(f"3d coordinates")
        _mat = squareform(pdist(coordinates))
    else:
        raise ValueError(
            f"Input coordinates should be distance-matrix or 3d-coordinates!")
    _num_regions = _mat.shape[0]
    # domain_starts
    # builtin int: the np.int alias was removed in NumPy 1.24
    domain_starts = np.array(domain_starts, dtype=int)
    for _s in domain_starts:
        # NOTE(review): a start equal to the number of regions is accepted here
        # (it yields an empty domain) -- confirm whether `>=` was intended.
        if _s < 0 or _s > _num_regions:
            raise ValueError(
                f"Wrong input domain_starts: {_s}, should be index of coordinates"
            )
    # each domain ends where the next one starts; the last one ends at the chromosome end
    domain_ends = np.zeros(np.shape(domain_starts), dtype=int)
    domain_ends[:-1] = domain_starts[1:]
    domain_ends[-1] = _num_regions
    # compartment_dict
    _ref_inds = []
    for _v in compartment_dict.values():
        _ref_inds += list(_v)
    _uids, _ucounts = np.unique(_ref_inds, return_counts=True)
    if (_ucounts > 1).any():
        raise ValueError(
            f"There are non unique ids used in reference:{compartment_dict}")
    elif (_uids > _num_regions).any():
        raise ValueError(
            f"Wrong ind given in compartment_dict:{compartment_dict}, should be index of coordinates"
        )
    # domain_linkage
    if domain_linkage is not None and not is_valid_linkage(domain_linkage):
        raise ValueError(
            f"domain_liknage should be a linkage type array from scipy.cluster.hierarchy.linkage"
        )
    elif domain_linkage is None:
        _dom_pdists = domain_tools.distance.domain_pdists(
            coordinates,
            domain_starts,
            metric=distance_metric,
            normalization_mat=normalization)
        _cov_mat = np.corrcoef(squareform(_dom_pdists))
        try:
            domain_linkage = linkage(_cov_mat, method=linkage_method)
        except ValueError:
            print(f"failed to build linkage, exit.")
            if return_boundary:
                return None, None
            else:
                return None
    # assign_method
    _allowed_assign_method = ['binary', 'continuous']
    assign_method = str(assign_method).lower()
    if assign_method not in _allowed_assign_method:
        raise ValueError(
            f"Wrong input assign_method:{assign_method}, should be within {_allowed_assign_method}"
        )
    ## 1. acquire exclusive clusters
    # get all subnodes
    _rootnode, _nodelist = to_tree(domain_linkage, rd=True)
    # get selection threshold
    _dist_th = len(domain_starts) * min_cluster_dist_ratio
    if verbose:
        print(f"--- threshold for cluster distance={_dist_th}")
    # init kept clusters, and the set of leaf (domain) ids they already cover
    _kept_clusters = []
    _kept_leafs = set()
    for _node in _nodelist:
        # only merges above the distance threshold split into separate clusters
        if _node.is_leaf() or not (_node.dist > _dist_th):
            continue
        for _child in (_node.left, _node.right):
            _child_leafs = _child.pre_order(lambda x: x.id)
            # keep a child only if none of its leaves belongs to an already kept cluster
            if _kept_leafs.isdisjoint(_child_leafs):
                _kept_clusters.append(_child)
                _kept_leafs.update(_child_leafs)
    # convert domain ID to region_id
    _reg_id_list = []
    for _n in _kept_clusters:
        _dom_ids = np.array(_n.pre_order(lambda x: x.id), dtype=int)
        _reg_ids = [
            np.arange(domain_starts[_d], domain_ends[_d]).astype(int)
            for _d in _dom_ids
        ]
        _reg_id_list.append(np.concatenate(_reg_ids))

    ## 2. with selected clusters, calculate its overlap with compartments
    # init
    _keys = list(compartment_dict.keys())
    _decision_dict = {_k: np.zeros(len(_reg_id_list)) for _k in _keys}
    for _ckey, _cinds in compartment_dict.items():
        for _j, _rids in enumerate(_reg_id_list):
            # an empty cluster or compartment has no overlap; leave the score at 0
            # instead of dividing by zero
            if len(_rids) == 0 or len(_cinds) == 0:
                continue
            _decision_dict[_ckey][_j] = len(np.intersect1d(
                _rids, _cinds)) / len(_rids) / len(_cinds)
    if verbose:
        print("--- decision_dict:", _decision_dict)

    ## summarize to a dict
    _assigned_dict = {_k: np.zeros(_num_regions) for _k in _keys}
    if assign_method == 'binary':
        # each cluster goes entirely to the compartment with the largest overlap score
        for _j, _rids in enumerate(_reg_id_list):
            _match_ind = np.argmax([_decision_dict[_k][_j] for _k in _keys])
            _assigned_dict[_keys[_match_ind]][_rids] = 1
    elif assign_method == 'continuous':
        _norm_mat = np.stack([_decision_dict[_k] for _k in _keys])
        # clusters overlapping no compartment give 0/0; the resulting NaNs are
        # zeroed right below, so the warning is silenced on purpose
        with np.errstate(divide='ignore', invalid='ignore'):
            _norm_mat = _norm_mat / np.sum(_norm_mat, 0)
        _norm_mat[np.isnan(_norm_mat)] = 0
        for _j, _rids in enumerate(_reg_id_list):
            for _i, _k in enumerate(_keys):
                _assigned_dict[_k][_rids] = _norm_mat[_i, _j]
    # return
    if return_boundary:
        # calculate compartment boundaries: positions where an assignment value increases
        _bds_list = [
            np.where((_v[1:] - _v[:-1]) > 0)[0] + 1
            for _v in _assigned_dict.values()
        ]
        _cluster_bds = np.unique(np.concatenate(_bds_list))
        return _assigned_dict, _cluster_bds
    else:
        return _assigned_dict
Exemplo n.º 10
0
def fcluster_combine_leaves(Z,
                            t,
                            criterion="distance",
                            depth=2,
                            R=None,
                            monocrit=None):
    """Form flat clusters from a linkage, then merge away single-leaf clusters.

    AKA no leaf left behind. Flat clusters are first formed with
    ``hierarchy.fcluster``. Every flat cluster whose leader is an original
    observation (a "leaf leader") is then merged, moving up the tree, either
    with another leaf leader it is directly linked to, or with all flat
    clusters found below the node it is linked to.

    Args:
        Z: linkage matrix; validated with ``hierarchy.is_valid_linkage``.
        t, criterion, depth, R, monocrit: forwarded unchanged to
            ``hierarchy.fcluster``.

    Returns:
        The flat-cluster assignment vector. When the initial clustering has no
        leaf leaders, the ``fcluster`` result is returned as is; otherwise the
        merged labels are renumbered consecutively starting from 0.

    Raises:
        ValueError: if ``Z`` is not a valid linkage, if a leaf node is reached
            while searching for downstream leaders, or if some leaf leaders
            could not be merged.
    """
    # check if Z is a valid linkage matrix
    hierarchy.is_valid_linkage(Z, throw=True)

    N = Z.shape[0] + 1

    # alternative: iteratively increase t, check for remaining leaves

    # move up the tree, merging leaf clusters until all leaves are merged into clusters
    T = hierarchy.fcluster(Z,
                           t,
                           criterion=criterion,
                           depth=depth,
                           R=R,
                           monocrit=monocrit)
    L, M = hierarchy.leaders(Z, T)
    # node ids below N are original observations, i.e. leaves
    leaf_leaders = list(L[L < N])

    # no leaf clusters
    if not leaf_leaders:
        return T

    max_cluster = T.max()

    # ClusterNode list for Z; it only depends on Z, so it is built once, on first use
    tree_nodes = None

    # links (rows of Z) that involve at least one of the initial leaf leaders
    touches_leaf_leader = np.logical_or(np.isin(Z[:, 0], leaf_leaders),
                                        np.isin(Z[:, 1], leaf_leaders))
    candidate_links = Z[touches_leaf_leader, :2].astype("i")

    # iterate through all links
    for iteration, link in enumerate(candidate_links):

        if iteration % 10 == 0:
            print(
                f"After {iteration} iterations, {len(leaf_leaders)} leaf leaders left with {len(np.unique(T))} total clusters"
            )

        is_leaf_leader = [member in leaf_leaders for member in link]

        # find linkages if link is between two leaf_leaders
        if all(is_leaf_leader):
            # make new cluster of leaf leaders
            max_cluster += 1
            T[link] = max_cluster

            # remove from list of leaf_leaders
            for member in link:
                leaf_leaders.remove(member)

        # find linkages of leaf leaders with any non-leaf node
        elif any(is_leaf_leader):

            # which one is the leaf leader?
            if is_leaf_leader[0]:
                leaf, node = link[0], link[1]
            else:
                leaf, node = link[1], link[0]

            # other node is a leader
            if node in L:
                downstream_leaders = [node]

            # node is not a leader, have to traverse down the tree until leaders are found
            else:
                # get hierarchy.ClusterNode representation of the node
                if tree_nodes is None:
                    tree_nodes = hierarchy.to_tree(Z, rd=True)[1]
                subtree = tree_nodes[node]

                # breadth-first traversal: stop descending at each leader found
                downstream_leaders = []
                nodes_to_check = [subtree.left, subtree.right]

                while nodes_to_check:
                    current = nodes_to_check.pop(0)
                    if current.left is None and current.right is None:
                        raise ValueError(
                            "While traversing the tree, a leaf node was reached"
                            f", node {current.id}. In theory this should not occur."
                        )
                    if current.id in L:
                        downstream_leaders.append(current.id)
                    else:
                        nodes_to_check.extend([current.left, current.right])

            # update T: the downstream clusters and the leaf become one new cluster
            max_cluster += 1
            merge_clusters = M[np.isin(L, downstream_leaders)]
            T[np.isin(T, merge_clusters)] = max_cluster
            T[leaf] = max_cluster

            # remove from leaf_leaders
            leaf_leaders.remove(leaf)

        else:
            continue

        # update L,M so later iterations see the merged clustering
        L, M = hierarchy.leaders(Z, T)

        if not leaf_leaders:
            break

    # re-derive the remaining leaf leaders from the final clustering
    leaf_leaders = list(L[L < N])

    if leaf_leaders:
        raise ValueError(f"Failed to merge leaf leaders {leaf_leaders}")

    # no leaf clusters
    print(
        f"All leaf leaders combined, resulting in {len(np.unique(T))} total clusters"
    )

    # relabel: consecutive ids starting at 0, ordered by the old label values
    unique, inverse = np.unique(T, return_inverse=True)

    return np.arange(0, unique.shape[0])[inverse]