Exemplo n.º 1
0
def get_nonrep_matrix(tids, rep_ids, dist):
    """
    Get a distance matrix for non-representative species using
    a distance matrix for representative species

    *tids* and *rep_ids* are paired element-wise: each taxon in *tids* is
    labelled onto the matrix entry of its representative. When several
    taxa share one representative, the extra positions are handed to
    ``duplicate_dmat_samples`` (presumably to replicate that
    representative's row/column -- confirm against its definition).

    Args:
        tids (array-like)       : taxon IDs for taxa to orient matrix to
        rep_ids (array-like)    : taxon IDs for the representative taxa in *tids*
        dist (DistanceMatrix)   : the distance matrix to orient

    Returns:
        DistanceMatrix: a matrix whose IDs are taken from *tids*, filtered
        by *tids*
    """
    uniq, counts = np.unique(rep_ids, return_counts=True)
    rep_dist = dist.filter(uniq).data

    # Position (within uniq) of each representative, repeated once for
    # every taxon beyond the first that maps to it
    dupes = np.repeat(np.arange(len(uniq)), counts - 1)

    # Group the taxa by the representative they map to
    rep_map = {}
    for rep, tid in zip(rep_ids, tids):
        rep_map.setdefault(rep, []).append(tid)

    # One slot per unique representative, followed by the duplicated slots;
    # each slot consumes one of the taxa grouped under that representative
    rep_order = np.concatenate([np.arange(rep_dist.shape[0]), dupes])
    new_tids = [rep_map[uniq[i]].pop() for i in rep_order]

    dupe_dist = duplicate_dmat_samples(rep_dist, dupes)
    return ssd.DistanceMatrix(dupe_dist, ids=new_tids).filter(tids)
Exemplo n.º 2
0
def get_toy_dmat():
    """
    Build a small three-sample skbio DistanceMatrix for use in tests

    Returns:
        DistanceMatrix: 3x3 matrix with IDs 'a', 'b' and 'c'
    """
    labels = ['a', 'b', 'c']
    values = np.array([
        [0, 1, 2],
        [1, 0, 3],
        [2, 3, 0],
    ])
    return ssd.DistanceMatrix(values, ids=labels)
Exemplo n.º 3
0
def update_max_gnid_distances(msd, dm, tree, gnids):
    """Update pairs containing elements of gnids in matrix of maximum distances.

    Args:
        msd: matrix of maximum distances keyed by GNID; its ``data``,
            ``ids`` and ``index`` members are used.
        dm: distance matrix indexable by a pair of tip names.
        tree: tree whose tips supply the names considered; the GNID is the
            second ':'-separated field of each tip name.
        gnids: container of GNIDs whose pairs are updated.

    Returns:
        A new ``distance.DistanceMatrix`` carrying the IDs of *msd* ordered
        by ``msd.index``.
    """
    # Work on the negated values: assuming distances are non-negative, any
    # tip distance then wins the max() for a visited pair, while pairs that
    # are never visited get their original magnitude back from abs() below
    work = -msd.data

    names = [tip.name for tip in tree.tips()]
    affected = [name for name in names if name.split(':')[1] in gnids]
    position = {name: msd.index(name.split(':')[1]) for name in names}

    for first, second in product(names, affected):
        row, col = position[first], position[second]
        best = max(work[row, col], dm[first, second])
        # Keep the matrix symmetric
        work[row, col] = best
        work[col, row] = best

    np.fill_diagonal(work, 0)  # Set diagonal to 0
    restored = np.abs(work)
    ordered_ids = sorted(msd.ids, key=msd.index)  # Ensure sort
    return distance.DistanceMatrix(restored, ids=ordered_ids)
Exemplo n.º 4
0
def get_max_gnid_distances(dm):
    """Return matrix of maximum distances between GNIDs.

    Args:
        dm: distance matrix whose IDs are tip names carrying the GNID as
            their second ':'-separated field.

    Returns:
        A ``distance.DistanceMatrix`` with one ID per GNID, in order of
        first appearance in ``dm.ids``, holding the largest distance found
        between any pair of tips from the two GNIDs; the diagonal is 0.
    """
    gnid_pos = {}
    tip_pos = {}
    for name in dm.ids:
        gnid = name.split(':')[1]
        # A GNID seen for the first time takes the next free row/column
        tip_pos[name] = gnid_pos.setdefault(gnid, len(gnid_pos))

    size = len(gnid_pos)
    maxima = np.zeros((size, size))
    for name1, name2 in product(dm.ids, repeat=2):
        cell = (tip_pos[name1], tip_pos[name2])
        maxima[cell] = max(maxima[cell], dm[name1, name2])

    np.fill_diagonal(maxima, 0)  # Set diagonal to 0
    ordered_gnids = sorted(gnid_pos, key=gnid_pos.get)  # Ensure sort
    return distance.DistanceMatrix(maxima, ids=ordered_gnids)
Exemplo n.º 5
0
def reduce(OGid, OG):
    """Return representative PPIDs for all GNIDs associated with tips in node.

    Builds a pairwise k-tuple distance matrix over the sequences of *OG*,
    makes a neighbor-joining tree from it, then repeatedly prunes and splits
    the tree until it has no more tips than there are distinct GNIDs in *OG*.

    Reads the module-level ``ppid2seq``, ``ppid2meta`` and ``table``.

    Args:
        OGid: identifier of the group; returned unchanged.
        OG: collection of PPIDs (iterated more than once); each must be a
            key of ``ppid2seq`` and ``ppid2meta``.

    Returns:
        tuple: ``(OGid, rOG)`` where ``rOG`` holds, for every tip left in
        the tree, its name split on ':'.
    """
    # Extract sequences
    seqs = []
    ids = []
    for ppid in OG:
        # Translate once per sequence instead of once per pair below
        seqs.append(ppid2seq[ppid].translate(table))
        gnid, spid, _ = ppid2meta[ppid]
        # Wrap in quotes to ensure correct parsing
        ids.append(f"'{spid}:{gnid}:{ppid}'")

    # Make distance matrix
    k, p = 4, 2  # Tuple size and power
    dm0 = np.zeros((len(seqs), len(seqs)))
    for (i, seq1), (j, seq2) in combinations(enumerate(seqs), 2):
        # Calculate distance and store in both halves of the matrix
        d = get_ktuple_distance(seq1, seq2, k, p)
        dm0[i, j] = d
        dm0[j, i] = d
    dm0 = distance.DistanceMatrix(dm0, ids=ids)

    # Make tree
    tree = skbio.tree.nj(dm0)
    # presumably (re)computes the tip_names set read from nodes below --
    # confirm against update_tip_names
    update_tip_names(tree)
    dm = tree.tip_tip_distances()  # Use tree distances for pruning
    msd = get_max_gnid_distances(dm)

    # Prune tree
    gnids = {ppid2meta[ppid][0] for ppid in OG}
    while len(tree.tip_names) > len(gnids):
        # Remove non-minimal tips in single-species clades
        for node in tree.postorder():
            if node.is_tip():
                node.min_tips = {(node.name, node.length)}
                continue

            child_min_tips = [
                min_tip
                for child in node.children
                for min_tip in child.min_tips
            ]
            if len({name.split(':')[1] for name, _ in child_min_tips}) == 1:
                # Every candidate below this node has the same GNID: keep
                # only the one with the smallest accumulated length
                # NOTE(review): assumes node.length is numeric here; a node
                # without a branch length reaching this branch would fail
                # on the addition -- confirm
                name, length = min(child_min_tips, key=lambda x: x[1])
                node.min_tips = {(name, length + node.length)}
            else:
                node.min_tips = set(child_min_tips)

        min_names = {tip_name for tip_name, _ in tree.min_tips}
        if min_names < tree.tip_names:
            # GNIDs of the tips about to be dropped
            tip_gnids = {
                tip_name.split(':')[1]
                for tip_name in (tree.tip_names - min_names)
            }
            tree = tree.shear(min_names)
            update_tip_names(tree)
            msd = update_max_gnid_distances(msd, dm, tree, tip_gnids)

        # Split tree
        trees = []
        for node in tree.traverse(include_self=False):
            inside_names = node.tip_names
            outside_names = tree.tip_names - inside_names
            tip_gnids1 = {
                tip_name.split(':')[1] for tip_name in inside_names
            }
            tip_gnids2 = {
                tip_name.split(':')[1] for tip_name in outside_names
            }

            # A side is a candidate only if it still covers every GNID
            if tip_gnids1 == gnids:
                tree1 = tree.shear(inside_names)
                msd1 = update_max_gnid_distances(msd, dm, tree1, tip_gnids2)
                trees.append((tree1, msd1))
            if tip_gnids2 == gnids:
                tree2 = tree.shear(outside_names)
                msd2 = update_max_gnid_distances(msd, dm, tree2, tip_gnids1)
                trees.append((tree2, msd2))
        if trees:
            # Keep the candidate whose maximum-distance matrix sums lowest
            tree, msd = min(trees, key=lambda x: x[1].data.sum())
            update_tip_names(tree)

    # Extract sequences from tree
    rOG = [tip_name.split(':') for tip_name in tree.tip_names]
    return OGid, rOG