示例#1
0
 def test_from_iterable_validate_equal_valid_data(self):
     """Valid input gives equal matrices with and without validation."""
     def abs_diff(a, b):
         return abs(b - a)

     with_validation = DistanceMatrix.from_iterable(
         (x for x in range(4)), abs_diff, validate=True)
     without_validation = DistanceMatrix.from_iterable(
         (x for x in range(4)), abs_diff, validate=False)
     self.assertEqual(with_validation, without_validation)
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """Progressively align sequences, then build a UPGMA tree from the result.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Aligner used for the pairwise alignments, for example
        skbio.alignment.global_pairwise_align_nucleotide. Must accept
        skbio.Sequence objects or skbio.TabularMSA objects as input.
    metric : function, optional
        Returns a single distance value for a pair of skbio.Sequence
        objects. Used for the guide tree (when none is supplied) and for
        the tree built from the finished alignment.
    guide_tree : skbio.TreeNode, optional
        Tree that guides the alignment process. Built from ``sequences`` by
        average linkage when omitted.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Draw a dendrogram of the output tree before returning.

    Returns
    -------
    skbio.alignment
    skbio.TreeNode

    """
    def _cluster(records):
        # Distance matrix keyed by 'id' plus its average-linkage clustering.
        dm = DistanceMatrix.from_iterable(records, metric=metric, key='id')
        return dm, average(dm.condensed_form())

    if guide_tree is None:
        seed_dm, seed_lm = _cluster(sequences)
        guide_tree = TreeNode.from_linkage_matrix(seed_lm, seed_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    aligned_dm, aligned_lm = _cluster(msa)
    msa_tree = TreeNode.from_linkage_matrix(aligned_lm, aligned_dm.ids)
    if display_tree:
        print("\nOutput tree:")
        dendrogram(aligned_lm, labels=aligned_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black', leaf_font_size=24)
    return msa, msa_tree
示例#3
0
def aln_distmat(alignment, reps=3):
    """Compute pairwise Hamming distances from a genome MSA.

    Parameters
    ----------
    alignment
        Source handed to ``TabularMSA.read`` with a ``DNA`` constructor.
    reps : int, optional
        Not used by this function.

    Returns
    -------
    DistanceMatrix
        Distances between the rows of the alignment, labelled with the
        alignment index after it is reassigned with ``minter="id"``.
    """
    msa = TabularMSA.read(alignment, constructor=DNA)
    msa.reassign_index(minter="id")
    rows = [record.values for record in msa]
    return DistanceMatrix.from_iterable(rows, metric=hamming, keys=msa.index)
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """Build a UPGMA guide tree by applying ``metric`` to ``sequences``.

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Draw a dendrogram of the tree before returning.

    Returns
    -------
    The tree produced by ``to_tree`` from the average-linkage matrix.
    NOTE(review): previously documented as skbio.TreeNode -- confirm what
    ``to_tree`` returns in this module.

    """
    dm = DistanceMatrix.from_iterable(sequences, metric=metric, key='id')
    linkage_matrix = average(dm.condensed_form())
    tree = to_tree(linkage_matrix)
    if display_tree:
        dendrogram(linkage_matrix, labels=dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return tree
示例#5
0
    def test_from_iterable_no_key(self):
        """Omitting key/keys matches a matrix built without explicit ids."""
        values = (x for x in range(4))
        expected = DistanceMatrix([[0, 1, 2, 3], [1, 0, 1, 2],
                                   [2, 1, 0, 1], [3, 2, 1, 0]])

        observed = DistanceMatrix.from_iterable(
            values, lambda a, b: abs(b - a))

        self.assertEqual(observed, expected)
示例#6
0
 def test_from_iterable_validate_false_non_symmetric(self):
     """validate=False with a signed metric equals the symmetric matrix."""
     expected = DistanceMatrix([[0, 1, 2, 3], [1, 0, 1, 2],
                                [2, 1, 0, 1], [3, 2, 1, 0]])
     values = (x for x in range(4))
     observed = DistanceMatrix.from_iterable(
         values, lambda a, b: a - b, validate=False)
     self.assertEqual(observed, expected)
示例#7
0
    def test_from_iterable_no_key(self):
        """from_iterable without key/keys equals the id-less expectation."""
        def abs_diff(a, b):
            return abs(b - a)

        source = (x for x in range(4))
        rows = [[0, 1, 2, 3],
                [1, 0, 1, 2],
                [2, 1, 0, 1],
                [3, 2, 1, 0]]
        want = DistanceMatrix(rows)
        got = DistanceMatrix.from_iterable(source, abs_diff)
        self.assertEqual(got, want)
示例#8
0
    def test_from_iterable_with_keys(self):
        """Ids supplied through an iterator of keys label the matrix."""
        values = (x for x in range(4))
        labels = ['0', '1', '4', '9']

        expected = DistanceMatrix(
            [[0, 1, 2, 3], [1, 0, 1, 2], [2, 1, 0, 1], [3, 2, 1, 0]],
            ['0', '1', '4', '9'])
        observed = DistanceMatrix.from_iterable(
            values, lambda a, b: abs(b - a), keys=iter(labels))

        self.assertEqual(observed, expected)
示例#9
0
    def test_from_iterable_with_keys(self):
        """The `keys` iterator provides the ids of the resulting matrix."""
        def abs_diff(a, b):
            return abs(b - a)

        source = (x for x in range(4))
        rows = [[0, 1, 2, 3],
                [1, 0, 1, 2],
                [2, 1, 0, 1],
                [3, 2, 1, 0]]
        want = DistanceMatrix(rows, ['0', '1', '4', '9'])
        got = DistanceMatrix.from_iterable(
            source, abs_diff, keys=iter(['0', '1', '4', '9']))
        self.assertEqual(got, want)
示例#10
0
    def setUp(self):
        """Build a seeded Ward tree over 10 random values for the tests.

        Every node gets a branch length of 1 and each internal node is named
        ``y<i>``, where ``i`` is its position in the postorder traversal.
        """
        np.random.seed(0)
        x = np.random.rand(10)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # ``np.str`` was only an alias of the builtin ``str`` and no longer
        # exists in NumPy >= 1.24; the builtin gives the same dtype.
        ids = np.arange(len(x)).astype(str)
        self.tree = TreeNode.from_linkage_matrix(lm, ids)

        # initialize tree with branch length and named internal nodes
        for i, n in enumerate(self.tree.postorder(include_self=True)):
            n.length = 1
            if not n.is_tip():
                n.name = "y%d" % i
示例#11
0
    def setUp(self):
        """Create the shared fixture: a seeded Ward tree with 10 tips.

        All nodes receive unit branch length; internal nodes are named
        ``y<i>`` after their postorder index.
        """
        np.random.seed(0)
        x = np.random.rand(10)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # Use the builtin ``str``: the ``np.str`` alias was removed from
        # NumPy (>= 1.24) and raised AttributeError here.
        ids = np.arange(len(x)).astype(str)
        self.tree = TreeNode.from_linkage_matrix(lm, ids)

        # initialize tree with branch length and named internal nodes
        for i, n in enumerate(self.tree.postorder(include_self=True)):
            n.length = 1
            if not n.is_tip():
                n.name = "y%d" % i
示例#12
0
def gradient_linkage(X, y, method='average'):
    r"""
    Principal Balance Analysis using Hierarchical Clustering
    on known gradient.

    The hierarchy is built based on the values of the samples
    located along a gradient.  Given a feature :math:`x`, the mean gradient
    values that :math:`x` was observed in is calculated by

    .. math::
        f(g , x) =
         \sum\limits_{i=1}^N g_i \frac{x_i}{\sum\limits_{j=1}^N x_j}

    Where :math:`N` is the number of samples, :math:`x_i` is the proportion of
    feature :math:`x` in sample :math:`i`, :math:`g_i` is the gradient value
    at sample `i`.

    The distance between two features :math:`x` and :math:`y` can be defined as

    .. math::
        d(x, y) = (f(g, x) - f(g, y))^2

    If :math:`d(x, y)` is very small, then :math:`x` and :math:`y`
    are expected to live in very similar positions across the gradient.
    A hierarchical clustering is  then performed using :math:`d(x, y)` as
    the distance metric.

    Parameters
    ----------
    X : pd.DataFrame
        Contingency table where the samples are rows and the features
        are columns.
    y : pd.Series
        Continuous vector of gradient values; it is aligned with `X`
        through ``match`` before use.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree generated from principal balance analysis.

    See Also
    --------
    mean_niche_estimator
    """
    # The docstring must be raw: in a normal string ``\f`` (from ``\frac``)
    # is a form feed and ``\s`` / ``\l`` are invalid escape sequences.
    _X, _y = match(X, y)
    mean_X = mean_niche_estimator(_X, gradient=_y)
    dm = DistanceMatrix.from_iterable(mean_X, euclidean)
    lm = linkage(dm.condensed_form(), method)
    # Tips are labelled with the columns of the original (unmatched) table.
    return TreeNode.from_linkage_matrix(lm, X.columns)
示例#13
0
    def setUp(self):
        """Create a seeded 5x5 random table and a SquareDendrogram fixture.

        The tree comes from Ward clustering of 5 random values. Internal
        nodes of the source tree are named ``y<i>`` by postorder index and
        every visited node gets a random length in [0, 3).
        """
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)))
        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # Builtin ``str`` replaces the removed ``np.str`` alias
        # (AttributeError on NumPy >= 1.24).
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        self.tree = SquareDendrogram.from_tree(t)

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3
示例#14
0
def main():
    """Read FASTA records and write their pairwise distance matrix to stdout.

    Input lines come from ``fileinput.input()``. A line starting with '>'
    opens a new record whose id is the rest of that line; any other line is
    appended to the sequence of the most recent record.
    """
    labels = []
    records = []
    for raw in fileinput.input():
        text = raw.rstrip('\r\n')
        if not text.startswith('>'):
            # Continuation of the current record's sequence.
            records[-1] += text
            continue
        labels.append(text[1:])
        records.append('')
    matrix = DistanceMatrix.from_iterable(
        records, hamming_no_gap, keys=labels, validate=False)
    matrix.write(sys.stdout)
示例#15
0
    def setUp(self):
        """Prepare a seeded random table and a SquareDendrogram over 5 tips.

        Internal nodes of the clustered tree are named ``y<i>`` (postorder
        index) and each visited node is given a random length in [0, 3).
        """
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)))
        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # ``np.str`` is gone from NumPy >= 1.24; it was an alias of ``str``.
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        self.tree = SquareDendrogram.from_tree(t)

        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3
示例#16
0
def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    A hierarchy over the features is derived from the vector `r` of rank
    values. The distance between two features :math:`x` and :math:`y` can
    be defined as

    .. math::
       d(x, y) = (r(x) - r(y))^2

    Where :math:`r(x)` is the rank of the features.  Hierarchical clustering is
    then performed using :math:`d(x, y)` as the distance metric.
    NOTE(review): the code obtains pairwise distances by calling
    ``euclidean`` on the rank values -- confirm it matches the formula above.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector of feature ranks; its index labels the tips.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    distances = DistanceMatrix.from_iterable(r, euclidean)
    linkage_matrix = linkage(distances.condensed_form(), method)
    tree = TreeNode.from_linkage_matrix(linkage_matrix, r.index)
    return rename_internal_nodes(tree)
示例#17
0
    def test_cache_ntips(self):
        """_cache_ntips stores the tip count of every node as `leafcount`."""
        dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                          lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # Builtin ``str`` instead of the ``np.str`` alias, which no longer
        # exists in NumPy >= 1.24.
        ids = np.arange(4).astype(str)
        t = mock.from_linkage_matrix(lm, ids)

        t._cache_ntips()

        self.assertEqual(t.leafcount, 4)
        self.assertEqual(t.children[0].leafcount, 2)
        self.assertEqual(t.children[1].leafcount, 2)
        self.assertEqual(t.children[0].children[0].leafcount, 1)
        self.assertEqual(t.children[0].children[1].leafcount, 1)
        self.assertEqual(t.children[1].children[0].leafcount, 1)
        self.assertEqual(t.children[1].children[1].leafcount, 1)
示例#18
0
    def test_cache_ntips(self):
        """_cache_ntips records the number of tips below each node."""
        dm = DistanceMatrix.from_iterable([0, 1, 2, 3],
                                          lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # ``np.str`` was removed from NumPy (>= 1.24); ``str`` is equivalent.
        ids = np.arange(4).astype(str)
        t = mock.from_linkage_matrix(lm, ids)

        t._cache_ntips()

        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is the supported spelling.
        self.assertEqual(t.leafcount, 4)
        self.assertEqual(t.children[0].leafcount, 2)
        self.assertEqual(t.children[1].leafcount, 2)
        self.assertEqual(t.children[0].children[0].leafcount, 1)
        self.assertEqual(t.children[0].children[1].leafcount, 1)
        self.assertEqual(t.children[1].children[0].leafcount, 1)
        self.assertEqual(t.children[1].children[1].leafcount, 1)
示例#19
0
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """Progressively align sequences and build a UPGMA tree of the alignment.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Aligner used for the pairwise alignments, for example
        skbio.alignment.global_pairwise_align_nucleotide. Must accept
        skbio.Sequence objects or skbio.TabularMSA objects as input.
    metric : function, optional
        Returns a single distance value for a pair of skbio.Sequence
        objects. Used here to compute the distances between the aligned
        sequences from which the output tree is built.
    guide_tree : skbio.TreeNode, optional
        Tree that guides the alignment process; forwarded unchanged to
        ``progressive_msa``.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Draw a dendrogram of the output tree before returning.

    Returns
    -------
    skbio.alignment
    skbio.TreeNode
    """
    msa = progressive_msa(sequences, pairwise_aligner=pairwise_aligner,
                          guide_tree=guide_tree)
    if display_aln:
        print(msa)

    hierarchy = sp.cluster.hierarchy
    distances = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    linkage_matrix = hierarchy.average(distances.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(linkage_matrix, distances.ids)

    if display_tree:
        print("\nOutput tree:")
        hierarchy.dendrogram(linkage_matrix, labels=distances.ids,
                             orientation='right',
                             link_color_func=lambda x: 'black')
    return msa, msa_tree
示例#20
0
def geodesic_distance(metadata: qiime2.Metadata,
                      latitude: str = 'Latitude',
                      longitude: str = 'Longitude',
                      missing_data: str = 'error') -> DistanceMatrix:
    """Compute pairwise geodesic distances, in meters, between samples.

    The ``latitude`` and ``longitude`` columns are loaded and validated via
    ``_load_and_validate`` (which also receives ``missing_data``). One
    ``Point`` is built per row and the result is labelled with the index of
    the loaded sample metadata.
    """
    sample_md = _load_and_validate(
        metadata, [latitude, longitude], ['latitude', 'longitude'],
        missing_data=missing_data)

    # One (latitude, longitude) point per sample row.
    coordinates = [
        Point(pair)
        for pair in zip(sample_md[latitude], sample_md[longitude])
    ]

    return DistanceMatrix.from_iterable(
        coordinates,
        metric=lambda a, b: distance.geodesic(a, b).meters,
        keys=sample_md.index)
示例#21
0
    def test_from_iterable_skbio_hamming_metric_with_metadata(self):
        """Hamming distances are computed for sequences carrying metadata."""
        # test for #1254
        labels = ['a', 'b', 'c', 'd']
        records = [
            Sequence('ACGT'),
            Sequence('ACGA', metadata={'id': 'seq1'}),
            Sequence('AAAA', metadata={'id': 'seq2'}),
            Sequence('AAAA', positional_metadata={'qual': range(4)}),
        ]

        expected = DistanceMatrix(
            [[0, 0.25, 0.75, 0.75],
             [0.25, 0.0, 0.5, 0.5],
             [0.75, 0.5, 0.0, 0.0],
             [0.75, 0.5, 0.0, 0.0]],
            ['a', 'b', 'c', 'd'])

        observed = DistanceMatrix.from_iterable(
            records, metric=skbio.sequence.distance.hamming, keys=labels)

        self.assertEqual(observed, expected)
示例#22
0
def hamming_distance_matrix(msa, ignore_sequence_ids=False):
    """Compute the Hamming distance matrix of an MSA.

    Parameters
    ----------
    msa : skbio TabularMSA
        Aligned sequences for calculating pairwise Hamming distances
    ignore_sequence_ids : bool
        Default is False. If true, the sequence identifiers of the alignment
        are not used (``key=None`` instead of ``key='id'``). Useful when an
        alignment program truncated identifiers so that different sequences
        collapse to the same identifier.

    Returns
    -------
    skbio DistanceMatrix
    """
    id_key = None if ignore_sequence_ids else 'id'
    return DistanceMatrix.from_iterable(
        msa, hamming, key=id_key, validate=False)
示例#23
0
文件: util.py 项目: ebolyen/gneiss
def random_tree(n):
    """ Generates a tree with random topology.

    ``n`` random values are drawn, clustered with Ward linkage on their
    absolute differences, and the resulting tree has its internal nodes
    renamed by ``rename_internal_nodes``.

    Parameters
    ----------
    n : int
        Number of random values drawn; one id ('0' .. 'n-1') is created per
        value (presumably one tip each -- the original text said "nodes").

    Returns
    -------
    skbio.TreeNode
        Random tree
    """
    x = np.random.rand(n)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
    lm = ward(dm.condensed_form())
    # ``np.str`` was an alias of the builtin ``str`` that was removed in
    # NumPy 1.24; the builtin produces the same string ids.
    ids = np.arange(len(x)).astype(str)
    t = TreeNode.from_linkage_matrix(lm, ids)
    t = rename_internal_nodes(t)
    return t
示例#24
0
def hamming_distance_matrix(msa, ignore_sequence_ids=False):
    """Return pairwise Hamming distances between the sequences of an MSA.

    Parameters
    ----------
    msa : skbio TabularMSA
        Aligned sequences for calculating pairwise Hamming distances
    ignore_sequence_ids : bool
        Default is False. If true, ignore sequence identifier of alignment.
        Useful if identifier got truncated by alignment producing program such
        that different sequences collapse to the same identifier.

    Returns
    -------
    skbio DistanceMatrix
    """
    if ignore_sequence_ids:
        # No key: the ids of the sequences are not consulted.
        return DistanceMatrix.from_iterable(
            msa, hamming, key=None, validate=False)
    return DistanceMatrix.from_iterable(
        msa, hamming, key='id', validate=False)
示例#25
0
    def setUp(self):
        """Create the table, dendrogram, metadata and highlight fixtures.

        The RNG is seeded so the random table, the clustered values and the
        random branch lengths are reproducible.
        """
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)),
                                  index=['0', '1', '2', '3', '4'],
                                  columns=['0', '1', '2', '3', '4'])

        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # Builtin ``str`` replaces ``np.str``, an alias removed in
        # NumPy 1.24.
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        self.t = SquareDendrogram.from_tree(t)
        self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                            index=['0', '1', '2', '3', '4'])
        # Name internal nodes by postorder index; random length per node.
        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3

        self.highlights = pd.DataFrame({'y8': ['#FF0000', '#00FF00'],
                                        'y6': ['#0000FF', '#F0000F']}).T
示例#26
0
    def test_from_iterable_skbio_hamming_metric_with_metadata(self):
        """skbio's hamming metric works on sequences that carry metadata."""
        # test for #1254
        plain = Sequence('ACGT')
        first_named = Sequence('ACGA', metadata={'id': 'seq1'})
        second_named = Sequence('AAAA', metadata={'id': 'seq2'})
        with_quality = Sequence('AAAA', positional_metadata={'qual': range(4)})
        seqs = [plain, first_named, second_named, with_quality]

        rows = [
            [0, 0.25, 0.75, 0.75],
            [0.25, 0.0, 0.5, 0.5],
            [0.75, 0.5, 0.0, 0.0],
            [0.75, 0.5, 0.0, 0.0],
        ]
        want = DistanceMatrix(rows, ['a', 'b', 'c', 'd'])

        got = DistanceMatrix.from_iterable(
            seqs,
            metric=skbio.sequence.distance.hamming,
            keys=['a', 'b', 'c', 'd'])

        self.assertEqual(got, want)
示例#27
0
    def setUp(self):
        """Build the seeded table, dendrogram, metadata and highlights.

        Seeding the RNG makes the random table, the clustered values and the
        random branch lengths reproducible between runs.
        """
        np.random.seed(0)
        self.table = pd.DataFrame(np.random.random((5, 5)),
                                  index=['0', '1', '2', '3', '4'],
                                  columns=['0', '1', '2', '3', '4'])

        num_otus = 5  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # ``np.str`` no longer exists in NumPy >= 1.24; it was an alias of
        # the builtin ``str``.
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        self.t = SquareDendrogram.from_tree(t)
        self.md = pd.Series(['a', 'a', 'a', 'b', 'b'],
                            index=['0', '1', '2', '3', '4'])
        # Name internal nodes by postorder index; random length per node.
        for i, n in enumerate(t.postorder()):
            if not n.is_tip():
                n.name = "y%d" % i
            n.length = np.random.rand() * 3

        self.highlights = pd.DataFrame({
            'y8': ['#FF0000', '#00FF00'],
            'y6': ['#0000FF', '#F0000F']
        }).T
示例#28
0
def embad(table):
    """
    Calculates the pairwise Earth Mover's distance.

    Assumes that the table is sorted.

    Parameters
    ----------
    table : pd.DataFrame
        Contingency table where the columns are features and the
        rows are samples.

    Returns
    -------
    skbio.DistanceMatrix
        Pairwise distance matrix of Earth Mover's distances
    """
    numsamples, numfeatures = table.shape
    # Identity ordering of the rows, used to label the result below.
    sample_permutation = range(numsamples)

    # Ground distance between feature positions i and j is |i - j|; built
    # with broadcasting rather than a nested Python loop (same float64
    # values).
    positions = np.arange(numfeatures, dtype=np.float64)
    D = np.abs(positions[:, np.newaxis] - positions[np.newaxis, :])

    distance_metric = partial(emd, distance_matrix=D)
    # ``np.float`` was an alias of the builtin ``float`` and was removed in
    # NumPy 1.24; the builtin yields the same float64 array.
    table_values = table.values.astype(float)
    sample_distance = DistanceMatrix.from_iterable(
        np.ascontiguousarray(table_values), distance_metric)
    sample_distance.ids = table.index[sample_permutation]
    return sample_distance
示例#29
0
 def test_from_iterable_empty(self):
     """An empty iterable is rejected with DissimilarityMatrixError."""
     def identity(x):
         return x

     with self.assertRaises(DissimilarityMatrixError):
         DistanceMatrix.from_iterable([], identity)
示例#30
0
 def test_from_iterable_validate_asym(self):
     """A signed (asymmetric) metric raises DistanceMatrixError."""
     def signed_diff(a, b):
         return b - a

     values = (x for x in range(4))
     with self.assertRaises(DistanceMatrixError):
         DistanceMatrix.from_iterable(values, signed_diff)
# Apply the tree style `ts` (defined earlier) to a randomly populated
# 10-leaf tree and render it inline.

t = ete3.Tree()
t.populate(10)
t.render("%%inline", tree_style=ts)

# Distance-based methods for phylogenetic reconstruction

# The next approach relies on computing the distances between the sequences.

# We will use a dissimilarity distance between two objects x and y.
# Literature on this can be found online; for now we only show the code.

from skbio import DistanceMatrix

# A small hand-written, symmetric distance matrix with ids 'a', 'b' and 'c'.
dm = DistanceMatrix([[0.0, 1.0, 2.0], [1.0, 0.0, 3.0], [2.0, 3.0, 0.0]],
                    ids=['a', 'b', 'c'])

# The result is bound to `_` (presumably to suppress notebook output).
_ = dm.plot(cmap='Greens')

# scikit-bio's DistanceMatrix objects can be viewed as heatmaps. Next, one
# is built from the sequences using the k-mer distance.

from BioinformaticsCode.algorithms import kmer_distance

kmer_dm = DistanceMatrix.from_iterable(sequences,
                                       metric=kmer_distance,
                                       key='id')
_ = kmer_dm.plot(cmap='Greens', title='3mer distances between sequences')

# NOTE(review): bare attribute access -- the method is not called here;
# possibly a truncated line.
kmer_dm.plot
示例#32
0
from skbio import DistanceMatrix
from skbio.tree import nj

# Nested mapping of pairwise distances: distance_matrix[code_a][code_b].
distance_matrix = {}
codes_set = set()

fname = sys.argv[1]

# Each record is fixed-width: columns 0-40 and 41-80 hold the two codes and
# the remainder holds a value v that is stored as the distance 1 - v.
# The file is opened in a ``with`` block so the handle is always closed.
with open(fname) as handle:
    for line in handle:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        code1 = line[0:41].strip()
        code2 = line[41:81].strip()
        distance = 1 - float(line[81:].strip())
        # Store the distance in both directions.
        distance_matrix.setdefault(code1, {})[code2] = distance
        distance_matrix.setdefault(code2, {})[code1] = distance
        codes_set.add(code1)
        codes_set.add(code2)
        # Every code is at distance zero from itself.
        distance_matrix[code1][code1] = 0.0
        distance_matrix[code2][code2] = 0.0


def distance_function(x, y):
    """Look up the stored distance between codes ``x`` and ``y``."""
    return distance_matrix[x][y]


def label_function(x):
    """Return the code with all spaces removed."""
    return x.replace(' ', '')


# ``label_function`` is passed as the third positional argument.
dm = DistanceMatrix.from_iterable(codes_set, distance_function, label_function)

tree = nj(dm, True)
print(tree.ascii_art())
示例#33
0
    def test_basic_plot(self):
        """radialplot exposes the expected edge and node data for 3 tips.

        A seeded Ward tree is converted to an UnrootedDendrogram, decorated
        with colors, sizes and random lengths, and the data of the first two
        renderers of the plot are compared with the stored expectations.
        """
        self.maxDiff = None
        exp_edges = {'dest_node': ['0', '1', '2', 'y3'],
                     'edge_color': ['#00FF00', '#00FF00',
                                    '#00FF00', '#FF0000'],
                     'edge_width': [2, 2, 2, 2],
                     'src_node': ['y3', 'y4', 'y3', 'y4'],
                     'x0': [338.2612593838583,
                            193.1688862557773,
                            338.2612593838583,
                            193.1688862557773],
                     'x1': [487.5, 12.499999999999972,
                            324.89684138234867, 338.2612593838583],
                     'y0': [271.7282256126416,
                            365.95231443706376,
                            271.7282256126416,
                            365.95231443706376],
                     'y1': [347.7691620070637,
                            483.2800610261029,
                            16.719938973897143,
                            271.7282256126416]}

        exp_nodes = {'child0': [np.nan, np.nan, np.nan, '0', '1'],
                     'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
                     'color': ['#1C9099', '#1C9099', '#1C9099',
                               '#FF999F', '#FF999F'],
                     'hover_var': [None, None, None, None, None],
                     'is_tip': [True, True, True, False, False],
                     'node_size': [10, 10, 10, 10, 10],
                     'x': [487.5,
                           12.499999999999972,
                           324.89684138234867,
                           338.26125938385832,
                           193.16888625577729],
                     'y': [347.7691620070637,
                           483.28006102610289,
                           16.719938973897143,
                           271.72822561264161,
                           365.95231443706376]}
        np.random.seed(0)
        num_otus = 3  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x - y))
        lm = ward(dm.condensed_form())
        # Builtin ``str`` replaces the ``np.str`` alias removed in
        # NumPy 1.24.
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        t = UnrootedDendrogram.from_tree(t)
        # incorporate colors in tree
        for i, n in enumerate(t.postorder(include_self=True)):
            if not n.is_tip():
                n.name = "y%d" % i
                n.color = '#FF999F'
                n.edge_color = '#FF0000'
                n.node_size = 10
            else:
                n.color = '#1C9099'
                n.edge_color = '#00FF00'
                n.node_size = 10
            n.length = np.random.rand() * 3
            n.edge_width = 2
        p = radialplot(t, node_color='color', edge_color='edge_color',
                       node_size='node_size', edge_width='edge_width')

        # The first renderer is compared with the edge expectations.
        for e in exp_edges.keys():
            self.assertListEqual(
                list(p.renderers[0].data_source.data[e]),
                exp_edges[e])

        # The second renderer is compared with the node expectations.
        for e in exp_nodes.keys():
            self.assertListEqual(
                list(p.renderers[1].data_source.data[e]),
                exp_nodes[e])

        self.assertTrue(isinstance(t, TreeNode))
示例#34
0
 def test_from_iterable_with_key_and_keys(self):
     """Passing both `key` and `keys` raises ValueError."""
     def abs_diff(a, b):
         return abs(b - a)

     values = (x for x in range(4))
     with self.assertRaises(ValueError):
         DistanceMatrix.from_iterable(
             values, abs_diff, key=str, keys=['1', '2', '3', '4'])
示例#35
0
 def test_from_iterable_single(self):
     """A single element yields the 1x1 zero matrix."""
     expected = DistanceMatrix([[0]])
     observed = DistanceMatrix.from_iterable(["boo"], lambda _: 100)
     self.assertEqual(observed, expected)
示例#36
0
 def test_from_iterable_empty(self):
     """from_iterable raises DissimilarityMatrixError for empty input."""
     nothing = []
     with self.assertRaises(DissimilarityMatrixError):
         DistanceMatrix.from_iterable(nothing, lambda x: x)
示例#37
0
 def test_from_iterable_validate_asym(self):
     """With default validation an asymmetric metric is rejected."""
     source = (x for x in range(4))
     with self.assertRaises(DistanceMatrixError):
         DistanceMatrix.from_iterable(source, lambda a, b: b - a)
def progressive_msa(sequences, pairwise_aligner, guide_tree=None,
                    metric=kmer_distance):
    """ Perform progressive msa of sequences

    The two children of the guide tree's root are aligned recursively: a tip
    contributes its sequence, an internal node the alignment of its subtree.
    The two results are then merged with ``pairwise_aligner``.

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned. Each must provide ``metadata['id']``,
        which is matched against the tip names of the guide tree.
    metric : function, optional
      Function that returns a single distance value when given a pair of
      skbio.Sequence objects. This will be used to build a guide tree if one
      is not provided.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process. Its
        root must have exactly two children.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.

    Returns
    -------
    skbio.TabularMSA

    """

    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
                        sequences, metric=metric, key='id')
        guide_lm = sp.cluster.hierarchy.average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    seq_lookup = {s.metadata['id']: s for s in sequences}
    c1, c2 = guide_tree.children
    if c1.is_tip():
        c1_aln = seq_lookup[c1.name]
    else:
        c1_aln = progressive_msa(sequences, pairwise_aligner, c1)

    if c2.is_tip():
        c2_aln = seq_lookup[c2.name]
    else:
        c2_aln = progressive_msa(sequences, pairwise_aligner, c2)

    alignment, _, _ = pairwise_aligner(c1_aln, c2_aln)
    # this is a temporary hack as the aligners in skbio 0.4.1 are dropping
    # metadata - this makes sure that the right metadata is associated with
    # the sequence after alignment
    if isinstance(c1_aln, Sequence):
        alignment[0].metadata = c1_aln.metadata
        len_c1_aln = 1
    else:
        for i in range(len(c1_aln)):
            alignment[i].metadata = c1_aln[i].metadata
        len_c1_aln = len(c1_aln)
    if isinstance(c2_aln, Sequence):
        # The rows of the second input follow all rows of the first one, so
        # a lone second sequence sits at ``len_c1_aln``. A fixed index of 1
        # is only right when the first input is a single sequence; otherwise
        # it overwrites the metadata of a row from the first alignment.
        alignment[len_c1_aln].metadata = c2_aln.metadata
    else:
        for i in range(len(c2_aln)):
            alignment[len_c1_aln + i].metadata = c2_aln[i].metadata

    return alignment
示例#39
0
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 20 14:18:58 2016

@author: virginiasaulnier
"""
from io import StringIO
from skbio import DistanceMatrix
from skbio import fisher_alpha

# In-memory tab-separated text: a header row naming the IDs (a, b, c), then
# one row per ID. The values are symmetric with zeros on the diagonal.
dm_fh =StringIO("\ta\tb\tc\n"
                "a\t0.0\t0.5\t1.0\n"
                "b\t0.5\t0.0\t0.75\n"
                "c\t1.0\t0.75\t0.0\n")
                
# Parse the buffer into a DistanceMatrix and display it.
dm = DistanceMatrix.read(dm_fh)
print(dm)

# NOTE(review): iterating a StringIO yields lines, and this text contains no
# newline, so the iterable below appears to hold a single item (the whole
# string) rather than nine comma-separated pairs -- confirm the intent.
my_pairs= StringIO("ac,gt,cg,gc,at,ta,gc,ta,tg")
# NOTE(review): `fisher_alpha()` is *called* here with no arguments and its
# result (not the function itself) is handed over as `metric`. skbio documents
# fisher_alpha as taking a vector of counts, so this call presumably raises
# TypeError before from_iterable runs -- confirm and replace with a two-argument
# distance function. `key=id` refers to the builtin `id` unless that name is
# rebound elsewhere in the file.
dm2 = DistanceMatrix.from_iterable(my_pairs,metric= fisher_alpha(),key=id)
示例#40
0
# Alignment: when no degapped alignment file exists yet, obtain one by calling
# get_reduced_alignment on this gene's masked FASTA path; otherwise read the
# existing file as a DNA TabularMSA and report its shape on stderr.
if not os.path.isfile(degapped_alignment_fn):
    angio_msa_nogap_noshort = get_reduced_alignment(
        "genes/{}/FNA2AA-upp-masked.fasta".format(gene),
        angio_1kp_ids)
else:
    angio_msa_nogap_noshort = TabularMSA.read(
        degapped_alignment_fn, constructor=DNA)
    sys.stderr.write(
        "Read in degapped alignment: {}\n".format(
            angio_msa_nogap_noshort.shape))

# Distance matrix: when no matrix file exists yet, compute pairwise distances
# over the alignment with p_distance (keyed by "id") and write the data frame
# to CSV; otherwise read the stored matrix and note that on stderr.
if not os.path.isfile(distance_matrix_fn):
    p_dm = DistanceMatrix.from_iterable(
        angio_msa_nogap_noshort, metric=p_distance, key="id")
    p_dm_df = p_dm.to_data_frame()
    p_dm_df.to_csv(
        "onekp_only_angios_pdistance/{}_angio_p_dm.csv".format(gene))
else:
    p_dm = DistanceMatrix.read(distance_matrix_fn)
    p_dm_df = p_dm.to_data_frame()
    sys.stderr.write("Read in pre-determined distance matrix!\n")

# Cluster sequences

divergent_seqs_medoids = []
runs = {}
best_run = len(p_dm_df)
best_run_idx = (6, 0)
for k, i in itertools.product(range(6, 16), range(100)):
    try:
        medoids, membership = kMedoids(p_dm, k)
        medoid_dist = p_dm_df[p_dm_df.ix[medoids].index].apply(min, 1)
示例#41
0
    def test_basic_plot(self):
        """Check the edge and node data radialplot produces for a seeded tree.

        Seeds NumPy's global RNG with 0, draws three random values, clusters
        them with ward linkage on their absolute differences, and converts
        the resulting tree to an UnrootedDendrogram. Each node is then given
        colour, size, width and a random length before radialplot is called.
        The data of ``p.renderers[0]`` is compared against ``exp_edges`` and
        the data of ``p.renderers[1]`` against ``exp_nodes``.
        """
        self.maxDiff = None
        exp_edges = {
            'dest_node': ['0', '1', '2', 'y3'],
            'edge_color': ['#00FF00', '#00FF00', '#00FF00', '#FF0000'],
            'edge_width': [2, 2, 2, 2],
            'src_node': ['y3', 'y4', 'y3', 'y4'],
            'x0': [
                338.2612593838583, 193.1688862557773, 338.2612593838583,
                193.1688862557773
            ],
            'x1':
            [487.5, 12.499999999999972, 324.89684138234867, 338.2612593838583],
            'y0': [
                271.7282256126416, 365.95231443706376, 271.7282256126416,
                365.95231443706376
            ],
            'y1': [
                347.7691620070637, 483.2800610261029, 16.719938973897143,
                271.7282256126416
            ]
        }

        exp_nodes = {
            'child0': [np.nan, np.nan, np.nan, '0', '1'],
            'child1': [np.nan, np.nan, np.nan, '2', 'y3'],
            'color': ['#1C9099', '#1C9099', '#1C9099', '#FF999F', '#FF999F'],
            'hover_var': [None, None, None, None, None],
            'is_tip': [True, True, True, False, False],
            'node_size': [10, 10, 10, 10, 10],
            'x': [
                12.499999999999972, 487.5, 324.89684138234867,
                338.26125938385832, 193.16888625577729
            ],
            'y': [
                483.28006102610289, 347.7691620070637, 16.719938973897143,
                271.72822561264161, 365.95231443706376
            ]
        }
        # The seed must be set before any draw: both the input values and the
        # per-node lengths assigned below come from the global RNG.
        np.random.seed(0)
        num_otus = 3  # otus
        x = np.random.rand(num_otus)
        dm = DistanceMatrix.from_iterable(x, lambda a, b: np.abs(a - b))
        lm = ward(dm.condensed_form())
        # Builtin `str` replaces the `np.str` alias, which NumPy has removed;
        # the alias pointed at the builtin, so the resulting dtype is the same.
        t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
        t = UnrootedDendrogram.from_tree(t)
        # incorporate colors in tree; internal nodes are named after their
        # postorder index, which is where 'y3' and 'y4' above come from
        for i, n in enumerate(t.postorder(include_self=True)):
            if not n.is_tip():
                n.name = "y%d" % i
                n.color = '#FF999F'
                n.edge_color = '#FF0000'
                n.node_size = 10
            else:
                n.color = '#1C9099'
                n.edge_color = '#00FF00'
                n.node_size = 10
            n.length = np.random.rand() * 3
            n.edge_width = 2
        p = radialplot(t,
                       node_color='color',
                       edge_color='edge_color',
                       node_size='node_size',
                       edge_width='edge_width')

        edge_data = p.renderers[0].data_source.data
        for key, expected in exp_edges.items():
            # Inspect the elements, not the list: the list itself is never a
            # float, so checking it directly would skip the tolerance-based
            # comparison for every coordinate column.
            if all(isinstance(value, float) for value in expected):
                npt.assert_allclose(edge_data[key], np.array(expected))
            else:
                self.assertListEqual(list(edge_data[key]), expected)

        node_data = p.renderers[1].data_source.data
        for key, expected in exp_nodes.items():
            self.assertListEqual(list(node_data[key]), expected)

        self.assertIsInstance(t, TreeNode)
示例#42
0
 def test_from_iterable_with_key_and_keys(self):
     """Passing both `key` and `keys` to from_iterable raises ValueError."""
     values = (n for n in range(4))

     def abs_diff(a, b):
         return abs(b - a)

     self.assertRaises(ValueError, DistanceMatrix.from_iterable, values,
                       abs_diff, key=str, keys=['1', '2', '3', '4'])
示例#43
0
 def test_from_iterable_single(self):
     """A one-item iterable gives a matrix equal to DistanceMatrix([[0]])."""
     def always_zero(a, b):
         return 0

     expected = DistanceMatrix([[0]])
     observed = DistanceMatrix.from_iterable(["boo"], always_zero)
     self.assertEqual(observed, expected)
示例#44
0
# Turn every column of `rows` except the first into one sample: the first row
# is skipped and each remaining cell in that column is converted to float.
for col in range(1, len(rows[0])):
    samples.append([float(rows[r][col]) for r in range(1, len(rows))])
"""
only_samples = ['LR', 'SR']
new_samples, new_names = [], []
for a in range(len(sample_names)):
    for b in range(len(only_samples)):
        if sample_names[a] == only_samples[b]:
            new_samples.append(samples[a])
            new_names.append(sample_names[a])
samples = new_samples
sample_names = new_names
print(len(samples), len(sample_names))
"""

# Build the sample-by-sample distance matrix with the `braycurtis` metric.
sam_dm = dm.from_iterable(samples, metric=braycurtis)

# All three tests below receive the same matrix, grouping and options.
shared_args = (sam_dm, sample_names)
shared_opts = {'column': None, 'permutations': 999}

pdisp = permdisp(*shared_args, test='median', **shared_opts)
print(pdisp)

asim = anosim(*shared_args, **shared_opts)
print(asim)

perm = permanova(*shared_args, **shared_opts)
print(perm)