示例#1
0
    def setUp(self):
        # Distance matrices with and without ties in the ranks, with 2 groups
        # of equal size.
        dm_ids = ['s1', 's2', 's3', 's4']
        grouping_equal = ['Control', 'Control', 'Fast', 'Fast']

        self.dm_ties = DistanceMatrix(
            [[0, 1, 1, 4], [1, 0, 3, 2], [1, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        self.dm_no_ties = DistanceMatrix(
            [[0, 1, 5, 4], [1, 0, 3, 2], [5, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        # Test with 3 groups of unequal size. This data also generates a
        # negative R statistic.
        grouping_unequal = [
            'Control', 'Treatment1', 'Treatment2', 'Treatment1', 'Control',
            'Control'
        ]

        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            ['s1', 's2', 's3', 's4', 's5', 's6'])

        self.anosim_ties = ANOSIM(self.dm_ties, grouping_equal)
        self.anosim_no_ties = ANOSIM(self.dm_no_ties, grouping_equal)
        self.anosim_unequal = ANOSIM(self.dm_unequal, grouping_unequal)
示例#2
0
    def setUp(self):
        # Distance matrices with and without ties in the ranks, with 2 groups
        # of equal size.
        dm_ids = ['s1', 's2', 's3', 's4']
        grouping_equal = ['Control', 'Control', 'Fast', 'Fast']

        self.dm_ties = DistanceMatrix(
            [[0, 1, 1, 4], [1, 0, 3, 2], [1, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        self.dm_no_ties = DistanceMatrix(
            [[0, 1, 5, 4], [1, 0, 3, 2], [5, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        # Test with 3 groups of unequal size.
        grouping_unequal = [
            'Control', 'Treatment1', 'Treatment2', 'Treatment1', 'Control',
            'Control'
        ]

        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            ['s1', 's2', 's3', 's4', 's5', 's6'])

        self.permanova_ties = PERMANOVA(self.dm_ties, grouping_equal)
        self.permanova_no_ties = PERMANOVA(self.dm_no_ties, grouping_equal)
        self.permanova_unequal = PERMANOVA(self.dm_unequal, grouping_unequal)
示例#3
0
    def setUp(self):
        # Distance matrices with and without ties in the ranks, with 2 groups
        # of equal size.
        dm_ids = ['s1', 's2', 's3', 's4']
        grouping_equal = ['Control', 'Control', 'Fast', 'Fast']
        df = pd.read_csv(StringIO(
            'ID,Group\ns2,Control\ns3,Fast\ns4,Fast\ns5,Control\n'
            's1,Control'),
                         index_col=0)

        self.dm_ties = DistanceMatrix(
            [[0, 1, 1, 4], [1, 0, 3, 2], [1, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        self.dm_no_ties = DistanceMatrix(
            [[0, 1, 5, 4], [1, 0, 3, 2], [5, 3, 0, 3], [4, 2, 3, 0]], dm_ids)

        # Test with 3 groups of unequal size.
        grouping_unequal = [
            'Control', 'Treatment1', 'Treatment2', 'Treatment1', 'Control',
            'Control'
        ]

        self.dm_unequal = DistanceMatrix(
            [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
             [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
             [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
             [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
             [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
             [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
            ['s1', 's2', 's3', 's4', 's5', 's6'])

        self.permanova_ties = PERMANOVA(self.dm_ties, grouping_equal)
        self.permanova_no_ties = PERMANOVA(self.dm_no_ties, grouping_equal)
        self.permanova_ties_df = PERMANOVA(self.dm_ties, df, column='Group')
        self.permanova_unequal = PERMANOVA(self.dm_unequal, grouping_unequal)
示例#4
0
    def test_init_invalid_input(self):
        """Raises error on invalid distance matrix data / IDs."""
        # Asymmetric.
        data = [[0.0, 2.0], [1.0, 0.0]]
        with self.assertRaises(DistanceMatrixError):
            _ = DistanceMatrix(data, ['a', 'b'])

        # Ensure that the superclass validation is still being performed.
        with self.assertRaises(DissimilarityMatrixError):
            _ = DistanceMatrix([[1, 2, 3]], ['a'])
示例#5
0
    def test_distance_matrix_instances_as_input(self):
        # IDs shouldn't matter -- the function should only care about the
        # matrix data
        dmx = DistanceMatrix(self.minx)
        dmy = DistanceMatrix(self.miny, ['no', 'cog', 'yay'])

        np.random.seed(0)

        obs = mantel(dmx, dmy, alternative='less')

        self.assertAlmostEqual(obs[0], self.exp_x_vs_y)
        self.assertAlmostEqual(obs[1], 0.843)
示例#6
0
    def setUp(self):
        super(DistanceMatrixTests, self).setUp()

        self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
        self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
        self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])

        self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]
        self.dm_condensed_forms = [
            np.array([]),
            np.array([0.123]),
            np.array([0.01, 4.2, 12.0])
        ]
示例#7
0
    def test_compute_q(self):
        expected_data = [[0, -50, -38, -34, -34], [-50, 0, -38, -34, -34],
                         [-38, -38, 0, -40, -40], [-34, -34, -40, 0, -48],
                         [-34, -34, -40, -48, 0]]
        expected_ids = list('abcde')
        expected = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(_compute_q(self.dm1), expected)

        data = [[0, 3, 2], [3, 0, 3], [2, 3, 0]]
        dm = DistanceMatrix(data, list('abc'))
        # computed this manually
        expected_data = [[0, -8, -8], [-8, 0, -8], [-8, -8, 0]]
        expected = DistanceMatrix(expected_data, list('abc'))
        self.assertEqual(_compute_q(dm), expected)
示例#8
0
    def test_compute_collapsed_dm(self):
        expected_data = [[0, 7, 7, 6], [7, 0, 8, 7], [7, 8, 0, 3],
                         [6, 7, 3, 0]]
        expected_ids = ['x', 'c', 'd', 'e']
        expected1 = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(_compute_collapsed_dm(self.dm1, 'a', 'b', True, 'x'),
                         expected1)

        # computed manually
        expected_data = [[0, 4, 3], [4, 0, 3], [3, 3, 0]]
        expected_ids = ['yy', 'd', 'e']
        expected2 = DistanceMatrix(expected_data, expected_ids)
        self.assertEqual(
            _compute_collapsed_dm(expected1, 'x', 'c', True, 'yy'), expected2)
示例#9
0
 def test_distances(self):
     """distances functions as expected
     """
     expected = [[0, 6. / 13, 4. / 13], [6. / 13, 0, 7. / 13],
                 [4. / 13, 7. / 13, 0]]
     expected = DistanceMatrix(expected, ['d1', 'd2', 'd3'])
     actual = self.a1.distances()
     self.assertEqual(actual, expected)
示例#10
0
def compute_aligned_sequence_distances(seqs, distance_fn=hamming_distance):
    dm = []
    ids = []
    for id1, seq1 in seqs:
        ids.append(id1)
        row = []
        for id2, seq2 in seqs:
            row.append(hamming_distance(seq1, seq2))
        dm.append(row)
    return DistanceMatrix(dm, ids)
示例#11
0
    def test_permute_not_condensed(self):
        obs = self.dm_1x1.permute()
        self.assertEqual(obs, self.dm_1x1)
        self.assertFalse(obs is self.dm_1x1)

        obs = self.dm_2x2.permute()
        self.assertEqual(obs, self.dm_2x2)
        self.assertFalse(obs is self.dm_2x2)

        np.random.seed(0)

        exp = DistanceMatrix([[0, 12, 4.2], [12, 0, 0.01], [4.2, 0.01, 0]],
                             self.dm_3x3.ids)
        obs = self.dm_3x3.permute()
        self.assertEqual(obs, exp)

        exp = DistanceMatrix([[0, 4.2, 12], [4.2, 0, 0.01], [12, 0.01, 0]],
                             self.dm_3x3.ids)
        obs = self.dm_3x3.permute()
        self.assertEqual(obs, exp)
示例#12
0
    def test_random_fn(self):
        """Test passing a different random function than the default."""
        def myrand(num_rows, num_cols):
            # One dm to rule them all...
            data = np.empty((num_rows, num_cols))
            data.fill(42)
            return data

        exp = DistanceMatrix(
            np.asarray([[0, 42, 42], [42, 0, 42], [42, 42, 0]]),
            ['1', '2', '3'])
        obs = randdm(3, random_fn=myrand)
        self.assertEqual(obs, exp)
示例#13
0
    def test_init_from_dm(self):
        """Constructs a dm from a dm."""
        ids = ['foo', 'bar', 'baz']

        # DissimilarityMatrix -> DissimilarityMatrix
        exp = DissimilarityMatrix(self.dm_3x3_data, ids)
        obs = DissimilarityMatrix(self.dm_3x3, ids)
        self.assertEqual(obs, exp)
        # Test that copy of data is not made.
        self.assertTrue(obs.data is self.dm_3x3.data)
        obs.data[0, 1] = 424242
        self.assertTrue(np.array_equal(obs.data, self.dm_3x3.data))

        # DistanceMatrix -> DissimilarityMatrix
        exp = DissimilarityMatrix(self.dm_3x3_data, ids)
        obs = DissimilarityMatrix(
            DistanceMatrix(self.dm_3x3_data, ('a', 'b', 'c')), ids)
        self.assertEqual(obs, exp)

        # DissimilarityMatrix -> DistanceMatrix
        with self.assertRaises(DistanceMatrixError):
            _ = DistanceMatrix(self.dm_2x2_asym, ['foo', 'bar'])
示例#14
0
    def setUp(self):
        data1 = [[0, 5, 9, 9, 8], [5, 0, 10, 10, 9], [9, 10, 0, 8, 7],
                 [9, 10, 8, 0, 3], [8, 9, 7, 3, 0]]
        ids1 = list('abcde')
        self.dm1 = DistanceMatrix(data1, ids1)
        # this newick string was confirmed against http://www.trex.uqam.ca/
        # which generated the following (isomorphic) newick string:
        # (d:2.0000,e:1.0000,(c:4.0000,(a:2.0000,b:3.0000):3.0000):2.0000);
        self.expected1_str = ("(d:2.000000, (c:4.000000, (b:3.000000,"
                              " a:2.000000):3.000000):2.000000, e:1.000000);")
        self.expected1_TreeNode = TreeNode.from_newick(self.expected1_str)

        # this example was pulled from the Phylip manual
        # http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
        data2 = [[0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905],
                 [1.6866, 0.0000, 1.5232, 1.4841, 1.4465, 1.4389, 1.4629],
                 [1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583],
                 [1.6606, 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710],
                 [1.5243, 1.4465, 0.5958, 0.4631, 0.0000, 0.3484, 0.3083],
                 [1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692],
                 [1.5905, 1.4629, 0.5583, 0.4710, 0.3083, 0.2692, 0.0000]]
        ids2 = [
            "Bovine", "Mouse", "Gibbon", "Orang", "Gorilla", "Chimp", "Human"
        ]
        self.dm2 = DistanceMatrix(data2, ids2)
        self.expected2_str = ("(Mouse:0.76891, (Gibbon:0.35793, (Orang:0.28469"
                              ", (Gorilla:0.15393, (Chimp:0.15167, Human:0.117"
                              "53):0.03982):0.02696):0.04648):0.42027, Bovine:"
                              "0.91769);")
        self.expected2_TreeNode = TreeNode.from_newick(self.expected2_str)

        data3 = [[0, 5, 4, 7, 6, 8], [5, 0, 7, 10, 9, 11], [4, 7, 0, 7, 6, 8],
                 [7, 10, 7, 0, 5, 8], [6, 9, 6, 5, 0, 8], [8, 11, 8, 8, 8, 0]]
        ids3 = map(str, range(6))
        self.dm3 = DistanceMatrix(data3, ids3)
        self.expected3_str = ("((((0:1.000000,1:4.000000):1.000000,2:2.000000"
                              "):1.250000,5:4.750000):0.750000,3:2.750000,4:2."
                              "250000);")
        self.expected3_TreeNode = TreeNode.from_newick(self.expected3_str)
示例#15
0
    def test_tip_tip_distances_endpoints(self):
        """Test getting specifc tip distances  with tipToTipDistances"""
        t = TreeNode.from_newick('((H:1,G:1):2,(R:0.5,M:0.7):3);')
        nodes = [t.find('H'), t.find('G'), t.find('M')]
        names = ['H', 'G', 'M']
        exp = DistanceMatrix(np.array([[0, 2.0, 6.7],
                                       [2.0, 0, 6.7],
                                       [6.7, 6.7, 0.0]]), ['H', 'G', 'M'])

        obs = t.tip_tip_distances(endpoints=names)
        self.assertEqual(obs, exp)

        obs = t.tip_tip_distances(endpoints=nodes)
        self.assertEqual(obs, exp)
示例#16
0
def get_clusters(x_original, axis=['row', 'column'][0]):
    """Performs UPGMA clustering using euclidean distances"""
    x = x_original.copy()
    if axis == 'column':
        x = x.T
    nr = x.shape[0]
    metric_f = get_nonphylogenetic_metric('euclidean')
    row_dissims = DistanceMatrix(metric_f(x), map(str, range(nr)))
    # do upgma - rows
    # Average in SciPy's cluster.heirarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)
    row_order = [int(tip.name) for tip in tree.tips()]
    return row_order
示例#17
0
    def distances(self):
        """Compute distances between all pairs of sequences

        Returns
        -------
        skbio.core.distance.DistanceMatrix
            Matrix containing the distances between all pairs of sequences.

        Raises
        ------
        skbio.core.exception.BiologicalSequenceError
            If ``len(self) != len(other)``.

        See Also
        --------
        skbio.core.distance.DistanceMatrix
        scipy.spatial.distance.hamming

        Notes
        -----
        Distances between sequences are computed as hamming distances, though
        this will be generalized (see #194).

        Examples
        --------
        >>> from skbio.core.alignment import Alignment
        >>> from skbio.core.sequence import DNA
        >>> seqs = [DNA("A-CCGGG", identifier="s1"),
        ...         DNA("ATCC--G", identifier="s2"),
        ...         DNA("ATCCGGA", identifier="s3")]
        >>> a1 = Alignment(seqs)
        >>> print a1.distances()
        3x3 distance matrix
        IDs:
        s1, s2, s3
        Data:
        [[ 0.          0.42857143  0.28571429]
         [ 0.42857143  0.          0.42857143]
         [ 0.28571429  0.42857143  0.        ]]
        """
        sequence_count = self.sequence_count()
        dm = np.zeros((sequence_count, sequence_count))
        identifiers = []
        for i in xrange(sequence_count):
            self_i = self[i]
            identifiers.append(self_i.identifier)
            for j in xrange(i):
                dm[i, j] = dm[j, i] = self_i.distance(self[j])
        return DistanceMatrix(dm, identifiers)
示例#18
0
 def setUp(self):
     self.dm = DistanceMatrix(
         [[0.0, 1.0, 2.0], [1.0, 0.0, 3.0], [2.0, 3.0, 0.0]],
         ['a', 'b', 'c'])
     self.grouping = [1, 2, 1]
     # Ordering of IDs shouldn't matter, nor should extra IDs.
     self.df = pd.read_csv(
         StringIO('ID,Group\nb,Group1\na,Group2\nc,Group1\nd,Group3'),
         index_col=0)
     self.df_missing_id = pd.read_csv(
         StringIO('ID,Group\nb,Group1\nc,Group1'), index_col=0)
     self.categorical_stats = CategoricalStats(self.dm, self.grouping)
     self.categorical_stats_from_df = CategoricalStats(self.dm,
                                                       self.df,
                                                       column='Group')
示例#19
0
def guide_tree_from_query_sequences(query_sequences, 
                                    distance_fn=three_mer_distance,
                                    display_tree = False):
    guide_dm = []
    seq_ids = []
    for seq_id1, seq1 in query_sequences:
        seq_ids.append(seq_id1)
        row = []
        for seq_id2, seq2 in query_sequences:
            row.append(kmer_distance(seq1, seq2, k=3))
        guide_dm.append(row)
    
    guide_dm = DistanceMatrix(guide_dm, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right', 
               link_color_func=lambda x: 'black')
    return guide_tree
示例#20
0
    def test_default_usage(self):
        """Test generating random distance matrices."""
        exp = DistanceMatrix(np.asarray([[0.0]]), ['1'])
        obs = randdm(1)
        self.assertEqual(obs, exp)

        obs = randdm(2)
        self.assertEqual(obs.shape, (2, 2))
        self.assertEqual(obs.ids, ('1', '2'))

        obs1 = randdm(5)
        num_trials = 10
        found_diff = False
        for _ in range(num_trials):
            obs2 = randdm(5)

            if obs1 != obs2:
                found_diff = True
                break

        self.assertTrue(found_diff)
示例#21
0
 def test_nj_trivial(self):
     data = [[0, 3, 2], [3, 0, 3], [2, 3, 0]]
     dm = DistanceMatrix(data, list('abc'))
     expected_str = "(b:2.000000, a:1.000000, c:1.000000);"
     self.assertEqual(nj(dm, result_constructor=str), expected_str)
示例#22
0
文件: test_base.py 项目: yao1314/bipy
 def setUp(self):
     self.dm = DistanceMatrix(
         [[0.0, 1.0, 2.0], [1.0, 0.0, 3.0], [2.0, 3.0, 0.0]],
         ['a', 'b', 'c'])
     self.categorical_stats = CategoricalStats(self.dm, [1, 2, 1])
示例#23
0
def mantel(x, y, method='pearson', permutations=999, alternative='two-sided'):
    """Compute correlation between distance matrices using the Mantel test.

    The Mantel test compares two distance matrices by computing the correlation
    between the distances in the lower (or upper) triangular portions of the
    symmetric distance matrices. Correlation can be computed using Pearson's
    product-moment correlation coefficient or Spearman's rank correlation
    coefficient.

    As defined in [1]_, the Mantel test computes a test statistic :math:`r_M`
    given two symmetric distance matrices :math:`D_X` and :math:`D_Y`.
    :math:`r_M` is defined as

    .. math::

       r_M=\\frac{1}{d-1}\\sum_{i=1}^{n-1}\\sum_{j=i+1}^{n}
       stand(D_X)_{ij}stand(D_Y)_{ij}

    where

    .. math::

       d=\\frac{n(n-1)}{2}

    and :math:`n` is the number of rows/columns in each of the distance
    matrices. :math:`stand(D_X)` and :math:`stand(D_Y)` are distance matrices
    with their upper triangles containing standardized distances. Note that
    since :math:`D_X` and :math:`D_Y` are symmetric, the lower triangular
    portions of the matrices could equivalently have been used instead of the
    upper triangular portions (the current function behaves in this manner).

    If ``method='spearman'``, the above equation operates on ranked distances
    instead of the original distances.

    Statistical significance is assessed via a permutation test. The rows and
    columns of the first distance matrix (`x`) are randomly permuted a
    number of times (controlled via `permutations`). A correlation coefficient
    is computed for each permutation and the p-value is the proportion of
    permuted correlation coefficients that are equal to or more extreme
    than the original (unpermuted) correlation coefficient. Whether a permuted
    correlation coefficient is "more extreme" than the original correlation
    coefficient depends on the alternative hypothesis (controlled via
    `alternative`).

    Parameters
    ----------
    x, y : array_like or DistanceMatrix
        Input distance matrices to compare. Both matrices must have the same
        shape and be at least 3x3 in size. If ``array_like``, will be cast to
        ``DistanceMatrix`` (thus the requirements of a valid ``DistanceMatrix``
        apply to both `x` and `y`, such as symmetry and hollowness). If inputs
        are already ``DistanceMatrix`` instances, the IDs do not need to match
        between them; they are assumed to both be in the same order regardless
        of their IDs (the underlying data matrix is the only thing considered
        by this function).
    method : {'pearson', 'spearman'}
        Method used to compute the correlation between distance matrices.
    permutations : int, optional
        Number of times to randomly permute `x` when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis to use when calculating statistical
        significance. The default ``'two-sided'`` alternative hypothesis
        calculates the proportion of permuted correlation coefficients whose
        magnitude (i.e. after taking the absolute value) is greater than or
        equal to the absolute value of the original correlation coefficient.
        ``'greater'`` calculates the proportion of permuted coefficients that
        are greater than or equal to the original coefficient. ``'less'``
        calculates the proportion of permuted coefficients that are less than
        or equal to the original coefficient.

    Returns
    -------
    tuple of floats
        Correlation coefficient and p-value of the test.

    Raises
    ------
    ValueError
        If `x` and `y` are not the same shape and at least 3x3 in size, or an
        invalid `method`, number of `permutations`, or `alternative` are
        provided.

    See Also
    --------
    DistanceMatrix
    scipy.stats.pearsonr
    scipy.stats.spearmanr

    Notes
    -----
    The Mantel test was first described in [2]_. The general algorithm and
    interface are similar to ``vegan::mantel``, available in R's vegan
    package [3]_.

    ``np.nan`` will be returned for the p-value if `permutations` is zero or if
    the correlation coefficient is ``np.nan``. The correlation coefficient will
    be ``np.nan`` if one or both of the inputs does not have any variation
    (i.e. the distances are all constant) and ``method='spearman'``.

    References
    ----------
    .. [1] Legendre, P. and Legendre, L. (2012) Numerical Ecology. 3rd English
       Edition. Elsevier.

    .. [2] Mantel, N. (1967). "The detection of disease clustering and a
       generalized regression approach". Cancer Research 27 (2): 209-220. PMID
       6018555.

    .. [3] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Define two 3x3 distance matrices:

    >>> x = [[0, 1, 2],
    ...      [1, 0, 3],
    ...      [2, 3, 0]]
    >>> y = [[0, 2, 7],
    ...      [2, 0, 6],
    ...      [7, 6, 0]]

    Compute the Pearson correlation between them and assess significance using
    a two-sided test with 999 permutations:

    >>> coeff, p_value = mantel(x, y)
    >>> round(coeff, 4)
    0.7559

    Thus, we see a moderate-to-strong positive correlation (:math:`r_M=0.7559`)
    between the two matrices.

    """
    if method == 'pearson':
        corr_func = pearsonr
    elif method == 'spearman':
        corr_func = spearmanr
    else:
        raise ValueError("Invalid correlation method '%s'." % method)

    if permutations < 0:
        raise ValueError("Number of permutations must be greater than or "
                         "equal to zero.")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("Invalid alternative hypothesis '%s'." % alternative)

    x = DistanceMatrix(x)
    y = DistanceMatrix(y)

    if x.shape != y.shape:
        raise ValueError("Distance matrices must have the same shape.")
    if x.shape[0] < 3:
        raise ValueError("Distance matrices must be at least 3x3 in size.")

    x_flat = x.condensed_form()
    y_flat = y.condensed_form()

    orig_stat = corr_func(x_flat, y_flat)[0]

    if permutations == 0 or np.isnan(orig_stat):
        p_value = np.nan
    else:
        perm_gen = (corr_func(x.permute(condensed=True), y_flat)[0]
                    for _ in range(permutations))
        permuted_stats = np.fromiter(perm_gen, np.float, count=permutations)

        if alternative == 'two-sided':
            count_better = (np.absolute(permuted_stats) >=
                            np.absolute(orig_stat)).sum()
        elif alternative == 'greater':
            count_better = (permuted_stats >= orig_stat).sum()
        else:
            count_better = (permuted_stats <= orig_stat).sum()

        p_value = (count_better + 1) / (permutations + 1)

    return orig_stat, p_value
示例#24
0
 def setup(self):
     """Sample data set from page 111 of W.J Krzanowski. Principles
     of multivariate analysis, 2000, Oxford University Press."""
     matrix = np.loadtxt(get_data_path('PCoA_sample_data'))
     dist_matrix = DistanceMatrix(matrix, map(str, range(matrix.shape[0])))
     self.dist_matrix = dist_matrix
示例#25
0
 def setup(self):
     matrix = np.loadtxt(get_data_path('PCoA_sample_data_2'))
     self.ids = [str(i) for i in range(matrix.shape[0])]
     dist_matrix = DistanceMatrix(matrix, self.ids)
     self.ordination = PCoA(dist_matrix)
示例#26
0
def run_mantel_test(method,
                    fps,
                    distmats,
                    num_perms,
                    tail_type,
                    comment,
                    control_dm_fp=None,
                    control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment

    if method == 'mantel':
        result += 'DM1\tDM2\tNumber of entries\tMantel r statistic\t' + \
                  'p-value\tNumber of permutations\tTail type\n'
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        result += 'DM1\tDM2\tCDM\tNumber of entries\t' + \
            'Mantel r statistic\tp-value\tNumber of permutations\t' +\
            'Tail type\n'
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    result += '%s\t%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, control_dm_fp, len(dm1_labels))
                    continue
            elif len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = DistanceMatrix(dm1_data, dm1_labels)
            dm2 = DistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                result += "%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, len(dm1_labels), results['r_value'], p_str,
                    num_perms, tail_type)
            elif method == 'partial_mantel':
                cdm = DistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                result += "%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, control_dm_fp, len(dm1_labels),
                    results['mantel_r'], p_str, num_perms, 'greater')
    return result
示例#27
0
def run_mantel_correlogram(fps,
                           distmats,
                           num_perms,
                           comment,
                           alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment + 'DM1\tDM2\tNumber of entries\t' + \
                       'Number of permutations\tClass index\t' + \
                       'Number of distances\tMantel r statistic\t' + \
                       'p-value\tp-value (Bonferroni corrected)\tTail type\n'
    correlogram_fps = []
    correlograms = []

    # Loop over all pairs of dms.
    for i, (fp1, (dm1_labels, dm1_data)) in enumerate(zip(fps, distmats)):
        for fp2, (dm2_labels, dm2_data) in zip(fps, distmats)[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices((dm1_labels, dm1_data),
                                                  (dm2_labels, dm2_data), lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (fp1, fp2,
                                                             len(dm1_labels))
                continue

            dm1 = DistanceMatrix(dm1_data, dm1_labels)
            dm2 = DistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it with
            # the specified number of permutations.
            mc = MantelCorrelogram(
                dm1,
                dm2,
                alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have based
                # on the sign of r.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    result += '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        fp1, fp2, len(dm1_labels), num_perms, class_idx,
                        num_dist, r, p_str, p_corr_str, tail_type)
                    first_time = False
                else:
                    result += '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        class_idx, num_dist, r, p_str, p_corr_str, tail_type)
    return result, correlogram_fps, correlograms
示例#28
0
 def test_nj_error(self):
     data = [[0, 3], [3, 0]]
     dm = DistanceMatrix(data, list('ab'))
     self.assertRaises(ValueError, nj, dm)