def setUp(self):
    """Create the distance matrices and ANOSIM instances used by the tests."""
    # Two groups of equal size; one matrix has ties in the ranks and the
    # other does not.
    sample_ids = ['s1', 's2', 's3', 's4']
    equal_groups = ['Control', 'Control', 'Fast', 'Fast']

    self.dm_ties = DistanceMatrix(
        [[0, 1, 1, 4],
         [1, 0, 3, 2],
         [1, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)
    self.dm_no_ties = DistanceMatrix(
        [[0, 1, 5, 4],
         [1, 0, 3, 2],
         [5, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)

    # Three groups of unequal size. This data also generates a negative R
    # statistic.
    unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                      'Treatment1', 'Control', 'Control']
    self.dm_unequal = DistanceMatrix(
        [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
         [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
         [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
         [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
         [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
         [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
        ['s1', 's2', 's3', 's4', 's5', 's6'])

    self.anosim_ties = ANOSIM(self.dm_ties, equal_groups)
    self.anosim_no_ties = ANOSIM(self.dm_no_ties, equal_groups)
    self.anosim_unequal = ANOSIM(self.dm_unequal, unequal_groups)
def setUp(self):
    """Create the distance matrices and PERMANOVA instances under test."""
    # Two groups of equal size; one matrix has ties in the ranks and the
    # other does not.
    sample_ids = ['s1', 's2', 's3', 's4']
    equal_groups = ['Control', 'Control', 'Fast', 'Fast']

    self.dm_ties = DistanceMatrix(
        [[0, 1, 1, 4],
         [1, 0, 3, 2],
         [1, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)
    self.dm_no_ties = DistanceMatrix(
        [[0, 1, 5, 4],
         [1, 0, 3, 2],
         [5, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)

    # Three groups of unequal size.
    unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                      'Treatment1', 'Control', 'Control']
    self.dm_unequal = DistanceMatrix(
        [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
         [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
         [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
         [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
         [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
         [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
        ['s1', 's2', 's3', 's4', 's5', 's6'])

    self.permanova_ties = PERMANOVA(self.dm_ties, equal_groups)
    self.permanova_no_ties = PERMANOVA(self.dm_no_ties, equal_groups)
    self.permanova_unequal = PERMANOVA(self.dm_unequal, unequal_groups)
def setUp(self):
    """Create PERMANOVA fixtures, including one grouped via a DataFrame."""
    # Two groups of equal size; one matrix has ties in the ranks and the
    # other does not.
    sample_ids = ['s1', 's2', 's3', 's4']
    equal_groups = ['Control', 'Control', 'Fast', 'Fast']

    # Same grouping expressed as a table indexed by sample ID. The rows are
    # in a different order than the matrix IDs and include an ID ('s5') that
    # is not among the four matrix IDs.
    grouping_csv = ('ID,Group\ns2,Control\ns3,Fast\ns4,Fast\ns5,Control\n'
                    's1,Control')
    grouping_df = pd.read_csv(StringIO(grouping_csv), index_col=0)

    self.dm_ties = DistanceMatrix(
        [[0, 1, 1, 4],
         [1, 0, 3, 2],
         [1, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)
    self.dm_no_ties = DistanceMatrix(
        [[0, 1, 5, 4],
         [1, 0, 3, 2],
         [5, 3, 0, 3],
         [4, 2, 3, 0]],
        sample_ids)

    # Three groups of unequal size.
    unequal_groups = ['Control', 'Treatment1', 'Treatment2',
                      'Treatment1', 'Control', 'Control']
    self.dm_unequal = DistanceMatrix(
        [[0.0, 1.0, 0.1, 0.5678, 1.0, 1.0],
         [1.0, 0.0, 0.002, 0.42, 0.998, 0.0],
         [0.1, 0.002, 0.0, 1.0, 0.123, 1.0],
         [0.5678, 0.42, 1.0, 0.0, 0.123, 0.43],
         [1.0, 0.998, 0.123, 0.123, 0.0, 0.5],
         [1.0, 0.0, 1.0, 0.43, 0.5, 0.0]],
        ['s1', 's2', 's3', 's4', 's5', 's6'])

    self.permanova_ties = PERMANOVA(self.dm_ties, equal_groups)
    self.permanova_no_ties = PERMANOVA(self.dm_no_ties, equal_groups)
    self.permanova_ties_df = PERMANOVA(self.dm_ties, grouping_df,
                                       column='Group')
    self.permanova_unequal = PERMANOVA(self.dm_unequal, unequal_groups)
def test_init_invalid_input(self):
    """Constructor rejects invalid distance matrix data / IDs."""
    # An asymmetric matrix is not a valid distance matrix.
    asymmetric = [[0.0, 2.0], [1.0, 0.0]]
    self.assertRaises(DistanceMatrixError, DistanceMatrix, asymmetric,
                      ['a', 'b'])

    # The validation done by the superclass must still be applied.
    self.assertRaises(DissimilarityMatrixError, DistanceMatrix,
                      [[1, 2, 3]], ['a'])
def test_distance_matrix_instances_as_input(self):
    """mantel() accepts DistanceMatrix instances whose IDs differ."""
    # The IDs are deliberately different (default IDs vs. custom ones): the
    # function should only care about the matrix data.
    x_dm = DistanceMatrix(self.minx)
    y_dm = DistanceMatrix(self.miny, ['no', 'cog', 'yay'])

    # Seed immediately before the call so the permutation p-value is
    # reproducible.
    np.random.seed(0)
    coeff, p_value = mantel(x_dm, y_dm, alternative='less')

    self.assertAlmostEqual(coeff, self.exp_x_vs_y)
    self.assertAlmostEqual(p_value, 0.843)
def setUp(self):
    """Build 1x1, 2x2 and 3x3 DistanceMatrix fixtures from the base data."""
    super(DistanceMatrixTests, self).setUp()

    # Matrices in increasing size; self.dm_condensed_forms below follows
    # the same order.
    self.dms = [
        DistanceMatrix(self.dm_1x1_data, ['a']),
        DistanceMatrix(self.dm_2x2_data, ['a', 'b']),
        DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c']),
    ]
    self.dm_1x1, self.dm_2x2, self.dm_3x3 = self.dms

    self.dm_condensed_forms = [
        np.array([]),
        np.array([0.123]),
        np.array([0.01, 4.2, 12.0]),
    ]
def test_compute_q(self):
    """_compute_q returns the expected matrix for 5x5 and 3x3 inputs."""
    # Five-taxon fixture created in setUp.
    q_5x5 = DistanceMatrix(
        [[0, -50, -38, -34, -34],
         [-50, 0, -38, -34, -34],
         [-38, -38, 0, -40, -40],
         [-34, -34, -40, 0, -48],
         [-34, -34, -40, -48, 0]],
        list('abcde'))
    self.assertEqual(_compute_q(self.dm1), q_5x5)

    # Three-taxon case; the expected values were computed manually.
    ids_3x3 = list('abc')
    small_dm = DistanceMatrix([[0, 3, 2],
                               [3, 0, 3],
                               [2, 3, 0]], ids_3x3)
    q_3x3 = DistanceMatrix([[0, -8, -8],
                            [-8, 0, -8],
                            [-8, -8, 0]], list('abc'))
    self.assertEqual(_compute_q(small_dm), q_3x3)
def test_compute_collapsed_dm(self):
    """_compute_collapsed_dm merges two IDs into one, twice in a row."""
    # First collapse: 'a' and 'b' become the new ID 'x'.
    after_first = DistanceMatrix(
        [[0, 7, 7, 6],
         [7, 0, 8, 7],
         [7, 8, 0, 3],
         [6, 7, 3, 0]],
        ['x', 'c', 'd', 'e'])
    obs_first = _compute_collapsed_dm(self.dm1, 'a', 'b', True, 'x')
    self.assertEqual(obs_first, after_first)

    # Second collapse, applied to the result of the first: 'x' and 'c'
    # become 'yy'. Expected values were computed manually.
    after_second = DistanceMatrix(
        [[0, 4, 3],
         [4, 0, 3],
         [3, 3, 0]],
        ['yy', 'd', 'e'])
    obs_second = _compute_collapsed_dm(after_first, 'x', 'c', True, 'yy')
    self.assertEqual(obs_second, after_second)
def test_distances(self):
    """distances() returns the expected pairwise distance matrix."""
    # Off-diagonal entries, named by the pair of IDs they belong to.
    d1_d2 = 6. / 13
    d1_d3 = 4. / 13
    d2_d3 = 7. / 13
    exp = DistanceMatrix([[0, d1_d2, d1_d3],
                          [d1_d2, 0, d2_d3],
                          [d1_d3, d2_d3, 0]],
                         ['d1', 'd2', 'd3'])

    obs = self.a1.distances()
    self.assertEqual(obs, exp)
def compute_aligned_sequence_distances(seqs, distance_fn=hamming_distance):
    """Build a distance matrix from all pairs of aligned sequences.

    Parameters
    ----------
    seqs : iterable of (id, sequence) pairs
        Sequences to compare. The iterable is materialized once, so a
        generator is acceptable.
    distance_fn : callable, optional
        Function called as ``distance_fn(seq1, seq2)`` for every ordered pair
        of sequences, including each sequence with itself. Defaults to
        ``hamming_distance``.

    Returns
    -------
    DistanceMatrix
        Matrix whose IDs are the sequence IDs in input order and whose
        ``[i][j]`` entry is ``distance_fn(seqs[i], seqs[j])``.

    """
    # Materialize so the nested iteration below also works for one-shot
    # iterables such as generators.
    seqs = list(seqs)
    ids = [seq_id for seq_id, _ in seqs]
    # Use the caller-supplied distance function. Previously the parameter
    # was ignored and hamming_distance was always applied.
    dm = [[distance_fn(seq1, seq2) for _, seq2 in seqs] for _, seq1 in seqs]
    return DistanceMatrix(dm, ids)
def test_permute_not_condensed(self):
    """permute() returns a new matrix; 3x3 results are pinned by the seed."""
    # For the 1x1 and 2x2 fixtures the permuted matrix must equal the
    # original while being a distinct object.
    for dm in (self.dm_1x1, self.dm_2x2):
        permuted = dm.permute()
        self.assertEqual(permuted, dm)
        self.assertIsNot(permuted, dm)

    # With a fixed seed, two consecutive permutations of the 3x3 matrix
    # produce these two data layouts, in this order. IDs are unchanged.
    np.random.seed(0)
    expected_layouts = (
        [[0, 12, 4.2], [12, 0, 0.01], [4.2, 0.01, 0]],
        [[0, 4.2, 12], [4.2, 0, 0.01], [12, 0.01, 0]],
    )
    for layout in expected_layouts:
        exp = DistanceMatrix(layout, self.dm_3x3.ids)
        obs = self.dm_3x3.permute()
        self.assertEqual(obs, exp)
def test_random_fn(self):
    """randdm() uses a caller-supplied random function."""
    def constant_42(num_rows, num_cols):
        # Not random at all: every entry is 42.
        return np.ones((num_rows, num_cols)) * 42

    # The expected matrix has the constant off the diagonal and zeros on it.
    exp = DistanceMatrix(
        np.asarray([[0, 42, 42],
                    [42, 0, 42],
                    [42, 42, 0]]),
        ['1', '2', '3'])

    obs = randdm(3, random_fn=constant_42)
    self.assertEqual(obs, exp)
def test_init_from_dm(self):
    """A matrix object can be constructed from another matrix object."""
    new_ids = ['foo', 'bar', 'baz']

    # DissimilarityMatrix -> DissimilarityMatrix
    expected = DissimilarityMatrix(self.dm_3x3_data, new_ids)
    rebuilt = DissimilarityMatrix(self.dm_3x3, new_ids)
    self.assertEqual(rebuilt, expected)

    # The data must be shared rather than copied: same object, and a write
    # through one matrix is visible through the other.
    self.assertIs(rebuilt.data, self.dm_3x3.data)
    rebuilt.data[0, 1] = 424242
    self.assertTrue(np.array_equal(rebuilt.data, self.dm_3x3.data))

    # DistanceMatrix -> DissimilarityMatrix
    expected = DissimilarityMatrix(self.dm_3x3_data, new_ids)
    source_dm = DistanceMatrix(self.dm_3x3_data, ('a', 'b', 'c'))
    rebuilt = DissimilarityMatrix(source_dm, new_ids)
    self.assertEqual(rebuilt, expected)

    # DissimilarityMatrix -> DistanceMatrix: the asymmetric fixture must be
    # rejected.
    with self.assertRaises(DistanceMatrixError):
        DistanceMatrix(self.dm_2x2_asym, ['foo', 'bar'])
def setUp(self):
    """Create three distance matrices with their expected NJ trees.

    For each fixture ``N`` in 1..3 this sets ``self.dmN`` (the input
    ``DistanceMatrix``), ``self.expectedN_str`` (the expected newick string)
    and ``self.expectedN_TreeNode`` (that string parsed by
    ``TreeNode.from_newick``).
    """
    data1 = [[0, 5, 9, 9, 8],
             [5, 0, 10, 10, 9],
             [9, 10, 0, 8, 7],
             [9, 10, 8, 0, 3],
             [8, 9, 7, 3, 0]]
    ids1 = list('abcde')
    self.dm1 = DistanceMatrix(data1, ids1)
    # this newick string was confirmed against http://www.trex.uqam.ca/
    # which generated the following (isomorphic) newick string:
    # (d:2.0000,e:1.0000,(c:4.0000,(a:2.0000,b:3.0000):3.0000):2.0000);
    self.expected1_str = ("(d:2.000000, (c:4.000000, (b:3.000000,"
                          " a:2.000000):3.000000):2.000000, e:1.000000);")
    self.expected1_TreeNode = TreeNode.from_newick(self.expected1_str)

    # this example was pulled from the Phylip manual
    # http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
    data2 = [[0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905],
             [1.6866, 0.0000, 1.5232, 1.4841, 1.4465, 1.4389, 1.4629],
             [1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583],
             [1.6606, 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710],
             [1.5243, 1.4465, 0.5958, 0.4631, 0.0000, 0.3484, 0.3083],
             [1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692],
             [1.5905, 1.4629, 0.5583, 0.4710, 0.3083, 0.2692, 0.0000]]
    ids2 = ["Bovine", "Mouse", "Gibbon", "Orang", "Gorilla", "Chimp",
            "Human"]
    self.dm2 = DistanceMatrix(data2, ids2)
    self.expected2_str = ("(Mouse:0.76891, (Gibbon:0.35793, (Orang:0.28469"
                          ", (Gorilla:0.15393, (Chimp:0.15167, Human:0.117"
                          "53):0.03982):0.02696):0.04648):0.42027, Bovine:"
                          "0.91769);")
    self.expected2_TreeNode = TreeNode.from_newick(self.expected2_str)

    data3 = [[0, 5, 4, 7, 6, 8],
             [5, 0, 7, 10, 9, 11],
             [4, 7, 0, 7, 6, 8],
             [7, 10, 7, 0, 5, 8],
             [6, 9, 6, 5, 0, 8],
             [8, 11, 8, 8, 8, 0]]
    # Build a concrete list of IDs: on Python 3 ``map`` returns a one-shot
    # iterator, which is exhausted after a single pass.
    ids3 = [str(i) for i in range(6)]
    self.dm3 = DistanceMatrix(data3, ids3)
    self.expected3_str = ("((((0:1.000000,1:4.000000):1.000000,2:2.000000"
                          "):1.250000,5:4.750000):0.750000,3:2.750000,4:2."
                          "250000);")
    self.expected3_TreeNode = TreeNode.from_newick(self.expected3_str)
def test_tip_tip_distances_endpoints(self):
    """tip_tip_distances accepts endpoints given as names or as nodes."""
    tree = TreeNode.from_newick('((H:1,G:1):2,(R:0.5,M:0.7):3);')
    tip_names = ['H', 'G', 'M']
    tip_nodes = [tree.find('H'), tree.find('G'), tree.find('M')]

    # Tip 'R' is left out, so only H, G and M appear in the result.
    exp = DistanceMatrix(
        np.array([[0, 2.0, 6.7],
                  [2.0, 0, 6.7],
                  [6.7, 6.7, 0.0]]),
        ['H', 'G', 'M'])

    # Both ways of naming the endpoints must give the same matrix.
    self.assertEqual(tree.tip_tip_distances(endpoints=tip_names), exp)
    self.assertEqual(tree.tip_tip_distances(endpoints=tip_nodes), exp)
def get_clusters(x_original, axis='row'):
    """Order rows (or columns) by UPGMA clustering on euclidean distances.

    Parameters
    ----------
    x_original : array-like with ``copy``, ``T`` and ``shape``
        Data table to cluster. It is copied first, so the caller's object is
        not modified here.
    axis : {'row', 'column'}, optional
        Cluster rows (default) or columns. Any value other than ``'column'``
        is treated as ``'row'``.

    Returns
    -------
    list of int
        Indices of the clustered axis, in the order the tips appear in the
        clustering tree.

    """
    x = x_original.copy()
    if axis == 'column':
        # Clustering columns is clustering the rows of the transpose.
        x = x.T
    nr = x.shape[0]

    metric_f = get_nonphylogenetic_metric('euclidean')
    # IDs are the stringified row indices. A real list is built because on
    # Python 3 ``map`` would return a one-shot iterator.
    row_ids = [str(i) for i in range(nr)]
    row_dissims = DistanceMatrix(metric_f(x), row_ids)

    # do upgma - rows
    # Average in SciPy's cluster.hierarchy.linkage is UPGMA
    linkage_matrix = linkage(row_dissims.condensed_form(), method='average')
    tree = TreeNode.from_linkage_matrix(linkage_matrix, row_dissims.ids)

    # Tip names are the stringified indices assigned above; convert back.
    return [int(tip.name) for tip in tree.tips()]
def distances(self):
    """Compute distances between all pairs of sequences

    Returns
    -------
    skbio.core.distance.DistanceMatrix
        Matrix containing the distances between all pairs of sequences.

    Raises
    ------
    skbio.core.exception.BiologicalSequenceError
        If two of the sequences being compared differ in length
        (NOTE(review): presumably raised by the per-sequence ``distance``
        call used below -- confirm against the sequence class).

    See Also
    --------
    skbio.core.distance.DistanceMatrix
    scipy.spatial.distance.hamming

    Notes
    -----
    Distances between sequences are computed as hamming distances, though
    this will be generalized (see #194).

    Examples
    --------
    >>> from skbio.core.alignment import Alignment
    >>> from skbio.core.sequence import DNA
    >>> seqs = [DNA("A-CCGGG", identifier="s1"),
    ...         DNA("ATCC--G", identifier="s2"),
    ...         DNA("ATCCGGA", identifier="s3")]
    >>> a1 = Alignment(seqs)
    >>> print(a1.distances())
    3x3 distance matrix
    IDs:
    s1, s2, s3
    Data:
    [[ 0.          0.42857143  0.28571429]
     [ 0.42857143  0.          0.42857143]
     [ 0.28571429  0.42857143  0.        ]]

    """
    sequence_count = self.sequence_count()
    dm = np.zeros((sequence_count, sequence_count))
    identifiers = []
    # ``range`` rather than ``xrange`` so this also runs on Python 3.
    for i in range(sequence_count):
        self_i = self[i]
        identifiers.append(self_i.identifier)
        # Only the lower triangle is computed; each value is mirrored into
        # the upper triangle. The diagonal keeps its initial zeros.
        for j in range(i):
            dm[i, j] = dm[j, i] = self_i.distance(self[j])
    return DistanceMatrix(dm, identifiers)
def setUp(self):
    """Create CategoricalStats fixtures from a list and from a DataFrame."""
    self.dm = DistanceMatrix(
        [[0.0, 1.0, 2.0],
         [1.0, 0.0, 3.0],
         [2.0, 3.0, 0.0]],
        ['a', 'b', 'c'])
    self.grouping = [1, 2, 1]

    # The table lists the IDs in a different order than the matrix and
    # contains an extra ID ('d'); neither should matter.
    full_csv = 'ID,Group\nb,Group1\na,Group2\nc,Group1\nd,Group3'
    self.df = pd.read_csv(StringIO(full_csv), index_col=0)

    # This table lacks an entry for matrix ID 'a'.
    missing_csv = 'ID,Group\nb,Group1\nc,Group1'
    self.df_missing_id = pd.read_csv(StringIO(missing_csv), index_col=0)

    self.categorical_stats = CategoricalStats(self.dm, self.grouping)
    self.categorical_stats_from_df = CategoricalStats(
        self.dm, self.df, column='Group')
def guide_tree_from_query_sequences(query_sequences,
                                    distance_fn=three_mer_distance,
                                    display_tree=False):
    """Build a guide tree by average-linkage clustering of the sequences.

    Parameters
    ----------
    query_sequences : iterable of (seq_id, sequence) pairs
        Sequences to cluster. The iterable is materialized once, so a
        generator is acceptable.
    distance_fn : callable, optional
        Function called as ``distance_fn(seq1, seq2)`` for every ordered pair
        of sequences. Defaults to ``three_mer_distance``.
    display_tree : bool, optional
        If True, also draw a dendrogram of the clustering, labelled with the
        sequence IDs.

    Returns
    -------
    object
        The tree returned by ``to_tree`` for the average-linkage matrix
        (NOTE(review): presumably a SciPy ``ClusterNode`` -- confirm the
        origin of ``average`` / ``to_tree``).

    """
    # Materialize so the nested iteration below also works for one-shot
    # iterables such as generators.
    query_sequences = list(query_sequences)
    seq_ids = [seq_id for seq_id, _ in query_sequences]
    # Use the caller-supplied distance function. Previously the parameter
    # was ignored and ``kmer_distance(..., k=3)`` was always applied.
    pairwise = [[distance_fn(seq1, seq2) for _, seq2 in query_sequences]
                for _, seq1 in query_sequences]

    guide_dm = DistanceMatrix(pairwise, seq_ids)
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        # Called only for its plotting side effect; the return value is not
        # needed.
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
def test_default_usage(self):
    """randdm() with default arguments gives valid, varying matrices."""
    # A 1x1 distance matrix can only contain a single zero.
    trivial = DistanceMatrix(np.asarray([[0.0]]), ['1'])
    self.assertEqual(randdm(1), trivial)

    # Default IDs are the 1-based positions as strings.
    two_by_two = randdm(2)
    self.assertEqual(two_by_two.shape, (2, 2))
    self.assertEqual(two_by_two.ids, ('1', '2'))

    # Repeated calls should not always return the same matrix: at least one
    # of up to ten fresh draws must differ from the first. any() stops at
    # the first difference.
    baseline = randdm(5)
    num_trials = 10
    found_diff = any(baseline != randdm(5) for _ in range(num_trials))
    self.assertTrue(found_diff)
def test_nj_trivial(self):
    """nj() on a 3x3 matrix returns the expected newick string."""
    three_taxa = DistanceMatrix([[0, 3, 2],
                                 [3, 0, 3],
                                 [2, 3, 0]], list('abc'))
    # Ask for the result as a plain string so it can be compared directly.
    obs = nj(three_taxa, result_constructor=str)
    self.assertEqual(obs, "(b:2.000000, a:1.000000, c:1.000000);")
def setUp(self):
    """Create a 3x3 distance matrix and a CategoricalStats built on it."""
    ids = ['a', 'b', 'c']
    data = [[0.0, 1.0, 2.0],
            [1.0, 0.0, 3.0],
            [2.0, 3.0, 0.0]]
    self.dm = DistanceMatrix(data, ids)

    # One grouping value per ID, in ID order.
    self.categorical_stats = CategoricalStats(self.dm, [1, 2, 1])
def mantel(x, y, method='pearson', permutations=999, alternative='two-sided'):
    """Compute correlation between distance matrices using the Mantel test.

    The Mantel test compares two distance matrices by computing the correlation
    between the distances in the lower (or upper) triangular portions of the
    symmetric distance matrices. Correlation can be computed using Pearson's
    product-moment correlation coefficient or Spearman's rank correlation
    coefficient.

    As defined in [1]_, the Mantel test computes a test statistic :math:`r_M`
    given two symmetric distance matrices :math:`D_X` and :math:`D_Y`.
    :math:`r_M` is defined as

    .. math::

       r_M=\\frac{1}{d-1}\\sum_{i=1}^{n-1}\\sum_{j=i+1}^{n}
       stand(D_X)_{ij}stand(D_Y)_{ij}

    where

    .. math::

       d=\\frac{n(n-1)}{2}

    and :math:`n` is the number of rows/columns in each of the distance
    matrices. :math:`stand(D_X)` and :math:`stand(D_Y)` are distance matrices
    with their upper triangles containing standardized distances. Note that
    since :math:`D_X` and :math:`D_Y` are symmetric, the lower triangular
    portions of the matrices could equivalently have been used instead of the
    upper triangular portions (the current function behaves in this manner).

    If ``method='spearman'``, the above equation operates on ranked distances
    instead of the original distances.

    Statistical significance is assessed via a permutation test. The rows and
    columns of the first distance matrix (`x`) are randomly permuted a
    number of times (controlled via `permutations`). A correlation coefficient
    is computed for each permutation and the p-value is the proportion of
    permuted correlation coefficients that are equal to or more extreme
    than the original (unpermuted) correlation coefficient. Whether a permuted
    correlation coefficient is "more extreme" than the original correlation
    coefficient depends on the alternative hypothesis (controlled via
    `alternative`).

    Parameters
    ----------
    x, y : array_like or DistanceMatrix
        Input distance matrices to compare. Both matrices must have the same
        shape and be at least 3x3 in size. If ``array_like``, will be cast to
        ``DistanceMatrix`` (thus the requirements of a valid ``DistanceMatrix``
        apply to both `x` and `y`, such as symmetry and hollowness). If inputs
        are already ``DistanceMatrix`` instances, the IDs do not need to match
        between them; they are assumed to both be in the same order regardless
        of their IDs (the underlying data matrix is the only thing considered
        by this function).
    method : {'pearson', 'spearman'}
        Method used to compute the correlation between distance matrices.
    permutations : int, optional
        Number of times to randomly permute `x` when assessing statistical
        significance. Must be greater than or equal to zero. If zero,
        statistical significance calculations will be skipped and the p-value
        will be ``np.nan``.
    alternative : {'two-sided', 'greater', 'less'}
        Alternative hypothesis to use when calculating statistical
        significance. The default ``'two-sided'`` alternative hypothesis
        calculates the proportion of permuted correlation coefficients whose
        magnitude (i.e. after taking the absolute value) is greater than or
        equal to the absolute value of the original correlation coefficient.
        ``'greater'`` calculates the proportion of permuted coefficients that
        are greater than or equal to the original coefficient. ``'less'``
        calculates the proportion of permuted coefficients that are less than
        or equal to the original coefficient.

    Returns
    -------
    tuple of floats
        Correlation coefficient and p-value of the test.

    Raises
    ------
    ValueError
        If `x` and `y` are not the same shape and at least 3x3 in size, or an
        invalid `method`, number of `permutations`, or `alternative` are
        provided.

    See Also
    --------
    DistanceMatrix
    scipy.stats.pearsonr
    scipy.stats.spearmanr

    Notes
    -----
    The Mantel test was first described in [2]_. The general algorithm and
    interface are similar to ``vegan::mantel``, available in R's vegan
    package [3]_.

    ``np.nan`` will be returned for the p-value if `permutations` is zero or if
    the correlation coefficient is ``np.nan``. The correlation coefficient will
    be ``np.nan`` if one or both of the inputs does not have any variation
    (i.e. the distances are all constant) and ``method='spearman'``.

    References
    ----------
    .. [1] Legendre, P. and Legendre, L. (2012) Numerical Ecology. 3rd English
       Edition. Elsevier.

    .. [2] Mantel, N. (1967). "The detection of disease clustering and a
       generalized regression approach". Cancer Research 27 (2): 209-220. PMID
       6018555.

    .. [3] http://cran.r-project.org/web/packages/vegan/index.html

    Examples
    --------
    Define two 3x3 distance matrices:

    >>> x = [[0, 1, 2],
    ...      [1, 0, 3],
    ...      [2, 3, 0]]
    >>> y = [[0, 2, 7],
    ...      [2, 0, 6],
    ...      [7, 6, 0]]

    Compute the Pearson correlation between them and assess significance using
    a two-sided test with 999 permutations:

    >>> coeff, p_value = mantel(x, y)
    >>> round(coeff, 4)
    0.7559

    Thus, we see a moderate-to-strong positive correlation (:math:`r_M=0.7559`)
    between the two matrices.

    """
    # Validate the plain arguments before doing any matrix work.
    if method == 'pearson':
        corr_func = pearsonr
    elif method == 'spearman':
        corr_func = spearmanr
    else:
        raise ValueError("Invalid correlation method '%s'." % method)

    if permutations < 0:
        raise ValueError("Number of permutations must be greater than or "
                         "equal to zero.")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("Invalid alternative hypothesis '%s'." % alternative)

    # Casting validates array_like input and is also applied to inputs that
    # are already DistanceMatrix instances.
    x = DistanceMatrix(x)
    y = DistanceMatrix(y)

    if x.shape != y.shape:
        raise ValueError("Distance matrices must have the same shape.")
    if x.shape[0] < 3:
        raise ValueError("Distance matrices must be at least 3x3 in size.")

    x_flat = x.condensed_form()
    y_flat = y.condensed_form()

    # Both correlation functions return a tuple; element 0 is the statistic.
    orig_stat = corr_func(x_flat, y_flat)[0]

    if permutations == 0 or np.isnan(orig_stat):
        p_value = np.nan
    else:
        # Only x is permuted; y's condensed form is reused for every draw.
        perm_gen = (corr_func(x.permute(condensed=True), y_flat)[0]
                    for _ in range(permutations))
        # The builtin ``float`` is the dtype the removed ``np.float`` alias
        # referred to.
        permuted_stats = np.fromiter(perm_gen, float, count=permutations)

        if alternative == 'two-sided':
            count_better = (np.absolute(permuted_stats) >=
                            np.absolute(orig_stat)).sum()
        elif alternative == 'greater':
            count_better = (permuted_stats >= orig_stat).sum()
        else:
            count_better = (permuted_stats <= orig_stat).sum()

        # The +1 in numerator and denominator counts the unpermuted statistic
        # as one of the outcomes. NOTE: this relies on true division (Python
        # 3, or ``from __future__ import division`` on Python 2).
        p_value = (count_better + 1) / (permutations + 1)

    return orig_stat, p_value
def setup(self):
    """Sample data set from page 111 of W.J Krzanowski. Principles
    of multivariate analysis, 2000, Oxford University Press."""
    matrix = np.loadtxt(get_data_path('PCoA_sample_data'))
    # IDs are the stringified row indices. A real list is built because on
    # Python 3 ``map`` would return a one-shot iterator.
    ids = [str(i) for i in range(matrix.shape[0])]
    dist_matrix = DistanceMatrix(matrix, ids)
    self.dist_matrix = dist_matrix
def setup(self):
    """Load the second PCoA sample matrix and build its PCoA object."""
    raw = np.loadtxt(get_data_path('PCoA_sample_data_2'))
    # IDs are the stringified row indices: '0', '1', ...
    num_rows = raw.shape[0]
    self.ids = list(map(str, range(num_rows)))
    self.ordination = PCoA(DistanceMatrix(raw, self.ids))
def run_mantel_test(method, fps, distmats, num_perms, tail_type, comment,
                    control_dm_fp=None, control_dm=None,
                    sample_id_map=None):
    """Runs a Mantel test on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        method - which Mantel test to run (either 'mantel' or 'partial_mantel')
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        tail_type - the type of tail test to use when calculating the
            p-value(s). Can be 'two sided', 'greater', or 'less'. Only applies
            when method is mantel
        comment - comment string to add to the beginning of the results string
        control_dm_fp - filepath of the control distance matrix. Only applies
            when method is partial_mantel (it is required then)
        control_dm - tuple containing control distance matrix labels and matrix
            data. Only applies when method is partial_mantel (it is required
            then)
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)

    Raises ValueError if fps and distmats differ in length, if method is not
    one of the two supported values, or if method is partial_mantel and the
    control matrix or its filepath is missing.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment

    if method == 'mantel':
        result += 'DM1\tDM2\tNumber of entries\tMantel r statistic\t' + \
                  'p-value\tNumber of permutations\tTail type\n'
    elif method == 'partial_mantel':
        if not control_dm_fp or not control_dm:
            raise ValueError("You must provide a control matrix filepath and "
                             "control matrix when running the partial Mantel "
                             "test.")
        result += 'DM1\tDM2\tCDM\tNumber of entries\t' + \
            'Mantel r statistic\tp-value\tNumber of permutations\t' +\
            'Tail type\n'
    else:
        raise ValueError("Invalid method '%s'. Must be either 'mantel' or "
                         "'partial_mantel'." % method)

    # Pair every filepath with its matrix once. A real list is needed so it
    # can be sliced (on Python 3 ``zip`` returns an iterator).
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (orig1_labels, orig1_data)) in enumerate(named_dms):
        for fp2, (orig2_labels, orig2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. Every pair starts from the unfiltered
            # first matrix: reusing the outer loop's names here would let
            # the filtering done for one pair leak into the next one.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (orig1_labels, orig1_data),
                    (orig2_labels, orig2_data),
                    lookup=sample_id_map)
            if method == 'partial_mantel':
                # We need to intersect three sets (three matrices).
                (dm1_labels, dm1_data), (cdm_labels, cdm_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), control_dm,
                        lookup=sample_id_map)
                (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                    make_compatible_distance_matrices(
                        (dm1_labels, dm1_data), (dm2_labels, dm2_data),
                        lookup=sample_id_map)
                if len(dm1_labels) < 3:
                    result += '%s\t%s\t%s\t%d\tToo few samples\n' % (
                        fp1, fp2, control_dm_fp, len(dm1_labels))
                    continue
            elif len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (
                    fp1, fp2, len(dm1_labels))
                continue

            dm1 = DistanceMatrix(dm1_data, dm1_labels)
            dm2 = DistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our correlation test and run it with
            # the specified number of permutations.
            if method == 'mantel':
                results = Mantel(dm1, dm2, tail_type)(num_perms)
                p_str = format_p_value_for_num_iters(results['p_value'],
                                                     num_perms)
                result += "%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, len(dm1_labels), results['r_value'], p_str,
                    num_perms, tail_type)
            elif method == 'partial_mantel':
                cdm = DistanceMatrix(cdm_data, cdm_labels)
                results = PartialMantel(dm1, dm2, cdm)(num_perms)
                p_str = format_p_value_for_num_iters(results['mantel_p'],
                                                     num_perms)
                # The tail type column is always reported as 'greater' for
                # the partial test; tail_type is not used here.
                result += "%s\t%s\t%s\t%d\t%.5f\t%s\t%d\t%s\n" % (
                    fp1, fp2, control_dm_fp, len(dm1_labels),
                    results['mantel_r'], p_str, num_perms, 'greater')
    return result
def run_mantel_correlogram(fps, distmats, num_perms, comment, alpha,
                           sample_id_map=None,
                           variable_size_distance_classes=False):
    """Runs a Mantel correlogram analysis on all pairs of distance matrices.

    Returns a string suitable for writing out to a file containing the results
    of the test, a list of correlogram filepath names, and a list of matplotlib
    Figure objects representing each correlogram.

    The correlogram filepaths can have an extension string appended to the end
    of them and then be used to save each of the correlogram Figures to a file.
    Each correlogram filepath will be a combination of the two distance matrix
    filepaths that were used to create it.

    WARNING: Only symmetric, hollow distance matrices may be used as input.
    Asymmetric distance matrices, such as those obtained by the UniFrac Gain
    metric (i.e. beta_diversity.py -m unifrac_g), should not be used as input.

    Arguments:
        fps - list of filepaths of the distance matrices
        distmats - list of tuples containing dm labels and dm data (i.e. the
            output of parse_distmat)
        num_perms - the number of permutations to use to calculate the
            p-value(s)
        comment - comment string to add to the beginning of the results string
        alpha - the alpha value to use to determine significance in the
            correlogram plots
        sample_id_map - dict mapping sample IDs (i.e. what is expected by
            make_compatible_distance_matrices)
        variable_size_distance_classes - create distance classes that vary in
            size (i.e. width) but have the same number of distances in each
            class

    Raises ValueError if fps and distmats differ in length.
    """
    if len(fps) != len(distmats):
        raise ValueError("Must provide the same number of filepaths as there "
                         "are distance matrices.")
    if comment is None:
        comment = ''
    result = comment + 'DM1\tDM2\tNumber of entries\t' + \
                       'Number of permutations\tClass index\t' + \
                       'Number of distances\tMantel r statistic\t' + \
                       'p-value\tp-value (Bonferroni corrected)\tTail type\n'

    correlogram_fps = []
    correlograms = []

    # Pair every filepath with its matrix once. A real list is needed so it
    # can be sliced (on Python 3 ``zip`` returns an iterator).
    named_dms = list(zip(fps, distmats))

    # Loop over all pairs of dms.
    for i, (fp1, (orig1_labels, orig1_data)) in enumerate(named_dms):
        for fp2, (orig2_labels, orig2_data) in named_dms[i + 1:]:
            # Make the current pair of distance matrices compatible by only
            # keeping samples that match between them, and ordering them by
            # the same sample IDs. Every pair starts from the unfiltered
            # first matrix: reusing the outer loop's names here would let
            # the filtering done for one pair leak into the next one.
            (dm1_labels, dm1_data), (dm2_labels, dm2_data) = \
                make_compatible_distance_matrices(
                    (orig1_labels, orig1_data),
                    (orig2_labels, orig2_data),
                    lookup=sample_id_map)
            if len(dm1_labels) < 3:
                result += '%s\t%s\t%d\tToo few samples\n' % (
                    fp1, fp2, len(dm1_labels))
                continue

            dm1 = DistanceMatrix(dm1_data, dm1_labels)
            dm2 = DistanceMatrix(dm2_data, dm2_labels)

            # Create an instance of our Mantel correlogram test and run it
            # with the specified number of permutations.
            mc = MantelCorrelogram(
                dm1, dm2, alpha=alpha,
                variable_size_distance_classes=variable_size_distance_classes)
            results = mc(num_perms)

            # Generate a name for the current correlogram and save it and the
            # correlogram itself. The name ends with '.' so that callers only
            # have to append an extension.
            dm1_name = path.basename(fp1)
            dm2_name = path.basename(fp2)
            correlogram_fps.append('_'.join((dm1_name, 'AND', dm2_name,
                                             'mantel_correlogram')) + '.')
            correlograms.append(results['correlogram_plot'])

            # Iterate over the results and write them to the text file.
            first_time = True
            for class_idx, num_dist, r, p, p_corr in zip(
                    results['class_index'], results['num_dist'],
                    results['mantel_r'], results['mantel_p'],
                    results['mantel_p_corr']):
                # Format p-values and figure out which tail type we have
                # based on the sign of r. Missing values stay None and are
                # written as the string 'None' by the %s formatting below.
                p_str = None
                if p is not None:
                    p_str = format_p_value_for_num_iters(p, num_perms)
                p_corr_str = None
                if p_corr is not None:
                    p_corr_str = format_p_value_for_num_iters(
                        p_corr, num_perms)
                if r is None:
                    tail_type = None
                elif r < 0:
                    tail_type = 'less'
                else:
                    tail_type = 'greater'

                if first_time:
                    # Only the first row of a pair carries the filepaths,
                    # entry count and permutation count.
                    result += '%s\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        fp1, fp2, len(dm1_labels), num_perms, class_idx,
                        num_dist, r, p_str, p_corr_str, tail_type)
                    first_time = False
                else:
                    result += '\t\t\t\t%s\t%d\t%s\t%s\t%s\t%s\n' % (
                        class_idx, num_dist, r, p_str, p_corr_str, tail_type)
    return result, correlogram_fps, correlograms
def test_nj_error(self):
    """nj() raises ValueError for a 2x2 distance matrix."""
    two_taxa = DistanceMatrix([[0, 3],
                               [3, 0]], list('ab'))
    with self.assertRaises(ValueError):
        nj(two_taxa)