def test_from_iterable_validate_equal_valid_data(self):
    """Both validate settings must build equal matrices for a valid metric."""
    def absolute_difference(a, b):
        return abs(b - a)

    with_validation = DistanceMatrix.from_iterable(
        (x for x in range(4)), absolute_difference, validate=True)
    without_validation = DistanceMatrix.from_iterable(
        (x for x in range(4)), absolute_difference, validate=False)

    self.assertEqual(with_validation, without_validation)
def test_from_iterable_validate_false_non_symmetric(self):
    """Build from the metric ``a - b`` with validation switched off.

    The result is compared against a symmetric matrix of absolute
    differences.
    """
    expected = DistanceMatrix([[0, 1, 2, 3],
                               [1, 0, 1, 2],
                               [2, 1, 0, 1],
                               [3, 2, 1, 0]])
    values = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(values, lambda a, b: a - b,
                                            validate=False)
    self.assertEqual(observed, expected)
def test_from_iterable_no_key(self):
    """Build a matrix from a generator without supplying key or keys."""
    values = (x for x in range(4))
    expected = DistanceMatrix([[0, 1, 2, 3],
                               [1, 0, 1, 2],
                               [2, 1, 0, 1],
                               [3, 2, 1, 0]])

    observed = DistanceMatrix.from_iterable(values,
                                            lambda a, b: abs(b - a))

    self.assertEqual(observed, expected)
def test_init_invalid_input(self):
    """Invalid input must be rejected by the constructor."""
    # An asymmetric matrix raises the DistanceMatrix-specific error.
    asymmetric = [[0.0, 2.0],
                  [1.0, 0.0]]
    with self.assertRaises(DistanceMatrixError):
        DistanceMatrix(asymmetric, ['a', 'b'])

    # Ensure that the superclass validation is still being performed.
    with self.assertRaises(DissimilarityMatrixError):
        DistanceMatrix([[1, 2, 3]], ['a'])
def setUp(self):
    """Create DistanceMatrix fixtures from the data set up by the parent."""
    super(DistanceMatrixTests, self).setUp()

    self.dm_1x1 = DistanceMatrix(self.dm_1x1_data, ['a'])
    self.dm_2x2 = DistanceMatrix(self.dm_2x2_data, ['a', 'b'])
    self.dm_3x3 = DistanceMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
    self.dms = [self.dm_1x1, self.dm_2x2, self.dm_3x3]

    # Condensed forms, listed in the same order as self.dms.
    self.dm_condensed_forms = [
        np.array([]),
        np.array([0.123]),
        np.array([0.01, 4.2, 12.0]),
    ]
def test_from_file_with_file_path(self):
    """Should identify the filepath correctly and parse from it."""
    # A malformed file must fail with the format-specific exception.
    with self.assertRaises(DissimilarityMatrixFormatError):
        DistanceMatrix.from_file(self.bad_dm_fp)

    obs = DistanceMatrix.from_file(self.dm_3x3_fp)
    self.assertEqual(self.dm_3x3, obs)
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(obs, DistanceMatrix)
def test_from_iterable_with_keys(self):
    """IDs supplied through ``keys`` must label the resulting matrix."""
    labels = ['0', '1', '4', '9']
    expected = DistanceMatrix([[0, 1, 2, 3],
                               [1, 0, 1, 2],
                               [2, 1, 0, 1],
                               [3, 2, 1, 0]],
                              labels)

    values = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(
        values, lambda a, b: abs(b - a), keys=iter(['0', '1', '4', '9']))

    self.assertEqual(observed, expected)
def test_to_series_4x4(self):
    """Check ``to_series`` on a 4x4 matrix whose c/d distance is zero.

    The expected index lists both orientations of each pair and contains
    neither the diagonal nor the ('c', 'd') / ('d', 'c') entries.
    """
    dm = DistanceMatrix(
        [[0, 0.25, 0.75, 0.75],
         [0.25, 0.0, 0.5, 0.5],
         [0.75, 0.5, 0.0, 0.0],
         [0.75, 0.5, 0.0, 0.0]],
        ['a', 'b', 'c', 'd'])

    expected_values = [0.25, 0.75, 0.75, 0.25, 0.5,
                       0.5, 0.75, 0.5, 0.75, 0.5]
    expected_pairs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'a'),
                      ('b', 'c'), ('b', 'd'), ('c', 'a'), ('c', 'b'),
                      ('d', 'a'), ('d', 'b')]
    exp = pd.Series(expected_values, index=expected_pairs)

    series = dm.to_series()
    assert_series_almost_equal(series, exp)
def test_to_series_4x4(self):
    """Check ``to_series`` against one entry per pair of distinct IDs."""
    dm = DistanceMatrix(
        [[0.0, 0.2, 0.3, 0.4],
         [0.2, 0.0, 0.5, 0.6],
         [0.3, 0.5, 0.0, 0.7],
         [0.4, 0.6, 0.7, 0.0]],
        ['a', 'b', 'c', 'd'])

    expected_index = pd.Index([('a', 'b'), ('a', 'c'), ('a', 'd'),
                               ('b', 'c'), ('b', 'd'), ('c', 'd')])
    exp = pd.Series([0.2, 0.3, 0.4, 0.5, 0.6, 0.7], index=expected_index)

    series = dm.to_series()
    assert_series_almost_equal(series, exp)
def setUp(self):
    """Create four distance matrices and the expected trees for three."""
    # Matrix 1.
    self.dm1 = DistanceMatrix(
        [[0, 5, 9, 9, 8],
         [5, 0, 10, 10, 9],
         [9, 10, 0, 8, 7],
         [9, 10, 8, 0, 3],
         [8, 9, 7, 3, 0]],
        list('abcde'))
    # this newick string was confirmed against http://www.trex.uqam.ca/
    # which generated the following (isomorphic) newick string:
    # (d:2.0000,e:1.0000,(c:4.0000,(a:2.0000,b:3.0000):3.0000):2.0000);
    self.expected1_str = ("(d:2.000000, (c:4.000000, (b:3.000000,"
                          " a:2.000000):3.000000):2.000000, e:1.000000);")
    self.expected1_TreeNode = TreeNode.read(StringIO(self.expected1_str))

    # Matrix 2.
    # this example was pulled from the Phylip manual
    # http://evolution.genetics.washington.edu/phylip/doc/neighbor.html
    phylip_rows = [
        [0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905],
        [1.6866, 0.0000, 1.5232, 1.4841, 1.4465, 1.4389, 1.4629],
        [1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583],
        [1.6606, 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710],
        [1.5243, 1.4465, 0.5958, 0.4631, 0.0000, 0.3484, 0.3083],
        [1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692],
        [1.5905, 1.4629, 0.5583, 0.4710, 0.3083, 0.2692, 0.0000],
    ]
    phylip_ids = ["Bovine", "Mouse", "Gibbon", "Orang", "Gorilla",
                  "Chimp", "Human"]
    self.dm2 = DistanceMatrix(phylip_rows, phylip_ids)
    self.expected2_str = ("(Mouse:0.76891, (Gibbon:0.35793, (Orang:0.28469"
                          ", (Gorilla:0.15393, (Chimp:0.15167, Human:0.117"
                          "53):0.03982):0.02696):0.04648):0.42027, Bovine:"
                          "0.91769);")
    self.expected2_TreeNode = TreeNode.read(StringIO(self.expected2_str))

    # Matrix 3, with the IDs '0'..'5'.
    numbered_rows = [
        [0, 5, 4, 7, 6, 8],
        [5, 0, 7, 10, 9, 11],
        [4, 7, 0, 7, 6, 8],
        [7, 10, 7, 0, 5, 8],
        [6, 9, 6, 5, 0, 8],
        [8, 11, 8, 8, 8, 0],
    ]
    numbered_ids = map(str, range(6))
    self.dm3 = DistanceMatrix(numbered_rows, numbered_ids)
    self.expected3_str = ("((((0:1.000000,1:4.000000):1.000000,2:2.000000"
                          "):1.250000,5:4.750000):0.750000,3:2.750000,4:2."
                          "250000);")
    self.expected3_TreeNode = TreeNode.read(StringIO(self.expected3_str))

    # Matrix 4.
    # this dm can yield negative branch lengths
    self.dm4 = DistanceMatrix(
        [[0, 5, 9, 9, 800],
         [5, 0, 10, 10, 9],
         [9, 10, 0, 8, 7],
         [9, 10, 8, 0, 3],
         [800, 9, 7, 3, 0]],
        list('abcde'))
def progressive_msa_and_tree(sequences,
                             pairwise_aligner,
                             metric=kmer_distance,
                             guide_tree=None,
                             display_aln=False,
                             display_tree=False):
    """ Perform progressive msa of sequences and build a UPGMA tree

    Parameters
    ----------
    sequences : skbio.SequenceCollection
        The sequences to be aligned.
    pairwise_aligner : function
        Function that should be used to perform the pairwise alignments,
        for example skbio.alignment.global_pairwise_align_nucleotide. Must
        support skbio.Sequence objects or skbio.TabularMSA objects
        as input.
    metric : function, optional
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects. This will be used to build a guide tree if
        one is not provided, and always to build the output tree.
    guide_tree : skbio.TreeNode, optional
        The tree that should be used to guide the alignment process. When
        ``None``, one is built from ``sequences`` using ``metric``.
    display_aln : bool, optional
        Print the alignment before returning.
    display_tree : bool, optional
        Print a header and call ``dendrogram`` on the output linkage
        matrix before returning.

    Returns
    -------
    msa
        The result of ``progressive_msa``.
    skbio.TreeNode
        Tree built from the average-linkage clustering of the alignment.

    """
    if guide_tree is None:
        guide_dm = DistanceMatrix.from_iterable(
            sequences, metric=metric, key='id')
        guide_lm = average(guide_dm.condensed_form())
        guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)

    msa = progressive_msa(sequences, guide_tree,
                          pairwise_aligner=pairwise_aligner)
    if display_aln:
        print(msa)

    msa_dm = DistanceMatrix.from_iterable(msa, metric=metric, key='id')
    msa_lm = average(msa_dm.condensed_form())
    msa_tree = TreeNode.from_linkage_matrix(msa_lm, msa_dm.ids)

    if display_tree:
        print("\nOutput tree:")
        # The return value is not needed; the call is made for whatever
        # drawing `dendrogram` does (presumably scipy's -- confirm).
        dendrogram(msa_lm, labels=msa_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black', leaf_font_size=24)

    return msa, msa_tree
def test_fsvd(self):
    """Compare fsvd with eigh and exercise number_of_dimensions limits."""
    # Three independent copies of the same matrix.
    dm1, dm2, dm3 = (
        DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
        for _ in range(3))
    comparison_options = dict(ignore_directionality=True,
                              ignore_method_names=True)

    # Test eigh vs. fsvd pcoa and inplace parameter
    expected_results = pcoa(dm1, method="eigh", number_of_dimensions=3,
                            inplace=False)
    results = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                   inplace=False)
    results_inplace = pcoa(dm2, method="fsvd", number_of_dimensions=3,
                           inplace=True)

    assert_ordination_results_equal(results, expected_results,
                                    **comparison_options)
    assert_ordination_results_equal(results, results_inplace,
                                    **comparison_options)

    # Test number_of_dimensions edge cases
    results2 = pcoa(dm3, method="fsvd", number_of_dimensions=0,
                    inplace=False)
    expected_results2 = pcoa(dm3, method="fsvd",
                             number_of_dimensions=dm3.data.shape[0],
                             inplace=False)
    assert_ordination_results_equal(results2, expected_results2,
                                    **comparison_options)

    # Both methods must reject too many and negative dimensions.
    dim_too_large = dm1.data.shape[0] + 10
    for method in ("fsvd", "eigh"):
        with self.assertRaises(ValueError):
            pcoa(dm2, method=method, number_of_dimensions=dim_too_large)

        with self.assertRaises(ValueError):
            pcoa(dm2, method=method, number_of_dimensions=-1)

    dm_big = DistanceMatrix.read(get_data_path('PCoA_sample_data_12dim'))
    with self.assertWarnsRegex(RuntimeWarning,
                               "no value for number_of_dimensions"):
        pcoa(dm_big, method="fsvd", number_of_dimensions=0)
def test_fsvd_inplace(self):
    """In-place fsvd must match in-place eigh on the same data."""
    data_path = 'PCoA_sample_data_3'
    eigh_input = DistanceMatrix.read(get_data_path(data_path))
    fsvd_input = DistanceMatrix.read(get_data_path(data_path))

    expected_results = pcoa(eigh_input, method="eigh",
                            number_of_dimensions=3, inplace=True)
    results = pcoa(fsvd_input, method="fsvd",
                   number_of_dimensions=3, inplace=True)

    assert_ordination_results_equal(results, expected_results,
                                    ignore_directionality=True,
                                    ignore_method_names=True)
def distmat_corr(truthfile, distfile, reps=3, corrstat=spearman):
    '''Returns correlation between condensed distance matrices, using corrstat

    The truth matrix is first passed through ``sample_matrix_to_runs`` with
    ``reps``. Both matrices must then hold the same set of IDs; they are put
    in sorted-ID order before their condensed forms are compared.
    '''
    observed = DistanceMatrix.read(distfile)
    reference = sample_matrix_to_runs(DistanceMatrix.read(truthfile), reps)

    ids = sorted(observed.ids)
    t_ids = sorted(reference.ids)
    assert ids == t_ids, (ids, t_ids)

    dist = observed.filter(ids).condensed_form()
    truth = reference.filter(ids).condensed_form()
    return corrstat(truth, dist)
def test_to_series_4x4(self):
    """Check ``to_series`` against one entry per pair of distinct IDs."""
    ids = ['a', 'b', 'c', 'd']
    rows = [
        [0.0, 0.2, 0.3, 0.4],
        [0.2, 0.0, 0.5, 0.6],
        [0.3, 0.5, 0.0, 0.7],
        [0.4, 0.6, 0.7, 0.0],
    ]
    series = DistanceMatrix(rows, ids).to_series()

    pairs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
             ('b', 'c'), ('b', 'd'), ('c', 'd')]
    exp = pd.Series([0.2, 0.3, 0.4, 0.5, 0.6, 0.7], index=pd.Index(pairs))

    assert_series_almost_equal(series, exp)
def test_empty(self):
    """Two samples with empty count vectors are expected at distance zero."""
    ids = ['a', 'b']
    zeros = [[0.0, 0.0], [0.0, 0.0]]

    # array of empty vectors
    actual = beta_diversity('euclidean',
                            np.array([[], []], dtype=np.int64),
                            ids=['a', 'b'])
    expected_dm = DistanceMatrix(zeros, ids)
    npt.assert_array_equal(actual, expected_dm)

    # Same input through a phylogenetic metric, with an empty otu_ids list.
    actual = beta_diversity('unweighted_unifrac',
                            np.array([[], []], dtype=np.int64),
                            ids=['a', 'b'], tree=self.tree1, otu_ids=[])
    expected_dm = DistanceMatrix(zeros, ids)
    self.assertEqual(actual, expected_dm)
def setUp(self):
    """Create the input matrices and load the expected result tables."""
    self.minx = DistanceMatrix([[0, 1, 2],
                                [1, 0, 3],
                                [2, 3, 0]])
    self.miny = DistanceMatrix([[0, 2, 7],
                                [2, 0, 6],
                                [7, 6, 0]])
    self.minz = DistanceMatrix([[0, 0.5, 0.25],
                                [0.5, 0, 0.1],
                                [0.25, 0.1, 0]])
    self.min_dms = (self.minx, self.miny, self.minz)

    # Versions of self.minx and self.minz (above) that each have an extra
    # ID on the end.
    self.x_extra = DistanceMatrix([[0, 1, 2, 7],
                                   [1, 0, 3, 2],
                                   [2, 3, 0, 4],
                                   [7, 2, 4, 0]],
                                  ['0', '1', '2', 'foo'])
    self.z_extra = DistanceMatrix([[0, 0.5, 0.25, 3],
                                   [0.5, 0, 0.1, 24],
                                   [0.25, 0.1, 0, 5],
                                   [3, 24, 5, 0]],
                                  ['0', '1', '2', 'bar'])

    # Load expected results. We have to load the p-value column (column
    # index 3) as a string dtype in order to compare with the in-memory
    # results since we're formatting the p-values as strings with the
    # correct number of decimal places. Without this explicit converter,
    # the p-value column will be loaded as a float dtype and the frames
    # won't compare equal.
    p_val_conv = {3: str}

    def load_expected(filename):
        return pd.read_csv(get_data_path(filename), sep='\t',
                           index_col=(0, 1), converters=p_val_conv)

    self.exp_results_minimal = load_expected(
        'pwmantel_exp_results_minimal.txt')
    self.exp_results_minimal_with_labels = load_expected(
        'pwmantel_exp_results_minimal_with_labels.txt')
    self.exp_results_duplicate_dms = load_expected(
        'pwmantel_exp_results_duplicate_dms.txt')
    self.exp_results_na_p_value = load_expected(
        'pwmantel_exp_results_na_p_value.txt')
    self.exp_results_too_few_permutations = load_expected(
        'pwmantel_exp_results_too_few_permutations.txt')
    self.exp_results_reordered_distance_matrices = load_expected(
        'pwmantel_exp_results_reordered_distance_matrices.txt')
def test_compute_collapsed_dm(self):
    """Collapse a/b into 'x', then x/c into 'yy', checking each result."""
    first_expected = DistanceMatrix(
        [[0, 7, 7, 6],
         [7, 0, 8, 7],
         [7, 8, 0, 3],
         [6, 7, 3, 0]],
        ['x', 'c', 'd', 'e'])
    first_observed = _compute_collapsed_dm(self.dm1, 'a', 'b', True, 'x')
    self.assertEqual(first_observed, first_expected)

    # computed manually
    second_expected = DistanceMatrix(
        [[0, 4, 3],
         [4, 0, 3],
         [3, 3, 0]],
        ['yy', 'd', 'e'])
    second_observed = _compute_collapsed_dm(first_expected, 'x', 'c',
                                            True, 'yy')
    self.assertEqual(second_observed, second_expected)
def test_compute_q(self):
    """Check ``_compute_q`` on the 5x5 fixture and on a small 3x3 matrix."""
    five_ids = list('abcde')
    q_five = DistanceMatrix(
        [[0, -50, -38, -34, -34],
         [-50, 0, -38, -34, -34],
         [-38, -38, 0, -40, -40],
         [-34, -34, -40, 0, -48],
         [-34, -34, -40, -48, 0]],
        five_ids)
    self.assertEqual(_compute_q(self.dm1), q_five)

    small = DistanceMatrix([[0, 3, 2],
                            [3, 0, 3],
                            [2, 3, 0]],
                           list('abc'))
    # computed this manually
    q_small = DistanceMatrix([[0, -8, -8],
                              [-8, 0, -8],
                              [-8, -8, 0]],
                             list('abc'))
    self.assertEqual(_compute_q(small), q_small)
def test_distances(self):
    """Check ``distances`` with hamming and with a constant metric."""
    ids = ['d1', 'd2']
    seqs = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])

    expected = DistanceMatrix([[0, 0.25],
                               [0.25, 0]], ids)
    actual = seqs.distances(hamming)
    self.assertEqual(actual, expected)

    # alt distance function provided
    def dumb_distance(s1, s2):
        return 42.

    expected = DistanceMatrix([[0, 42.],
                               [42., 0]], ids)
    actual = seqs.distances(dumb_distance)
    self.assertEqual(actual, expected)
def test_distances(self):
    """Check ``distances`` with its default and with a constant metric."""
    ids = ['d1', 'd2', 'd3']

    default_rows = [[0, 6. / 13, 4. / 13],
                    [6. / 13, 0, 7. / 13],
                    [4. / 13, 7. / 13, 0]]
    self.assertEqual(self.a1.distances(),
                     DistanceMatrix(default_rows, ids))

    # alt distance function provided
    def dumb_distance(s1, s2):
        return 42.

    constant_rows = [[0, 42., 42.],
                     [42., 0, 42.],
                     [42., 42., 0]]
    self.assertEqual(self.a1.distances(dumb_distance),
                     DistanceMatrix(constant_rows, ids))
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Call ``dendrogram`` on the linkage matrix before returning.

    Returns
    -------
    tree
        Whatever ``to_tree`` returns for the average-linkage matrix.
        NOTE(review): this was documented as ``skbio.TreeNode``; if
        ``to_tree`` is scipy's it is a ``ClusterNode`` instead -- confirm.

    """
    guide_dm = DistanceMatrix.from_iterable(sequences, metric=metric,
                                            key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        # The return value is not needed; the call is made for whatever
        # drawing `dendrogram` does (presumably scipy's -- confirm).
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
def aln_distmat(alignment, reps=3):
    '''Calculate pairwise distances from a MSA of genomes

    The alignment is read as DNA, re-indexed by sequence "id", and the
    hamming metric is applied to each pair of sequence value arrays.
    NOTE(review): ``reps`` is accepted but not used in this body.
    '''
    msa = TabularMSA.read(alignment, constructor=DNA)
    msa.reassign_index(minter="id")
    sequence_values = [seq.values for seq in msa]
    return DistanceMatrix.from_iterable(sequence_values, metric=hamming,
                                        keys=msa.index)
def bioenv(output_dir: str, distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata) -> None:
    """Run bioenv on the numeric metadata columns and render an HTML report.

    Parameters
    ----------
    output_dir : str
        Directory handed to ``q2templates.render``.
    distance_matrix : skbio.DistanceMatrix
        Distances between samples; filtered to the samples that remain in
        the metadata.
    metadata : qiime2.Metadata
        Sample metadata; only numeric, non-constant columns are used.
    """
    def _to_numeric_if_possible(column):
        # Same effect as pd.to_numeric(column, errors='ignore'), which is
        # deprecated in recent pandas: keep the column when it cannot be
        # parsed as numeric.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # convert metadata to numeric values where applicable, drop the
    # non-numeric values, and then drop samples that contain NaNs
    df = metadata.to_dataframe()
    df = df.apply(_to_numeric_if_possible)

    # filter categorical columns
    pre_filtered_cols = set(df.columns)
    df = df.select_dtypes([numpy.number]).dropna()
    filtered_categorical_cols = pre_filtered_cols - set(df.columns)

    # filter 0 variance numerical columns
    pre_filtered_cols = set(df.columns)
    df = df.loc[:, df.var() != 0]
    filtered_zero_variance_cols = pre_filtered_cols - set(df.columns)

    # filter the distance matrix to exclude samples that were dropped from
    # the metadata, and keep track of how many samples survived the
    # filtering so that information can be presented to the user.
    initial_dm_length = distance_matrix.shape[0]
    distance_matrix = distance_matrix.filter(df.index, strict=False)
    filtered_dm_length = distance_matrix.shape[0]

    result = skbio.stats.distance.bioenv(distance_matrix, df)
    result = result.to_html(classes='table table-striped table-hover').replace(
        'border="1"', 'border="0"')

    index = os.path.join(TEMPLATES, 'bioenv_assets', 'index.html')
    # Sets have no defined iteration order; sort so that the rendered
    # column lists are the same on every run.
    q2templates.render(index, output_dir, context={
        'initial_dm_length': initial_dm_length,
        'filtered_dm_length': filtered_dm_length,
        'filtered_categorical_cols': ', '.join(
            sorted(filtered_categorical_cols)),
        'filtered_zero_variance_cols': ', '.join(
            sorted(filtered_zero_variance_cols)),
        'result': result})
def test_simple(self):
    """Default ``pcoa`` must reproduce the stored ordination results."""
    axis_labels = ['PC%d' % i for i in range(1, 10)]
    sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                  'PC.593', 'PC.355', 'PC.607', 'PC.634']

    eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
         0.16054235, 0.15017696, 0.12245775, 0.0],
        index=axis_labels)
    samples = pd.DataFrame(
        np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
        index=sample_ids, columns=axis_labels)
    proportion_explained = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
         0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
        index=axis_labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=samples,
        proportion_explained=proportion_explained)

    dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    results = pcoa(dm)

    assert_ordination_results_equal(results, expected_results,
                                    ignore_directionality=True)
def drawTree(MS_distDict, Methyl_distDict, filtered_samples, ratio, outgroup):
    """Merge MS and Methyl distance matrices and build a rooted NJ tree.

    Parameters
    ----------
    MS_distDict, Methyl_distDict : nested mappings
        Indexed as ``d[sample1][sample2]`` to give a pairwise distance.
    filtered_samples : iterable
        Samples to include; they are used in sorted order.
    ratio : number
        Weight of the MS distance; the Methyl distance gets ``1 - ratio``.
    outgroup : str
        ``"NA"`` leaves the tree as built, ``"Midpoint"`` roots it at the
        midpoint outgroup, any other value is passed to ``set_outgroup``.

    Returns
    -------
    The ete ``Tree`` built from the neighbor-joining result.
    """
    # Sort once: the same order is needed for rows, columns and IDs, and a
    # one-shot iterable could not be sorted repeatedly.
    samples = sorted(filtered_samples)

    # We want to scale methyl PD dist properly because PD is calculated
    # from a 0-100 scale while MS dist is 0-1 scale, hence the / 100 on the
    # Methyl term only.
    merged_distMatrix = [
        [
            (MS_distDict[sample1][sample2] * ratio)
            + (Methyl_distDict[sample1][sample2] * (1 - ratio)) / 100
            for sample2 in samples
        ]
        for sample1 in samples
    ]

    # Run neighbor-joining phylogenetic tree building algorithm on the
    # merged pairwise cell distances.
    distObj = DistanceMatrix(merged_distMatrix, samples)
    print(distObj.data)
    skbio_tree = nj(distObj, result_constructor=str)
    # We use skbio to first make a tree from the distance matrix, then
    # convert it to an ete tree.
    ete_tree = Tree(skbio_tree)

    # Compare by value: `is` against a string literal tests identity and is
    # not guaranteed to match an equal string.
    if outgroup == "NA":
        return ete_tree

    if outgroup == "Midpoint":
        tree_midpoint = ete_tree.get_midpoint_outgroup()
        ete_tree.set_outgroup(tree_midpoint)
    else:
        ete_tree.set_outgroup(outgroup)
    return ete_tree
def test_confirm_betadispr_results(self):
    """Check ``permdisp`` by BodySite on the moving pictures data.

    Both the default test and ``test='centroid'`` are checked.
    """
    mp_dm = DistanceMatrix.read(get_data_path('moving_pictures_dm.tsv'))
    mp_mf = pd.read_csv(get_data_path('moving_pictures_mf.tsv'), sep='\t')
    mp_mf.set_index('#SampleID', inplace=True)

    obs_med_mp = permdisp(mp_dm, mp_mf, column='BodySite')
    obs_cen_mp = permdisp(mp_dm, mp_mf, column='BodySite',
                          test='centroid')

    exp_ind = ['method name', 'test statistic name', 'sample size',
               'number of groups', 'test statistic', 'p-value',
               'number of permutations']

    def expected_series(values):
        return pd.Series(data=values, index=exp_ind, dtype='object',
                         name='PERMDISP results')

    exp_med_mp = expected_series(
        ['PERMDISP', 'F-value', 33, 4, 10.1956, 0.001, 999])
    exp_cen_mp = expected_series(
        ['PERMDISP', 'F-value', 33, 4, 17.4242, 0.001, 999])

    self.assert_series_equal(exp_med_mp, obs_med_mp)
    self.assert_series_equal(exp_cen_mp, obs_cen_mp)
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    """Process every column produced from the beta or alpha inputs.

    The mapping files are loaded as string-typed, tab-separated frames
    indexed by '#SampleID'. If ``betas`` is truthy they are read as distance
    matrices and the generated work items are processed in parallel with
    ``jobs`` workers. Otherwise the alpha files are loaded like the mappings
    and the work items are processed one after another.
    """
    def read_table(path):
        return pd.read_csv(path, sep='\t', dtype=str, na_values=na_values)

    # As we can have multiple mapping, alpha or beta files, each group is
    # kept in a dictionary keyed by file path.
    mappings = {path: read_table(path) for path in mappings}
    for frame in mappings.values():
        frame.set_index('#SampleID', inplace=True)

    if not betas:
        alphas = {path: read_table(path) for path in alphas}
        for frame in alphas.values():
            frame.set_index('#SampleID', inplace=True)

        work_items = _generate_alphas(alphas, mappings, output, overwrite)
        for af, c, fname, finfo in work_items:
            _process_column(af, c, fname, finfo, alphas, betas,
                            permutations)
        return

    betas = {path: DistanceMatrix.read(path) for path in betas}
    with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
        par(joblib.delayed(_process_column)(
                bf, c, fname, finfo, alphas, betas, permutations)
            for bf, c, fname, finfo in _generate_betas(
                betas, mappings, permutations, output, overwrite))
def setUp(self):
    """Compute the ordination fixture and load the descriptor table."""
    # Crawford dataset for unweighted UniFrac
    distances = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    self.ordination = pcoa(distances)

    # The file is indexed by 'Taxon'; the table is stored transposed.
    descriptor_table = pd.read_table(
        get_data_path('PCoA_biplot_descriptors'), index_col='Taxon')
    self.descriptors = descriptor_table.T
def test_heatmap_extra_tips(self):
    """Render the heatmap when the tree has more tips than table features."""
    # Adds in test scenario where there more tips than features
    # in the table
    np.random.seed(0)
    num_otus = 11  # otus
    # The builtin `str` replaces the `np.str` alias, which no longer
    # exists in current NumPy releases.
    index = np.arange(5).astype(str)
    table = pd.DataFrame(np.random.random((len(index), num_otus)),
                         index=index,
                         columns=np.arange(num_otus).astype(str))

    # Twice as many values as features, so the tree gets the extra tips.
    x = np.random.rand(num_otus*2)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

    # Name the internal nodes and give every node a random length.
    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand()*3

    md = MetadataCategory(
        pd.Series(['a', 'a', 'a', 'b', 'b'], index=index))

    dendrogram_heatmap(self.results, table, t, md)

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()
        self.assertIn('<h1>Dendrogram heatmap</h1>', html)
def fromSequences(cls, labels, sequences, findParams=None, **kwargs):
    """
    Construct an NJTree instance from some sequences.

    @param cls: Our class.
    @param labels: An iterable producing C{str} labels for the sequences.
    @param sequences: Either A C{str} filename of sequences to consider or
        a C{light.reads.Reads} instance.
    @param findParams: An instance of C{FindParameters}.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @return: An C{NJTree} instance.
    """
    if isinstance(sequences, str):
        sequences = FastaReads(sequences, readClass=AAReadWithX,
                               upperCase=True)

    new = cls()
    new.sequences = list(sequences)
    # Materialize the labels: they are stored on the instance and also
    # passed to DistanceMatrix, so a one-shot iterable would be exhausted
    # by the second use.
    new.labels = list(labels)
    findParams = findParams or FindParameters()
    affinity = np.array(
        affinityMatrix(new.sequences, findParams=findParams, **kwargs))
    # Distance is taken as one minus affinity.
    new.distance = np.ones(affinity.shape) - affinity
    new.tree = nj(DistanceMatrix(new.distance, new.labels))
    return new
def effect_size(mappings, alphas, betas, output, jobs, permutations,
                overwrite, na_values):
    """Process every column produced from the beta or alpha inputs.

    The mapping files are loaded as string-typed, tab-separated frames
    indexed by '#SampleID'. If ``betas`` is truthy they are read as distance
    matrices and the generated work items are processed in parallel with
    ``jobs`` workers. Otherwise the alpha files are loaded like the mappings
    and the work items are processed one after another.
    """
    def read_table(path):
        return pd.read_csv(path, sep='\t', dtype=str, na_values=na_values)

    # As we can have multiple mapping, alpha or beta files, each group is
    # kept in a dictionary keyed by file path.
    mappings = {path: read_table(path) for path in mappings}
    for frame in mappings.values():
        frame.set_index('#SampleID', inplace=True)

    if not betas:
        alphas = {path: read_table(path) for path in alphas}
        for frame in alphas.values():
            frame.set_index('#SampleID', inplace=True)

        work_items = _generate_alphas(alphas, mappings, output, overwrite)
        for af, c, fname, finfo in work_items:
            _process_column(af, c, fname, finfo, alphas, betas,
                            permutations)
        return

    betas = {path: DistanceMatrix.read(path) for path in betas}
    with joblib.parallel.Parallel(n_jobs=jobs, verbose=100) as par:
        par(joblib.delayed(_process_column)(
                bf, c, fname, finfo, alphas, betas, permutations)
            for bf, c, fname, finfo in _generate_betas(
                betas, mappings, permutations, output, overwrite))
def guide_tree_from_sequences(sequences,
                              metric=kmer_distance,
                              display_tree=False):
    """ Build a UPGMA tree by applying metric to sequences

    Parameters
    ----------
    sequences : list of skbio.Sequence objects (or subclasses)
        The sequences to be represented in the resulting guide tree.
    metric : function
        Function that returns a single distance value when given a pair of
        skbio.Sequence objects.
    display_tree : bool, optional
        Call ``dendrogram`` on the linkage matrix before returning.

    Returns
    -------
    tree
        Whatever ``to_tree`` returns for the average-linkage matrix.
        NOTE(review): this was documented as ``skbio.TreeNode``; if
        ``to_tree`` is scipy's it is a ``ClusterNode`` instead -- confirm.

    """
    guide_dm = DistanceMatrix.from_iterable(sequences, metric=metric,
                                            key='id')
    guide_lm = average(guide_dm.condensed_form())
    guide_tree = to_tree(guide_lm)
    if display_tree:
        # The return value is not needed; the call is made for whatever
        # drawing `dendrogram` does (presumably scipy's -- confirm).
        dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
                   link_color_func=lambda x: 'black')
    return guide_tree
def __init__(self, dist_matrix):
    """Build a neighbor-joining tree and flatten it into drawing lists.

    ``dist_matrix`` must provide ``nr_elements``, ``get_distance(i, j)``
    and ``labels``. After construction the instance holds per-node lists
    (``ids``, ``colors``, ``node_size``), the ``shown_labels`` mapping and
    per-edge lists (``sources``, ``targets``, ``weights``).
    """
    self.dist_matrix = dist_matrix
    size = self.dist_matrix.nr_elements

    # Full square matrix, row by row.
    self.matrix = [
        [self.dist_matrix.get_distance(row, col) for col in range(size)]
        for row in range(size)
    ]
    self.ids = [str(label) for label in self.dist_matrix.labels]
    self.nj_dm = DistanceMatrix(self.matrix, self.ids)
    tree = nj(self.nj_dm)

    # self.ids is rebuilt below to hold the tree's node names.
    self.ids = []
    self.sources = []
    self.targets = []
    self.weights = []
    self.colors = []
    self.node_size = []
    self.virtual_nodes = 0
    self.shown_labels = {}
    self.font_colors = []

    # true #00A693 -- false #CC3333
    for node in tree.preorder():
        if node.name is None:
            # Unnamed node: number it and record it small, black and
            # with an empty shown label.
            self.virtual_nodes += 1
            label = 'v' + str(self.virtual_nodes)
            node.name = label
            self.ids.append(node.name)
            self.colors.append("black")
            self.node_size.append(20)
            self.shown_labels[str(label)] = ""
            self.font_colors.append('k')
            continue

        # Shorten the name to its last space-separated word, or to its
        # last two words joined together when there are at least three.
        tail_split = node.name.rsplit(' ', 1)
        if len(tail_split) > 1:
            node.name = tail_split[1]
            head_split = tail_split[0].rsplit(' ', 1)
            if len(head_split) > 1:
                node.name = head_split[1] + tail_split[1]

        # The list of "false" names is empty, so the second color is
        # always the one chosen.
        color = "#CC3333" if node.name in [] else "#00A693"
        self.ids.append(node.name)
        self.colors.append(color)
        self.node_size.append(800)
        label = node.name
        self.shown_labels[str(label)] = label

    # One edge per parent/child pair, weighted by the child's length.
    for parent in tree.preorder():
        for child in parent.children:
            self.sources.append(str(parent.name))
            self.targets.append(str(child.name))
            self.weights.append(str(child.length))
def test_varmat1(self):
    """Check ``variation_matrix`` on two columns via string comparison."""
    frame = pd.DataFrame({'x': np.arange(1, 10),
                          'y': np.arange(2, 11)})
    res = variation_matrix(frame)

    off_diagonal = 0.032013010420979787 / 2
    exp = DistanceMatrix([[0, off_diagonal],
                          [off_diagonal, 0]], ids=['x', 'y'])

    self.assertEqual(str(res), str(exp))
def test_euclidean(self):
    """Check euclidean ``beta_diversity`` on both tables."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    actual_dm = beta_diversity('euclidean', self.table1, self.sids1)
    self.assertEqual(actual_dm.shape, (3, 3))

    for sid in ('A', 'B', 'C'):
        npt.assert_almost_equal(actual_dm[sid, sid], 0.0)

    # Each off-diagonal value is checked in both orientations.
    expected_pairs = ((('A', 'B'), 2.23606798),
                      (('A', 'C'), 4.12310563),
                      (('B', 'C'), 2.82842712))
    for (first, second), distance in expected_pairs:
        npt.assert_almost_equal(actual_dm[first, second], distance)
        npt.assert_almost_equal(actual_dm[second, first], distance)

    actual_dm = beta_diversity('euclidean', self.table2, self.sids2)
    expected_data = [
        [0., 80.8455317, 84.0297566, 36.3042697, 86.0116271, 78.9176786],
        [80.8455317, 0., 71.0844568, 74.4714710, 69.3397433, 14.422205],
        [84.0297566, 71.0844568, 0., 77.2851861, 8.3066238, 60.7536007],
        [36.3042697, 74.4714710, 77.2851861, 0., 78.7908624, 70.7389567],
        [86.0116271, 69.3397433, 8.3066238, 78.7908624, 0., 58.4807660],
        [78.9176786, 14.422205, 60.7536007, 70.7389567, 58.4807660, 0.]
    ]
    expected_dm = DistanceMatrix(expected_data, self.sids2)
    for id1 in self.sids2:
        for id2 in self.sids2:
            observed = actual_dm[id1, id2]
            expected = expected_dm[id1, id2]
            npt.assert_almost_equal(observed, expected, 6)
def test_simple(self):
    """Default ``pcoa`` must reproduce the stored ordination results."""
    axis_labels = ['PC%d' % i for i in range(1, 10)]
    sample_ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354',
                  'PC.593', 'PC.355', 'PC.607', 'PC.634']

    eigvals = pd.Series(
        [0.51236726, 0.30071909, 0.26791207, 0.20898868, 0.19169895,
         0.16054235, 0.15017696, 0.12245775, 0.0],
        index=axis_labels)
    samples = pd.DataFrame(
        np.loadtxt(get_data_path('exp_PCoAEigenResults_site')),
        index=sample_ids, columns=axis_labels)
    proportion_explained = pd.Series(
        [0.2675738328, 0.157044696, 0.1399118638, 0.1091402725,
         0.1001110485, 0.0838401162, 0.0784269939, 0.0639511764, 0.0],
        index=axis_labels)

    expected_results = OrdinationResults(
        short_method_name='PCoA',
        long_method_name='Principal Coordinate Analysis',
        eigvals=eigvals,
        samples=samples,
        proportion_explained=proportion_explained)

    dm = DistanceMatrix.read(get_data_path('PCoA_sample_data_3'))
    results = pcoa(dm)

    assert_ordination_results_equal(results, expected_results,
                                    ignore_directionality=True)
def test_weighted_unifrac_partial_full(self):
    """Partial and full weighted UniFrac must match over all three pairs."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    all_pairs = [('A', 'B'), ('A', 'C'), ('B', 'C')]
    dm1 = partial_beta_diversity('weighted_unifrac', self.table1,
                                 self.sids1, otu_ids=self.oids1,
                                 tree=self.tree1, id_pairs=all_pairs)
    dm2 = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                         otu_ids=self.oids1, tree=self.tree1)

    self.assertEqual(dm1.shape, (3, 3))
    self.assertEqual(dm1, dm2)

    expected_dm = DistanceMatrix(
        [[0.0, 0.1750000, 0.12499999],
         [0.1750000, 0.0, 0.3000000],
         [0.12499999, 0.3000000, 0.0]],
        ids=self.sids1)
    for id1 in self.sids1:
        for id2 in self.sids1:
            observed = dm1[id1, id2]
            expected = expected_dm[id1, id2]
            npt.assert_almost_equal(observed, expected, 6)
def test_weighted_unifrac_normalized(self):
    """Normalized weighted UniFrac by name and by function must agree."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    # expected values calculated by hand
    shared_kwargs = dict(otu_ids=self.oids1, tree=self.tree1,
                         normalized=True)
    dm1 = beta_diversity('weighted_unifrac', self.table1, self.sids1,
                         **shared_kwargs)
    dm2 = beta_diversity(weighted_unifrac, self.table1, self.sids1,
                         **shared_kwargs)

    self.assertEqual(dm1.shape, (3, 3))
    self.assertEqual(dm1, dm2)

    expected_dm = DistanceMatrix(
        [[0.0, 0.128834, 0.085714],
         [0.128834, 0.0, 0.2142857],
         [0.085714, 0.2142857, 0.0]],
        ids=self.sids1)
    for id1 in self.sids1:
        for id2 in self.sids1:
            observed = dm1[id1, id2]
            expected = expected_dm[id1, id2]
            npt.assert_almost_equal(observed, expected, 6)
def test_braycurtis(self):
    """Check Bray-Curtis ``beta_diversity`` on both tables."""
    # TODO: update npt.assert_almost_equal calls to use DistanceMatrix
    # near-equality testing when that support is available
    actual_dm = beta_diversity('braycurtis', self.table1, self.sids1)
    self.assertEqual(actual_dm.shape, (3, 3))

    for sid in ('A', 'B', 'C'):
        npt.assert_almost_equal(actual_dm[sid, sid], 0.0)

    # Each off-diagonal value is checked in both orientations.
    expected_pairs = ((('A', 'B'), 0.27272727),
                      (('A', 'C'), 0.71428571),
                      (('B', 'C'), 0.66666667))
    for (first, second), distance in expected_pairs:
        npt.assert_almost_equal(actual_dm[first, second], distance)
        npt.assert_almost_equal(actual_dm[second, first], distance)

    actual_dm = beta_diversity('braycurtis', self.table2, self.sids2)
    expected_data = [
        [0., 0.78787879, 0.86666667, 0.30927835, 0.85714286, 0.81521739],
        [0.78787879, 0., 0.78142077, 0.86813187, 0.75, 0.1627907],
        [0.86666667, 0.78142077, 0., 0.87709497, 0.09392265, 0.71597633],
        [0.30927835, 0.86813187, 0.87709497, 0., 0.87777778, 0.89285714],
        [0.85714286, 0.75, 0.09392265, 0.87777778, 0., 0.68235294],
        [0.81521739, 0.1627907, 0.71597633, 0.89285714, 0.68235294, 0.]
    ]
    expected_dm = DistanceMatrix(expected_data, self.sids2)
    for id1 in self.sids2:
        for id2 in self.sids2:
            observed = actual_dm[id1, id2]
            expected = expected_dm[id1, id2]
            npt.assert_almost_equal(observed, expected, 6)
def test_visualization_garbage_metadata(self):
    """Render the heatmap when the metadata has IDs absent from the table."""
    # tests the scenario where ndim > number of tips
    np.random.seed(0)
    num_otus = 10  # otus
    num_samples = 5
    # The builtin `str` replaces the `np.str` alias, which no longer
    # exists in current NumPy releases.
    table = pd.DataFrame(np.random.random((num_samples, num_otus)),
                         index=np.arange(num_samples).astype(str),
                         columns=np.arange(num_otus).astype(str))

    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
    lm = ward(dm.condensed_form())
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))

    # Name the internal nodes and give every node a random length.
    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand()*3

    # Seven metadata entries for a table with five samples.
    md = MetadataCategory(
        pd.Series(['a', 'a', 'a', 'b', 'b', 'foo', 'foo'],
                  index=np.arange(7).astype(str)))

    dendrogram_heatmap(self.results, table, t, md)

    index_fp = os.path.join(self.results, 'index.html')
    self.assertTrue(os.path.exists(index_fp))

    with open(index_fp, 'r') as fh:
        html = fh.read()
        self.assertIn('<h1>Dendrogram heatmap</h1>', html)
def setup(self):
    """Load the sample distance matrix and build the PCoA fixture.

    Sets ``self.ordination`` to a ``PCoA`` built from the matrix parsed out
    of the ``PCoA_sample_data_3`` data file, and ``self.ids`` to a
    hard-coded list of nine sample ids.
    """
    # Plain text mode: the legacy 'U' (universal newlines) flag was
    # deprecated since Python 3.4 and raises ValueError on Python 3.11+.
    # Universal newline handling is already the default in text mode.
    with open(get_data_path('PCoA_sample_data_3'), 'r') as lines:
        dist_matrix = DistanceMatrix.from_file(lines)

    self.ordination = PCoA(dist_matrix)

    self.ids = ['PC.636', 'PC.635', 'PC.356', 'PC.481', 'PC.354', 'PC.593',
                'PC.355', 'PC.607', 'PC.634']
def get_spearmans(distfile, truth):
    """Return the Spearman correlation between a stored matrix and ``truth``.

    The matrix read from ``distfile`` is reordered so its ids are sorted,
    then its condensed form is correlated against the condensed form of
    ``truth``.

    Parameters
    ----------
    distfile
        Source passed straight to ``DistanceMatrix.read``.
    truth
        Object providing ``condensed_form()``.
        NOTE(review): ``truth`` is not reordered here, so it presumably
        already uses sorted id order -- confirm against callers.

    Returns
    -------
    The ``correlation`` attribute of the ``stats.spearmanr`` result.
    """
    observed = DistanceMatrix.read(distfile)
    # Reorder to sorted id order before flattening.
    observed = observed.filter(sorted(observed.ids))
    result = stats.spearmanr(truth.condensed_form(),
                             observed.condensed_form())
    return result.correlation
def test_unweighted_unifrac_qiime_tiny_test(self):
    """Compare ``unweighted_unifrac`` with a stored distance matrix.

    Every ordered pair of columns in ``self.q_table`` is scored and checked
    against the matching entry of the matrix read from the
    ``qiime-191-tt`` data directory.
    """
    expected_fp = get_data_path(
        os.path.join('qiime-191-tt', 'unweighted_unifrac_dm.txt'), 'data')
    expected_dm = DistanceMatrix.read(expected_fp)

    table = self.q_table
    sample_pairs = ((first, second)
                    for first in table.columns
                    for second in table.columns)
    for first, second in sample_pairs:
        observed = unweighted_unifrac(
            table[first], table[second], otu_ids=table.index,
            tree=self.q_tree)
        self.assertAlmostEqual(observed, expected_dm[first, second])
def test_io(self):
    """Round-trip ``self.dm_3x3`` through ``write``/``read`` and compare."""
    # Very basic check that read/write public API is present and appears to
    # be functioning. Roundtrip from memory -> disk -> memory and ensure
    # results match.
    fh = StringIO()
    self.dm_3x3.write(fh)
    # Rewind so the read starts at the beginning of what was just written.
    fh.seek(0)
    deserialized = DistanceMatrix.read(fh)
    self.assertEqual(deserialized, self.dm_3x3)
    # Exact-type check (a subclass or superclass instance must not pass).
    # assertIs reports both operands on failure, unlike
    # assertTrue(type(...) == ...), which only says "False is not true".
    self.assertIs(type(deserialized), DistanceMatrix)
def test_from_iterable_with_keys(self):
    """``from_iterable`` uses the ids supplied through ``keys``."""
    labels = ['0', '1', '4', '9']
    expected = DistanceMatrix(
        [[0, 1, 2, 3],
         [1, 0, 1, 2],
         [2, 1, 0, 1],
         [3, 2, 1, 0]],
        labels)

    # Both the values and the keys are handed over as one-shot iterators.
    values = (x for x in range(4))
    observed = DistanceMatrix.from_iterable(
        values, lambda a, b: abs(b - a), keys=iter(labels))

    self.assertEqual(observed, expected)
def setUp(self):
    """Build ``self.tree``: a Ward-clustered tree over 10 random values.

    Every node gets a branch length of 1 and every internal node is named
    ``y<postorder index>``.
    """
    np.random.seed(0)
    x = np.random.rand(10)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
    lm = ward(dm.condensed_form())
    # Builtin ``str`` replaces the ``np.str`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    ids = np.arange(len(x)).astype(str)
    self.tree = TreeNode.from_linkage_matrix(lm, ids)

    # initialize tree with branch length and named internal nodes
    for i, n in enumerate(self.tree.postorder(include_self=True)):
        n.length = 1
        if not n.is_tip():
            n.name = "y%d" % i
def test_varmat_larg(self):
    """Compare ``variation_matrix`` on a 100x50 table with a stored result.

    The input is the absolute value of seeded multivariate-normal draws;
    the result is compared with ``exp_varmat.txt`` via string
    representation.
    """
    np.random.seed(123)
    D = 50
    N = 100
    mean = np.ones(D)*10
    cov = np.eye(D)
    # Builtin ``str`` replaces the ``np.str`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    X = pd.DataFrame(np.abs(np.random.multivariate_normal(mean, cov,
                                                          size=N)),
                     columns=np.arange(D).astype(str))
    res = variation_matrix(X)

    exp = DistanceMatrix.read(get_data_path('exp_varmat.txt'))
    self.assertEqual(str(res), str(exp))
def setUp(self):
    """Build the fixtures: a random 5x5 table and a dendrogram over 5 OTUs.

    Sets ``self.table`` to the random table and ``self.tree`` to a
    ``SquareDendrogram`` made from a Ward-clustered tree. The source tree's
    internal nodes are then named ``y<postorder index>`` and every node is
    given a random branch length.
    """
    np.random.seed(0)
    self.table = pd.DataFrame(np.random.random((5, 5)))
    num_otus = 5  # otus
    x = np.random.rand(num_otus)
    dm = DistanceMatrix.from_iterable(x, lambda x, y: np.abs(x-y))
    lm = ward(dm.condensed_form())
    # Builtin ``str`` replaces the ``np.str`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
    t = TreeNode.from_linkage_matrix(lm, np.arange(len(x)).astype(str))
    # The dendrogram is created before the loop below touches ``t``; this
    # ordering is kept exactly as it was.
    self.tree = SquareDendrogram.from_tree(t)

    for i, n in enumerate(t.postorder()):
        if not n.is_tip():
            n.name = "y%d" % i
        n.length = np.random.rand()*3
def adonis(output_dir: str,
           distance_matrix: skbio.DistanceMatrix,
           metadata: qiime2.Metadata,
           formula: str,
           permutations: int = 999,
           n_jobs: str = 1) -> None:
    """Run the ``run_adonis.R`` script and render its results as HTML.

    Parameters
    ----------
    output_dir
        Directory that receives ``adonis.tsv`` and the rendered template.
    distance_matrix
        Matrix whose ids select and order the metadata rows.
    metadata
        Sample metadata; checked against the matrix ids by
        ``_validate_metadata_is_superset``.
    formula
        Model formula; each right-hand-side factor is looked up as a
        metadata column before the script is run.
    permutations
        Forwarded to the script as a string.
    n_jobs
        Forwarded to the script as a string.
        NOTE(review): annotated ``str`` but defaults to the int ``1`` --
        confirm the intended type.
    """
    # Validate sample metadata is superset et cetera
    sample_ids = distance_matrix.ids
    _validate_metadata_is_superset(set(metadata.ids), set(sample_ids))

    # filter ids. ids must be in same order as dm
    md_frame = metadata.to_dataframe().reindex(sample_ids)
    md_frame.index.name = 'sample-id'
    metadata = qiime2.Metadata(md_frame)

    # Validate formula: get_column is called only for validation and its
    # return value is discarded (presumably it raises for unknown columns --
    # confirm).
    model = ModelDesc.from_formula(formula)
    for term in model.rhs_termlist:
        for factor in term.factors:
            metadata.get_column(factor.name())

    # Run adonis. The script inputs live in a temporary directory; only the
    # results file is written into output_dir.
    results_fp = os.path.join(output_dir, 'adonis.tsv')
    with tempfile.TemporaryDirectory() as scratch_dir:
        dm_fp = os.path.join(scratch_dir, 'dm.tsv')
        md_fp = os.path.join(scratch_dir, 'md.tsv')
        distance_matrix.write(dm_fp)
        metadata.save(md_fp)
        _run_command(['run_adonis.R', dm_fp, md_fp, formula,
                      str(permutations), str(n_jobs), results_fp])

    # Visualize results
    results_html = q2templates.df_to_html(pd.read_csv(results_fp, sep='\t'))
    index = os.path.join(TEMPLATES, 'adonis_assets', 'index.html')
    q2templates.render(index, output_dir, context={'results': results_html})
def filter_distance_matrix(distance_matrix: skbio.DistanceMatrix,
                           metadata: qiime2.Metadata,
                           where: str=None,
                           exclude_ids: bool=False) -> skbio.DistanceMatrix:
    """Filter a distance matrix by the ids selected from ``metadata``.

    Parameters
    ----------
    distance_matrix
        Matrix to filter.
    metadata
        Source of the selected ids, via ``metadata.ids(where=where)``.
    where
        Forwarded unchanged to ``metadata.ids``.
    exclude_ids
        When true, keep the matrix ids that were *not* selected instead of
        the selected ones.

    Returns
    -------
    The result of ``distance_matrix.filter`` called with ``strict=False``.

    Raises
    ------
    ValueError
        If the filter call raises ``DissimilarityMatrixError``.
    """
    selected = metadata.ids(where=where)
    keep = (set(distance_matrix.ids) - set(selected)
            if exclude_ids else selected)

    # NOTE: there is no guaranteed ordering to output distance matrix because
    # `ids_to_keep` is a set, and `DistanceMatrix.filter` uses its iteration
    # order.
    try:
        filtered = distance_matrix.filter(keep, strict=False)
    except skbio.stats.distance.DissimilarityMatrixError:
        raise ValueError(
            "All samples were filtered out of the distance matrix.")
    return filtered
def rank_linkage(r, method='average'):
    r""" Hierarchical clustering on feature ranks.

    The hierarchy is built from the rank values of the features given in
    the input vector `r`. The distance between two features :math:`x` and
    :math:`y` can be defined as

    .. math::
        d(x, y) = (r(x) - r(y))^2

    where :math:`r(x)` is the rank of the feature. Hierarchical clustering
    is then performed using :math:`d(x, y)` as the distance metric.

    This can be useful for constructing principal balances.

    Parameters
    ----------
    r : pd.Series
        Continuous vector representing some ordering of the features in X.
    method : str
        Clustering method.  (default='average')

    Returns
    -------
    skbio.TreeNode
        Tree for constructing principal balances.

    Examples
    --------
    >>> import pandas as pd
    >>> from gneiss.cluster import rank_linkage
    >>> ranks = pd.Series([1, 2, 4, 5],
    ...                   index=['o1', 'o2', 'o3', 'o4'])
    >>> tree = rank_linkage(ranks)
    >>> print(tree.ascii_art())
                        /-o1
              /y1------|
             |          \-o2
    -y0------|
             |          /-o3
              \y2------|
                        \-o4
    """
    # NOTE(review): the docstring defines d(x, y) as a squared difference,
    # while the metric passed here is `euclidean` -- confirm the two agree.
    distances = DistanceMatrix.from_iterable(r, euclidean)
    linkage_matrix = linkage(distances.condensed_form(), method)
    # Tips are labelled with the index of `r`; internal nodes are then
    # renamed by rename_internal_nodes.
    return rename_internal_nodes(
        TreeNode.from_linkage_matrix(linkage_matrix, r.index))