def setUp(self):
    # Every 3x3 fixture below uses this sample ordering.
    sample_ids = ['sample1', 'sample2', 'sample3']

    self.dm1 = skbio.DistanceMatrix(
        [[0.00, 0.25, 0.25],
         [0.25, 0.00, 0.00],
         [0.25, 0.00, 0.00]],
        ids=sample_ids)

    # Positive correlation with `dm1`
    self.dm2 = skbio.DistanceMatrix(
        [[0.00, 1.00, 2.00],
         [1.00, 0.00, 1.00],
         [2.00, 1.00, 0.00]],
        ids=sample_ids)

    # Perfect negative correlation with `dm1`
    self.dm3 = skbio.DistanceMatrix(
        [[0.00, 0.00, 0.00],
         [0.00, 0.00, 0.25],
         [0.00, 0.25, 0.00]],
        ids=sample_ids)

    # Same pairwise distances as `dm2`, but with rows/columns ordered as
    # sample3, sample1, sample2.
    self.dm2_reordered = skbio.DistanceMatrix(
        [[0.00, 2.00, 1.00],
         [2.00, 0.00, 1.00],
         [1.00, 1.00, 0.00]],
        ids=['sample3', 'sample1', 'sample2'])

    # Five-sample matrix: the three shared samples plus the extra ids
    # 'foo' and 'x'.
    self.mismatched_dm = skbio.DistanceMatrix(
        [[0.0, 0.0, 0.0, 0.0, 0.0],
         [0.0, 0.0, 1.0, 2.0, 3.0],
         [0.0, 1.0, 0.0, 1.0, 2.0],
         [0.0, 2.0, 1.0, 0.0, 1.0],
         [0.0, 3.0, 2.0, 1.0, 0.0]],
        ids=['foo', 'sample1', 'sample2', 'sample3', 'x'])

    # Keep the TemporaryDirectory object on the instance alongside its
    # path so both remain reachable from the tests.
    scratch = tempfile.TemporaryDirectory(prefix='q2-diversity-test-temp-')
    self.output_dir_obj = scratch
    self.output_dir = scratch.name
def setUp(self):
    sample_ids = ['S1', 'S2', 'S3']

    # Reference matrix.
    self.dm = skbio.DistanceMatrix(
        [[0, 1, 2.1],
         [1, 0, 3],
         [2.1, 3, 0]],
        ids=sample_ids)

    # Since support is traditionally held as the name, we'll use only two
    # trees since 1/2 has an exact floating point representation and will
    # look like `"0.5"` on any machine.
    first_support = skbio.DistanceMatrix(
        [[0, 1.1, 2],
         [1.1, 0, 3],
         [2, 3, 0]],
        ids=sample_ids)
    second_support = skbio.DistanceMatrix(
        [[0, 2, 3.1],
         [2, 0, 1],
         [3.1, 1, 0]],
        ids=sample_ids)
    self.support = [first_support, second_support]
def test_without_where_some_filtered(self):
    # The metadata only lists S1 and S2; the expected result keeps exactly
    # those two samples from the three-sample matrix.
    md_index = pd.Index(['S1', 'S2'], name='id')
    md_frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1'],
         'SampleType': ['gut', 'tongue']},
        index=md_index)
    metadata = qiime2.Metadata(md_frame)

    dm = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 3],
         [2, 3, 0]],
        ['S1', 'S2', 'S3'])

    observed = filter_distance_matrix(dm, metadata)

    expected = skbio.DistanceMatrix([[0, 1], [1, 0]], ['S1', 'S2'])
    self.assertEqual(self._sorted(observed), expected)
def test_simple(self):
    sample_ids = ['S1', 'S2', 'S3']
    dm = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 3],
         [2, 3, 0]],
        ids=sample_ids)

    # Three replicates, each differing slightly from `dm`.
    replicate_rows = [
        [[0, 1.1, 2], [1.1, 0, 3], [2, 3, 0]],
        [[0, 1, 2], [1, 0, 3.1], [2, 3.1, 0]],
        [[0, 1.1, 1.9], [1.1, 0, 3], [1.9, 3, 0]],
    ]
    replicates = [skbio.DistanceMatrix(rows, ids=sample_ids)
                  for rows in replicate_rows]

    e = _jackknifed_emperor(dm, replicates, self.md)

    self.assertEqual(len(e.jackknifed), 3)
def test_with_exclude_ids_filter_two(self):
    # With exclude_ids=True the two samples listed in the metadata
    # (S1, S2) are expected to be removed, leaving only S3.
    md_frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1'],
         'SampleType': ['gut', 'tongue']},
        index=['S1', 'S2'])
    metadata = qiime2.Metadata(md_frame)

    dm = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 3],
         [2, 3, 0]],
        ['S1', 'S2', 'S3'])

    observed = filter_distance_matrix(dm, metadata, where=None,
                                      exclude_ids=True)

    expected = skbio.DistanceMatrix([[0]], ['S3'])
    self.assertEqual(self._sorted(observed), expected)
def test_with_where_some_filtered(self):
    # Only S3 has Subject == 'subject-2', so it is the only sample the
    # expected matrix retains.
    md_frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1', 'subject-2'],
         'SampleType': ['gut', 'tongue', 'gut']},
        index=['S1', 'S2', 'S3'])
    metadata = qiime2.Metadata(md_frame)

    dm = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 3],
         [2, 3, 0]],
        ['S1', 'S2', 'S3'])

    observed = filter_distance_matrix(dm, metadata,
                                      where="Subject='subject-2'")

    expected = skbio.DistanceMatrix([[0]], ['S3'])
    self.assertEqual(observed, expected)
def distance_matrix(
        metadata: qiime2.MetadataCategory) -> skbio.DistanceMatrix:
    """Compute pairwise Euclidean distances between numeric metadata values.

    Parameters
    ----------
    metadata : qiime2.MetadataCategory
        Metadata category whose values are converted with
        ``pd.to_numeric``. The index of the resulting series supplies the
        ids of the returned matrix.

    Returns
    -------
    skbio.DistanceMatrix
        Distances computed with ``scipy.spatial.distance.pdist`` using the
        ``'euclidean'`` metric on the category values.

    Raises
    ------
    ValueError
        If the category contains non-numeric values, is empty, or contains
        missing values.
    """
    try:
        series = pd.to_numeric(metadata.to_series(), errors='raise')
    except ValueError as e:
        # Chain the original exception so the parsing failure stays visible
        # in the traceback as well as in the message.
        raise ValueError(
            "Encountered non-numeric values in the metadata category. A "
            "distance matrix can only be computed from numeric metadata. "
            "Original error message:\n\n%s" % e) from e

    # TODO this check can be removed when MetadataCategory is no longer
    # allowed to be empty
    if series.empty:
        raise ValueError(
            "Encountered metadata category that is empty, i.e. there are no "
            "samples or features in the metadata to compute distances "
            "between.")

    if series.hasnans:
        raise ValueError(
            "Encountered missing value(s) in the metadata category. Computing "
            "a distance matrix from missing values is not supported.")

    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook
    #
    # pdist expects a 2-D (observations x features) array, so the values
    # are reshaped into a single column.
    distances = scipy.spatial.distance.pdist(series.values[:, np.newaxis],
                                             metric='euclidean')
    return skbio.DistanceMatrix(distances, ids=series.index)
def compare(min_hash_signature: MinHashSigJsonDirFormat, ksize: int,
            ignore_abundance: bool = True) -> skbio.DistanceMatrix:
    """Run ``sourmash compare`` on a signature directory.

    Parameters
    ----------
    min_hash_signature : MinHashSigJsonDirFormat
        Directory format whose ``str()`` is the directory path; every
        non-hidden entry directly inside it is passed to sourmash.
    ksize : int
        Value forwarded to ``--ksize``.
    ignore_abundance : bool, optional
        When true, ``--ignore-abundance`` is added to the command.

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - similarity`` for the matrix sourmash wrote, labelled with the
        base names from the sourmash label file with a trailing
        ``.fastq.gz`` removed.

    Raises
    ------
    subprocess.CalledProcessError
        If sourmash exits with a non-zero status.
    """
    # Function-scope imports keep this block self-contained.
    import glob
    import tempfile

    suffix = '.fastq.gz'

    # Expand the wildcard here instead of relying on a shell, so the
    # command can be run as an argument list (no shell=True, no quoting
    # problems with unusual paths). Sorted to match shell glob ordering.
    signature_files = sorted(
        glob.glob(os.path.join(str(min_hash_signature), '*')))

    # Write sourmash output into a private scratch directory rather than
    # fixed file names in the current working directory; the directory is
    # removed even if sourmash or the loading step fails.
    with tempfile.TemporaryDirectory() as work_dir:
        np_file = os.path.join(work_dir, 'tmp')
        # presumably sourmash writes labels next to the matrix as
        # "<output>.labels.txt", as the original code assumed -- confirm
        label_file = np_file + '.labels.txt'

        command = ['sourmash', 'compare']
        command.extend(signature_files)
        command.extend(['--ksize', str(ksize), '-o', np_file])
        if ignore_abundance:
            command.append('--ignore-abundance')
        subprocess.run(command, check=True)

        # load np_file as np.ndarray -> np_sim
        np_sim = numpy.load(np_file)

        # read labels into a list -> labels
        labels = []
        with open(label_file) as label_fh:
            for line in label_fh:
                label = os.path.basename(line).strip()
                # str.strip('.fastq.gz') would remove any of those
                # *characters* from both ends (e.g. 'sample1.fastq.gz'
                # -> 'mple1'); drop only the literal suffix.
                if label.endswith(suffix):
                    label = label[:-len(suffix)]
                labels.append(label)

    # convert similarity to distance
    np_dis = 1 - np_sim
    return skbio.DistanceMatrix(np_dis, labels)
def setUp(self):
    super().setUp()

    # expected computed with diversity.beta_phylogenetic (weighted_unifrac)
    expected_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                    '10084.PC.355', '10084.PC.354', '10084.PC.636',
                    '10084.PC.635', '10084.PC.607', '10084.PC.634')
    expected_distances = np.array([
        0.44656238, 0.23771096, 0.30489123, 0.23446002,
        0.65723575, 0.44911772, 0.381904, 0.69144829,
        0.39611776, 0.36568012, 0.53377975, 0.48908025,
        0.35155196, 0.28318669, 0.57376916, 0.23395746,
        0.24658122, 0.60271637, 0.39802552, 0.36567394,
        0.68062701, 0.36862049, 0.48350632, 0.33024631,
        0.33266697, 0.53464744, 0.74605075, 0.53951035,
        0.49680733, 0.79178838, 0.37109012, 0.52629343,
        0.22118218, 0.32400805, 0.43189708, 0.59705893])
    self.expected = skbio.DistanceMatrix(expected_distances,
                                         ids=expected_ids)

    # Open each fixture file read-only in its file format wrapper.
    self.table_as_BIOMV210Format = BIOMV210Format(
        self.get_data_path('crawford.biom'), mode='r')
    self.rf_table_as_BIOMV210Format = BIOMV210Format(
        self.get_data_path('crawford_rf.biom'), mode='r')
    self.tree_as_NewickFormat = NewickFormat(
        self.get_data_path('crawford.nwk'), mode='r')
def setUp(self):
    super().setUp()

    # expected computed with skbio.diversity.beta_diversity
    self.expected = skbio.DistanceMatrix(
        [[0.00, 0.25, 0.25],
         [0.25, 0.00, 0.00],
         [0.25, 0.00, 0.00]],
        ids=['S1', 'S2', 'S3'])

    # Feature tables: raw, relative-frequency and presence/absence files.
    self.table_as_BIOMV210Format = BIOMV210Format(
        self.get_data_path('two_feature_table.biom'), mode='r')
    self.rf_table_as_BIOMV210Format = BIOMV210Format(
        self.get_data_path('two_feature_rf_table.biom'), mode='r')
    self.p_a_table_as_BIOMV210Format = BIOMV210Format(
        self.get_data_path('two_feature_p_a_table.biom'), mode='r')
    self.table_as_artifact = Artifact.import_data(
        'FeatureTable[Frequency]', self.table_as_BIOMV210Format)

    # Phylogeny, both as a file format and as an artifact.
    self.tree_as_NewickFormat = NewickFormat(
        self.get_data_path('three_feature.tree'), mode='r')
    self.tree_as_artifact = Artifact.import_data(
        'Phylogeny[Rooted]', self.tree_as_NewickFormat)

    # Action looked up from the plugin's registry.
    plugin_actions = self.plugin.actions
    self.unweighted_unifrac_thru_framework = plugin_actions[
        'unweighted_unifrac']
def test_generalized_unifrac(self):
    table_path = self.get_data_path('vaw.biom')
    phylogeny_path = self.get_data_path('vaw.nwk')

    actual = beta_phylogenetic(table=table_path,
                               phylogeny=phylogeny_path,
                               metric='generalized_unifrac',
                               alpha=0.5)

    expected_ids = ('Sample1', 'Sample2', 'Sample3',
                    'Sample4', 'Sample5', 'Sample6')
    expected_values = np.array([
        [0.0000000, 0.4040518, 0.6285560, 0.5869439, 0.4082483, 0.2995673],
        [0.4040518, 0.0000000, 0.4160597, 0.7071068, 0.7302479, 0.4860856],
        [0.6285560, 0.4160597, 0.0000000, 0.8005220, 0.9073159, 0.5218198],
        [0.5869439, 0.7071068, 0.8005220, 0.0000000, 0.4117216, 0.3485667],
        [0.4082483, 0.7302479, 0.9073159, 0.4117216, 0.0000000, 0.6188282],
        [0.2995673, 0.4860856, 0.5218198, 0.3485667, 0.6188282, 0.0000000],
    ])
    expected = skbio.DistanceMatrix(expected_values, ids=expected_ids)

    self.assertEqual(actual.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in actual.ids
                for col_id in actual.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(actual[row_id, col_id],
                                expected[row_id, col_id])
def test_permanova_pairwise(self):
    sample_ids = ['sample1', 'sample2', 'sample3']
    dm = skbio.DistanceMatrix(
        [[0.00, 0.25, 0.25],
         [0.25, 0.00, 0.00],
         [0.25, 0.00, 0.00]],
        ids=sample_ids)
    md = qiime2.MetadataCategory(
        pd.Series(['a', 'b', 'b'], name='a or b', index=sample_ids))

    with tempfile.TemporaryDirectory() as output_dir:
        beta_group_significance(output_dir, dm, md, pairwise=True)

        index_fp = os.path.join(output_dir, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        # all expected boxplots are generated
        for group in ('a', 'b'):
            for extension in ('pdf', 'png'):
                boxplot_fp = os.path.join(
                    output_dir, '%s-boxplots.%s' % (group, extension))
                self.assertTrue(os.path.exists(boxplot_fp), boxplot_fp)

        # no extra boxplots are generated
        self.assertEqual(len(glob.glob('%s/*-boxplots.pdf' % output_dir)),
                         2)
        self.assertEqual(len(glob.glob('%s/*-boxplots.png' % output_dir)),
                         2)

        # Read the report once through a context manager so the handle is
        # closed, instead of opening the file anew for every assertion.
        with open(index_fp) as index_fh:
            index_html = index_fh.read()

        self.assertIn('PERMANOVA results', index_html)
        self.assertIn('Pairwise permanova', index_html)
        self.assertNotIn('Warning', index_html)
def _metadata_distance(metadata: pd.Series) -> skbio.DistanceMatrix:
    """Return pairwise Euclidean distances between the values of a series.

    The series index is used as the ids of the resulting matrix.
    """
    # This code is derived from @jairideout's scikit-bio cookbook recipe,
    # "Exploring Microbial Community Diversity"
    # https://github.com/biocore/scikit-bio-cookbook

    # pdist takes a 2-D array, so present the values as a single column.
    column = metadata.values[:, numpy.newaxis]
    condensed = scipy.spatial.distance.pdist(column, metric='euclidean')
    return skbio.DistanceMatrix(condensed, ids=metadata.index)
def _bootstrap_dm(ids, dm, new_names=None):
    """Build a resampled distance matrix from a list of ids

    Parameters
    ----------
    ids: array-like
        Ids to draw from the distance matrix. Repeats are allowed.
    dm : DistanceMatrix
        The distance matrix to resample from.
    new_names: array_like, optional
        Ids for the resampled matrix; these must be unique. When omitted,
        the positions ``0 .. len(ids) - 1`` are used.

    Returns
    -------
    DistanceMatrix
        The rows and columns of `dm` selected by `ids` (in that order),
        labelled with `new_names`.
    """
    if new_names is None:
        new_names = np.arange(0, len(ids))

    # Translate each requested id into its position within `dm`.
    source_ids = dm.ids
    positions = [source_ids.index(sample_id) for sample_id in ids]

    # Select the rows first, then the same positions along the columns.
    selected_rows = dm.data[positions]
    resampled = selected_rows[:, positions]

    return skbio.DistanceMatrix(resampled, new_names)
def test_with_exclude_ids_where_filter_two(self):
    # The where clause matches S2 (tongue) and S3 (gut); with
    # exclude_ids=True only S1 is expected to remain.
    md_index = pd.Index(['S1', 'S2', 'S3'], name='id')
    md_frame = pd.DataFrame(
        {'Subject': ['subject-1', 'subject-1', 'subject-2'],
         'SampleType': ['elbow', 'tongue', 'gut']},
        index=md_index)
    metadata = qiime2.Metadata(md_frame)

    dm = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 3],
         [2, 3, 0]],
        ['S1', 'S2', 'S3'])

    where = "SampleType='tongue' OR SampleType='gut'"
    observed = filter_distance_matrix(dm, metadata, where,
                                      exclude_ids=True)

    expected = skbio.DistanceMatrix([[0]], ['S1'])
    self.assertEqual(observed, expected)
def test_variance_adjusted_normalized(self):
    table_path = self.get_data_path('vaw.biom')
    phylogeny_path = self.get_data_path('vaw.nwk')

    actual = beta_phylogenetic(table=table_path,
                               phylogeny=phylogeny_path,
                               metric='weighted_normalized_unifrac',
                               variance_adjusted=True)

    expected_ids = ('Sample1', 'Sample2', 'Sample3',
                    'Sample4', 'Sample5', 'Sample6')
    expected_values = np.array([
        [0.0000000, 0.4086040, 0.6240185, 0.4639481, 0.2857143, 0.2766318],
        [0.4086040, 0.0000000, 0.3798594, 0.6884992, 0.6807616, 0.4735781],
        [0.6240185, 0.3798594, 0.0000000, 0.7713254, 0.8812897, 0.5047114],
        [0.4639481, 0.6884992, 0.7713254, 0.0000000, 0.6666667, 0.2709298],
        [0.2857143, 0.6807616, 0.8812897, 0.6666667, 0.0000000, 0.4735991],
        [0.2766318, 0.4735781, 0.5047114, 0.2709298, 0.4735991, 0.0000000],
    ])
    expected = skbio.DistanceMatrix(expected_values, ids=expected_ids)

    self.assertEqual(actual.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in actual.ids
                for col_id in actual.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(actual[row_id, col_id],
                                expected[row_id, col_id])
def cscs(features: biom.Table, css_edges: str,
         cosine_threshold: float = 0.6, normalization: bool = True,
         weighted: bool = True) -> skbio.DistanceMatrix:
    """Compute a CSCS-based distance matrix between samples.

    Parameters
    ----------
    features : biom.Table
        Feature table; its observation ids are matched against the first
        two columns of the edges file.
    css_edges : str
        Path to a tab-separated edges file. Lines containing
        ``CLUSTERID1`` are skipped as headers. Columns 0 and 1 hold the two
        observation ids and column 4 the cosine score.
    cosine_threshold : float, optional
        Edges whose cosine score is below this value are stored as 0.
    normalization : bool, optional
        When true, the table is normalized per sample before use.
    weighted : bool, optional
        When false, the table is converted to presence/absence.

    Returns
    -------
    skbio.DistanceMatrix
        ``1 - similarity``, where the similarity comes from
        ``parallel_make_distance_matrix``.
    """
    observationids = {
        obs_id: index
        for index, obs_id in enumerate(features.ids(axis='observation'))
    }

    n_observations = features.shape[0]
    edgesdok = dok_matrix((n_observations, n_observations),
                          dtype=np.float32)

    # Use a context manager so the edges file is closed even if a line
    # fails to parse.
    with open(css_edges, "r") as edges_fh:
        for line in edges_fh:
            if "CLUSTERID1" in line:
                continue
            linesplit = line.split("\t")
            cosine = float(linesplit[4])
            first = observationids[linesplit[0]]
            second = observationids[linesplit[1]]
            if cosine < cosine_threshold:
                # NOTE(review): only one direction is zeroed here, exactly
                # as before; confirm whether the mirrored entry should be
                # cleared as well.
                edgesdok[first, second] = 0.0
            else:
                edgesdok[first, second] = cosine
                edgesdok[second, first] = cosine
    edgesdok.setdiag(1)

    if normalization:
        features = features.norm(axis='sample', inplace=False)
    if not weighted:
        # `pa` is a method: the previous `features.pa` (no call) replaced
        # the table with a bound method. inplace=False keeps the caller's
        # table untouched, matching the norm() call above.
        # TODO: make new option in cscs()
        features = features.pa(inplace=False)

    sample_names = features.ids()
    similarity = parallel_make_distance_matrix(features, edgesdok,
                                               sample_names)
    # presumably a pandas DataFrame indexed by sample id, since `.index`
    # is read below -- confirm against parallel_make_distance_matrix
    distances = 1 - similarity
    return skbio.DistanceMatrix(distances, ids=distances.index)
def test_2nn(self):
    # -- setup -- #
    # 2 nearest neighbors of each sample are
    # f1: s1, s2 (classified as skinny)
    # s1: f1, s2 (closer to f1 so fat)
    # s2: f1, (s1 or s3) (closer to f1 so fat)
    # s3: s1, s2 (skinny)
    sample_ids = ('f1', 's1', 's2', 's3')
    pairwise = [
        [0, 2, 1, 5],
        [2, 0, 3, 4],
        [1, 3, 0, 3],
        [5, 4, 3, 0],
    ]
    distance_matrix = skbio.DistanceMatrix(pairwise, ids=sample_ids)
    dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

    body_mass = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
                          index=sample_ids, name='body_mass')
    body_mass.index.name = 'SampleID'
    metadata = qiime2.CategoricalMetadataColumn(body_mass)

    # -- test -- #
    classify = sample_classifier.actions.classify_samples_from_dist
    results = classify(distance_matrix=dm, metadata=metadata, k=2, cv=3,
                       random_state=123)
    predictions = results[0].view(pd.Series)

    expected = pd.Series(('skinny', 'fat', 'fat', 'skinny'),
                         index=sample_ids)
    self.assertTrue(
        expected.sort_index().equals(predictions.sort_index()))
def test_beta_unweighted(self):
    table_path = self.get_data_path('crawford.biom')
    phylogeny_path = self.get_data_path('crawford.nwk')

    actual = beta_phylogenetic(table=table_path,
                               phylogeny=phylogeny_path,
                               metric='unweighted_unifrac')

    # computed with beta-phylogenetic
    expected_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                    '10084.PC.355', '10084.PC.354', '10084.PC.636',
                    '10084.PC.635', '10084.PC.607', '10084.PC.634')
    expected_distances = np.array([
        0.71836067, 0.71317361, 0.69746044, 0.62587207,
        0.72826674, 0.72065895, 0.72640581, 0.73606053,
        0.70302967, 0.73407301, 0.6548042, 0.71547381,
        0.78397813, 0.72318399, 0.76138933, 0.61041275,
        0.62331299, 0.71848305, 0.70416337, 0.75258475,
        0.79249029, 0.64392779, 0.70052733, 0.69832716,
        0.77818938, 0.72959894, 0.75782689, 0.71005144,
        0.75065046, 0.78944369, 0.63593642, 0.71283615,
        0.58314638, 0.69200762, 0.68972056, 0.71514083])
    expected = skbio.DistanceMatrix(expected_distances, ids=expected_ids)

    self.assertEqual(actual.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in actual.ids
                for col_id in actual.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(actual[row_id, col_id],
                                expected[row_id, col_id])
def test_classify_samples_from_dist(self):
    # -- setup -- #
    # 1,2 are a group, 3,4 are a group
    sample_ids = ('f1', 'f2', 's1', 's2')
    pairwise = [
        [0, 1, 4, 4],
        [1, 0, 4, 4],
        [4, 4, 0, 1],
        [4, 4, 1, 0],
    ]
    distance_matrix = skbio.DistanceMatrix(pairwise, ids=sample_ids)
    dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

    # The labels are indexed by the ids in reverse order.
    reversed_ids = sample_ids[::-1]
    body_mass = pd.Series(('skinny', 'skinny', 'fat', 'fat'),
                          index=reversed_ids, name='body_mass')
    body_mass.index.name = 'SampleID'
    metadata = qiime2.CategoricalMetadataColumn(body_mass)

    # -- test -- #
    classify = sample_classifier.actions.classify_samples_from_dist
    results = classify(distance_matrix=dm, metadata=metadata, k=1, cv=3,
                       random_state=123)
    predictions = results[0].view(pd.Series).sort_values()

    expected = pd.Series(('fat', 'skinny', 'fat', 'skinny'),
                         index=['f1', 's1', 'f2', 's2'])
    not_expected = pd.Series(('fat', 'fat', 'fat', 'skinny'),
                             index=sample_ids)

    # order matters for pd.Series.equals()
    predictions_by_id = predictions.sort_index()
    self.assertTrue(expected.sort_index().equals(predictions_by_id))
    self.assertFalse(
        not_expected.sort_index().equals(predictions.sort_index()))
def test_classify_samples_from_dist_with_group_of_single_item(self):
    # -- setup -- #
    # 1 is a group, 2,3,4 are a group
    sample_ids = ('f1', 's1', 's2', 's3')
    pairwise = [
        [0, 2, 3, 3],
        [2, 0, 1, 1],
        [3, 1, 0, 1],
        [3, 1, 1, 0],
    ]
    distance_matrix = skbio.DistanceMatrix(pairwise, ids=sample_ids)
    dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)

    body_mass = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
                          index=sample_ids, name='body_mass')
    body_mass.index.name = 'SampleID'
    metadata = qiime2.CategoricalMetadataColumn(body_mass)

    # -- test -- #
    classify = sample_classifier.actions.classify_samples_from_dist
    results = classify(distance_matrix=dm, metadata=metadata, k=1, cv=3,
                       random_state=123)
    predictions = results[0].view(pd.Series)

    # Every sample, including the lone 'fat' one, is expected to be
    # predicted as 'skinny'.
    expected = pd.Series(('skinny', 'skinny', 'skinny', 'skinny'),
                         index=sample_ids)
    self.assertTrue(
        expected.sort_index().equals(predictions.sort_index()))
def test_generalized_unifrac_no_alpha(self):
    results = self.beta_phylogenetic(table=self.crawford_table,
                                     phylogeny=self.crawford_tree,
                                     metric='generalized_unifrac',
                                     alpha=None)

    # alpha=1 should be equal to weighted normalized UniFrac
    # NOTE(review): the call passes alpha=None; presumably that falls back
    # to alpha=1 -- confirm.
    expected_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                    '10084.PC.355', '10084.PC.354', '10084.PC.636',
                    '10084.PC.635', '10084.PC.607', '10084.PC.634')
    expected_distances = np.array([
        0.2821874, 0.16148405, 0.20186143, 0.1634832,
        0.40351108, 0.29135056, 0.24790944, 0.41967404,
        0.24642185, 0.22218489, 0.34007547, 0.27722011,
        0.20963881, 0.16897221, 0.3217958, 0.15237816,
        0.16899207, 0.36445044, 0.25408941, 0.23358681,
        0.4069374, 0.24615927, 0.28573888, 0.20578184,
        0.20742006, 0.31249151, 0.46169893, 0.35294595,
        0.32522355, 0.48437103, 0.21534558, 0.30558908,
        0.12091004, 0.19817777, 0.24792853, 0.34293674])
    expected = skbio.DistanceMatrix(expected_distances, ids=expected_ids)

    # The action returns a single output of type DistanceMatrix.
    self.assertEqual(len(results), 1)
    self.assertEqual(repr(results.distance_matrix.type), 'DistanceMatrix')

    observed = results[0].view(skbio.DistanceMatrix)
    self.assertEqual(observed.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in observed.ids
                for col_id in observed.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(observed[row_id, col_id],
                                expected[row_id, col_id])
def test_single_tree_and_label(self):
    # One tree with one label is expected to give a 1x1 matrix holding
    # a single zero.
    only_tree = skbio.TreeNode.read(['(A:0.2, B:1.5, C, (E, F));'])
    trees = [only_tree]
    expected = skbio.DistanceMatrix([[0]], ids=['foo'])

    observed = robinson_foulds(trees, labels=['foo'])

    self.assertEqual(observed, expected)
def test_anosim_pairwise(self):
    sample_ids = ['sample1', 'sample2', 'sample3']
    dm = skbio.DistanceMatrix(
        [[0.00, 0.25, 0.25],
         [0.25, 0.00, 0.00],
         [0.25, 0.00, 0.00]],
        ids=sample_ids)
    md = qiime2.CategoricalMetadataColumn(
        pd.Series(['a', 'b', 'b'], name='a or b',
                  index=pd.Index(sample_ids, name='id')))

    with tempfile.TemporaryDirectory() as output_dir:
        beta_group_significance(output_dir, dm, md, method='anosim',
                                permutations=42, pairwise=True)

        index_fp = os.path.join(output_dir, 'index.html')
        self.assertTrue(os.path.exists(index_fp))

        # all expected boxplots are generated
        for group in ('a', 'b'):
            for extension in ('pdf', 'png'):
                boxplot_fp = os.path.join(
                    output_dir, '%s-boxplots.%s' % (group, extension))
                self.assertTrue(os.path.exists(boxplot_fp), boxplot_fp)

        # no extra boxplots are generated
        self.assertEqual(len(glob.glob('%s/*-boxplots.pdf' % output_dir)),
                         2)
        self.assertEqual(len(glob.glob('%s/*-boxplots.png' % output_dir)),
                         2)

        # Read the report once through a context manager so the handle is
        # closed, instead of opening the file anew for every assertion.
        with open(index_fp) as index_fh:
            index_html = index_fh.read()

        self.assertIn('ANOSIM results', index_html)
        # The permutation count passed above should appear in the report.
        self.assertIn('<td>42</td>', index_html)
        self.assertNotIn('Warning', index_html)
        self.assertIn('Pairwise anosim', index_html)
def test_generalized_unifrac(self):
    table = Artifact.import_data('FeatureTable[Frequency]',
                                 self.get_data_path('vaw.biom'))
    phylogeny = Artifact.import_data('Phylogeny[Rooted]',
                                     self.get_data_path('vaw.nwk'))

    results = self.beta_phylogenetic(table=table,
                                     phylogeny=phylogeny,
                                     metric='generalized_unifrac',
                                     alpha=0.5)

    expected_ids = ('Sample1', 'Sample2', 'Sample3',
                    'Sample4', 'Sample5', 'Sample6')
    expected_values = np.array([
        [0.0000000, 0.4040518, 0.6285560, 0.5869439, 0.4082483, 0.2995673],
        [0.4040518, 0.0000000, 0.4160597, 0.7071068, 0.7302479, 0.4860856],
        [0.6285560, 0.4160597, 0.0000000, 0.8005220, 0.9073159, 0.5218198],
        [0.5869439, 0.7071068, 0.8005220, 0.0000000, 0.4117216, 0.3485667],
        [0.4082483, 0.7302479, 0.9073159, 0.4117216, 0.0000000, 0.6188282],
        [0.2995673, 0.4860856, 0.5218198, 0.3485667, 0.6188282, 0.0000000],
    ])
    expected = skbio.DistanceMatrix(expected_values, ids=expected_ids)

    # The action returns a single output of type DistanceMatrix.
    self.assertEqual(len(results), 1)
    self.assertEqual(repr(results.distance_matrix.type), 'DistanceMatrix')

    observed = results[0].view(skbio.DistanceMatrix)
    self.assertEqual(observed.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in observed.ids
                for col_id in observed.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(observed[row_id, col_id],
                                expected[row_id, col_id])
def test_variance_adjusted_normalized(self):
    table = Artifact.import_data('FeatureTable[Frequency]',
                                 self.get_data_path('vaw.biom'))
    phylogeny = Artifact.import_data('Phylogeny[Rooted]',
                                     self.get_data_path('vaw.nwk'))

    results = self.beta_phylogenetic(table=table,
                                     phylogeny=phylogeny,
                                     metric='weighted_normalized_unifrac',
                                     variance_adjusted=True)

    expected_ids = ('Sample1', 'Sample2', 'Sample3',
                    'Sample4', 'Sample5', 'Sample6')
    expected_values = np.array([
        [0.0000000, 0.4086040, 0.6240185, 0.4639481, 0.2857143, 0.2766318],
        [0.4086040, 0.0000000, 0.3798594, 0.6884992, 0.6807616, 0.4735781],
        [0.6240185, 0.3798594, 0.0000000, 0.7713254, 0.8812897, 0.5047114],
        [0.4639481, 0.6884992, 0.7713254, 0.0000000, 0.6666667, 0.2709298],
        [0.2857143, 0.6807616, 0.8812897, 0.6666667, 0.0000000, 0.4735991],
        [0.2766318, 0.4735781, 0.5047114, 0.2709298, 0.4735991, 0.0000000],
    ])
    expected = skbio.DistanceMatrix(expected_values, ids=expected_ids)

    # The action returns a single output of type DistanceMatrix.
    self.assertEqual(len(results), 1)
    self.assertEqual(repr(results.distance_matrix.type), 'DistanceMatrix')

    observed = results[0].view(skbio.DistanceMatrix)
    self.assertEqual(observed.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in observed.ids
                for col_id in observed.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(observed[row_id, col_id],
                                expected[row_id, col_id])
def test_beta_weighted(self):
    results = self.beta_phylogenetic(table=self.crawford_table,
                                     phylogeny=self.crawford_tree,
                                     metric='weighted_unifrac')

    # computed with beta-phylogenetic (weighted_unifrac)
    expected_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                    '10084.PC.355', '10084.PC.354', '10084.PC.636',
                    '10084.PC.635', '10084.PC.607', '10084.PC.634')
    expected_distances = np.array([
        0.44656238, 0.23771096, 0.30489123, 0.23446002,
        0.65723575, 0.44911772, 0.381904, 0.69144829,
        0.39611776, 0.36568012, 0.53377975, 0.48908025,
        0.35155196, 0.28318669, 0.57376916, 0.23395746,
        0.24658122, 0.60271637, 0.39802552, 0.36567394,
        0.68062701, 0.36862049, 0.48350632, 0.33024631,
        0.33266697, 0.53464744, 0.74605075, 0.53951035,
        0.49680733, 0.79178838, 0.37109012, 0.52629343,
        0.22118218, 0.32400805, 0.43189708, 0.59705893])
    expected = skbio.DistanceMatrix(expected_distances, ids=expected_ids)

    # The action returns a single output of type DistanceMatrix.
    self.assertEqual(len(results), 1)
    self.assertEqual(repr(results.distance_matrix.type), 'DistanceMatrix')

    observed = results[0].view(skbio.DistanceMatrix)
    self.assertEqual(observed.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in observed.ids
                for col_id in observed.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(observed[row_id, col_id],
                                expected[row_id, col_id])
def test_beta_unweighted_parallel(self):
    table = Artifact.import_data('FeatureTable[Frequency]',
                                 self.get_data_path('crawford.biom'))
    phylogeny = Artifact.import_data('Phylogeny[Rooted]',
                                     self.get_data_path('crawford.nwk'))

    # Same metric as the single-threaded test, requested with threads=2.
    results = self.beta_phylogenetic(table=table,
                                     phylogeny=phylogeny,
                                     metric='unweighted_unifrac',
                                     threads=2)

    # computed with beta-phylogenetic
    expected_ids = ('10084.PC.481', '10084.PC.593', '10084.PC.356',
                    '10084.PC.355', '10084.PC.354', '10084.PC.636',
                    '10084.PC.635', '10084.PC.607', '10084.PC.634')
    expected_distances = np.array([
        0.71836067, 0.71317361, 0.69746044, 0.62587207,
        0.72826674, 0.72065895, 0.72640581, 0.73606053,
        0.70302967, 0.73407301, 0.6548042, 0.71547381,
        0.78397813, 0.72318399, 0.76138933, 0.61041275,
        0.62331299, 0.71848305, 0.70416337, 0.75258475,
        0.79249029, 0.64392779, 0.70052733, 0.69832716,
        0.77818938, 0.72959894, 0.75782689, 0.71005144,
        0.75065046, 0.78944369, 0.63593642, 0.71283615,
        0.58314638, 0.69200762, 0.68972056, 0.71514083])
    expected = skbio.DistanceMatrix(expected_distances, ids=expected_ids)

    # The action returns a single output of type DistanceMatrix.
    self.assertEqual(len(results), 1)
    self.assertEqual(repr(results.distance_matrix.type), 'DistanceMatrix')

    observed = results[0].view(skbio.DistanceMatrix)
    self.assertEqual(observed.ids, expected.ids)
    # Compare every (row, column) entry, looked up by id.
    id_pairs = [(row_id, col_id)
                for row_id in observed.ids
                for col_id in observed.ids]
    for row_id, col_id in id_pairs:
        npt.assert_almost_equal(observed[row_id, col_id],
                                expected[row_id, col_id])
def test_metadata_distance_int(self):
    # Integer values 1, 2, 3: the expected distances are their absolute
    # differences.
    sample_ids = ['sample1', 'sample2', 'sample3']
    md = pd.Series([1, 2, 3], name='number', index=sample_ids)

    exp = skbio.DistanceMatrix(
        [[0, 1, 2],
         [1, 0, 1],
         [2, 1, 0]],
        ids=sample_ids)

    obs = _metadata_distance(md)

    self.assertEqual(exp, obs)
def test_one_sample(self):
    # A single numeric value is expected to give a 1x1 zero matrix.
    single_value = pd.Series([1.5], name='number', index=['sample1'])
    md = qiime2.MetadataCategory(single_value)

    exp = skbio.DistanceMatrix([[0.0]], ids=['sample1'])

    obs = distance_matrix(md)

    self.assertEqual(exp, obs)