def test_97_percent_clustering_feature4_most_abundant(self):
    # feature4 carries the largest counts in this table. The expected
    # result keeps feature4 as the surviving identifier, with a row equal to
    # the column-wise sum of the feature1, feature3 and feature4 input rows,
    # while feature2 is passed through unchanged.
    sample_ids = ['sample1', 'sample2', 'sample3']
    input_counts = np.array([[4, 5, 6],
                             [1, 1, 2],
                             [7, 8, 9],
                             [100, 101, 103]])
    input_table = biom.Table(
        input_counts,
        ['feature1', 'feature2', 'feature3', 'feature4'],
        sample_ids)
    exp_table = biom.Table(np.array([[111, 114, 118], [1, 1, 2]]),
                           ['feature4', 'feature2'],
                           sample_ids)
    # sequences are reverse-sorted by abundance in output
    # (presumably entries 3 and 1 are the feature4 and feature2 records --
    # confirm against the fixture that builds input_sequences_list)
    exp_seqs = [self.input_sequences_list[i] for i in (3, 1)]

    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = cluster_features_de_novo(
            sequences=self.input_sequences, table=input_table,
            perc_identity=0.97)

    # order of identifiers is important for biom.Table equality, so align
    # the observed rows with the expected ones before comparing
    exp_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(exp_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_seqs = _read_seqs(obs_sequences)
    self.assertEqual(obs_seqs, exp_seqs)
def test_extra_features_in_sequences(self):
    # The table only lists feature1..feature3; the ValueError message is
    # expected to name feature4, i.e. a feature that is not in the table
    # (presumably it comes from self.input_sequences -- confirm against
    # the fixture).
    input_table = biom.Table(np.array([[0, 1, 3], [1, 1, 2], [4, 5, 6]]),
                             ['feature1', 'feature2', 'feature3'],
                             ['sample1', 'sample2', 'sample3'])
    with self.assertRaisesRegex(ValueError,
                                expected_regex='Feature feature4 is pre'):
        # The call is expected to raise, so its return value is
        # deliberately not bound to any name.
        cluster_features_de_novo(
            sequences=self.input_sequences, table=input_table,
            perc_identity=1.0)
def test_no_overlapping_feature_ids(self):
    # The table uses ids f1..f5, none of which match the 'featureN' naming
    # the expected error refers to; the ValueError message is expected to
    # name feature1 (presumably the first id found in self.input_sequences
    # -- confirm against the fixture).
    input_table = biom.Table(
        np.array([[0, 1, 3],
                  [1, 1, 2],
                  [4, 5, 6],
                  [7, 8, 9],
                  [1, 1, 1]]),
        ['f1', 'f2', 'f3', 'f4', 'f5'],
        ['sample1', 'sample2', 'sample3'])
    with self.assertRaisesRegex(ValueError,
                                expected_regex='Feature feature1 is pre'):
        # The call is expected to raise, so its return value is
        # deliberately not bound to any name.
        cluster_features_de_novo(
            sequences=self.input_sequences, table=input_table,
            perc_identity=1.0)
def test_no_clustering(self):
    # With perc_identity=1.0 the output table is expected to equal the
    # fixture table, i.e. no features are merged.
    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = cluster_features_de_novo(
            sequences=self.input_sequences, table=self.input_table,
            perc_identity=1.0)

    # order of identifiers is important for biom.Table equality, so align
    # the observed rows with the fixture table before comparing
    fixture_order = self.input_table.ids(axis='observation')
    obs_table = obs_table.sort_order(fixture_order, axis='observation')
    self.assertEqual(obs_table, self.input_table)

    # sequences are reverse-sorted by abundance in output
    exp_seqs = [self.input_sequences_list[i] for i in (0, 3, 2, 1)]
    obs_seqs = _read_seqs(obs_sequences)
    self.assertEqual(obs_seqs, exp_seqs)
def test_1_percent_clustering(self):
    # At perc_identity=0.01 the expected table has a single row under the
    # id 'feature1'.
    merged_counts = np.array([[112, 115, 120]])
    exp_table = biom.Table(merged_counts,
                           ['feature1'],
                           ['sample1', 'sample2', 'sample3'])
    # sequences are reverse-sorted by abundance in output; only the first
    # fixture sequence is expected to remain
    exp_seqs = [self.input_sequences_list[0]]

    with redirected_stdio(stderr=os.devnull):
        obs_table, obs_sequences = cluster_features_de_novo(
            sequences=self.input_sequences, table=self.input_table,
            perc_identity=0.01)

    # order of identifiers is important for biom.Table equality
    exp_order = exp_table.ids(axis='observation')
    obs_table = obs_table.sort_order(exp_order, axis='observation')
    self.assertEqual(obs_table, exp_table)

    obs_seqs = _read_seqs(obs_sequences)
    self.assertEqual(obs_seqs, exp_seqs)
def test_short_sequences(self): input_sequences_fp = self.get_data_path('dna-sequences-short.fasta') input_sequences = DNAFASTAFormat(input_sequences_fp, mode='r') input_table = biom.Table(np.array([[0, 1, 3], [1, 1, 2]]), ['feature1', 'feature2'], ['sample1', 'sample2', 'sample3']) exp_table = biom.Table(np.array([[1, 2, 5]]), ['feature1'], ['sample1', 'sample2', 'sample3']) with redirected_stdio(stderr=os.devnull): obs_table, obs_sequences = cluster_features_de_novo( sequences=input_sequences, table=input_table, perc_identity=0.01) # order of identifiers is important for biom.Table equality obs_table = \ obs_table.sort_order(exp_table.ids(axis='observation'), axis='observation')