def generate_biom_table(seqs_fp, uc_fp, delim='_'):
    """Build a BIOM table from deblur output and a dereplication map

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    uc_fp: string
        file path to dereplicated sequences map (.uc format)
    delim: string, optional
        delimiter for splitting sample and sequence IDs in sequence label
        default: '_'

    Returns
    -------
    deblur_clusters: dictionary
        dictionary of clusters including dereplicated sequence labels
    Table: biom.table
        an instance of a BIOM table
    """
    # parse clusters in dereplicated sequences map (.uc format).
    # Plain text mode is used on purpose: the legacy 'U' flag is rejected
    # with ValueError on Python >= 3.11, and text mode already translates
    # newlines universally on Python 3.
    with open(uc_fp, 'r') as uc_f:
        # only the clusters are needed here; failures and seeds are unused
        derep_clusters, _failures, _seeds = clusters_from_uc_file(uc_f)

    # parse clusters in deblur file, set observation ID to be the sequence
    deblur_clusters = parse_deblur_output(seqs_fp, derep_clusters)

    # create sparse dictionary of observation and sample ID counts
    data, otu_ids, sample_ids = generate_biom_data(deblur_clusters, delim)

    # build BIOM table, stamped with the local time of creation
    table = Table(data, otu_ids, sample_ids,
                  observation_metadata=None,
                  sample_metadata=None,
                  table_id=None,
                  generated_by="deblur",
                  create_date=datetime.now().isoformat())
    return deblur_clusters, table
def test_clusters_from_uc_file(self):
    """ clusters_from_uc_file returns the expected triple for uc_lines1 """
    observed = clusters_from_uc_file(self.uc_lines1)

    # expected result, in the order (clusters, failures, new seeds)
    wanted = (
        {'s2': ['s2', 's3']},
        ['s1'],
        ['s2'],
    )
    self.assertEqual(observed, wanted)
def test_clusters_from_uc_file_multiple_hits(self):
    """ clusters_from_uc_file honors the error_on_multiple_hits flag """
    uc_lines = self.uc_lines_w_multiple_hits_per_query

    # strict mode: a query with more than one hit must raise
    with self.assertRaises(UclustParseError):
        clusters_from_uc_file(uc_lines, error_on_multiple_hits=True)

    # lenient mode: the query is listed under every cluster it hit
    observed = clusters_from_uc_file(uc_lines,
                                     error_on_multiple_hits=False)

    # expected result, in the order (clusters, failures, new seeds)
    wanted = (
        {'s2': ['s2', 's3'], 's4': ['s4', 's3']},
        ['s1'],
        ['s2', 's4'],
    )
    self.assertEqual(observed, wanted)