Пример #1
0
def generate_biom_table(seqs_fp,
                        uc_fp,
                        delim='_'):
    """Generate a BIOM table from deblurred sequences and a .uc map

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    uc_fp: string
        file path to dereplicated sequences map (.uc format)
    delim: string, optional
        delimiter for splitting sample and sequence IDs in sequence label
        default: '_'

    Returns
    -------
    deblur_clusters: dictionary
        dictionary of clusters including dereplicated sequence labels
        (the value returned by parse_deblur_output)
    Table: biom.table
        an instance of a BIOM table, generated_by "deblur" and stamped
        with the current local time as its creation date
    """
    # parse clusters in dereplicated sequences map (.uc format).
    # The file is opened in default text mode: the legacy 'U' flag was
    # removed in Python 3.11 (open() raises ValueError for it), and default
    # text mode already performs universal-newline translation.
    with open(uc_fp) as uc_f:
        # only the clusters are needed here; failures and seeds are unused
        derep_clusters, _failures, _seeds = clusters_from_uc_file(uc_f)
    # parse clusters in deblur file, set observation ID to be the sequence
    deblur_clusters = parse_deblur_output(seqs_fp, derep_clusters)
    # create sparse dictionary of observation and sample ID counts
    data, otu_ids, sample_ids = generate_biom_data(deblur_clusters, delim)
    # build BIOM table
    return deblur_clusters, Table(data, otu_ids, sample_ids,
                                  observation_metadata=None,
                                  sample_metadata=None, table_id=None,
                                  generated_by="deblur",
                                  create_date=datetime.now().isoformat())
Пример #2
0
    def test_clusters_from_uc_file(self):
        """clusters_from_uc_file returns (clusters, failures, new seeds)"""
        # expected result for the self.uc_lines1 fixture, in return order
        expected = (
            {'s2': ['s2', 's3']},  # clusters keyed by seed ID
            ['s1'],                # failures
            ['s2'],                # new seeds
        )
        observed = clusters_from_uc_file(self.uc_lines1)
        self.assertEqual(observed, expected)
Пример #3
0
    def test_clusters_from_uc_file_multiple_hits(self):
        """clusters_from_uc_file honours the error_on_multiple_hits flag"""
        uc_lines = self.uc_lines_w_multiple_hits_per_query

        # strict mode: a query with more than one hit must raise
        with self.assertRaises(UclustParseError):
            clusters_from_uc_file(uc_lines, error_on_multiple_hits=True)

        # lenient mode: the query is listed in every cluster it hit
        observed = clusters_from_uc_file(uc_lines,
                                         error_on_multiple_hits=False)
        expected = (
            {'s2': ['s2', 's3'], 's4': ['s4', 's3']},  # clusters
            ['s1'],                                    # failures
            ['s2', 's4'],                              # new seeds
        )
        self.assertEqual(observed, expected)
Пример #4
0
def generate_biom_table(seqs_fp, uc_fp, delim='_'):
    """Generate a BIOM table from deblurred sequences and a .uc map

    Parameters
    ----------
    seqs_fp: string
        file path to deblurred sequences
    uc_fp: string
        file path to dereplicated sequences map (.uc format)
    delim: string, optional
        delimiter for splitting sample and sequence IDs in sequence label
        default: '_'

    Returns
    -------
    deblur_clusters: dictionary
        dictionary of clusters including dereplicated sequence labels
        (the value returned by parse_deblur_output)
    Table: biom.table
        an instance of a BIOM table, generated_by "deblur" and stamped
        with the current local time as its creation date
    """
    # parse clusters in dereplicated sequences map (.uc format).
    # Default text mode is used instead of the legacy 'U' flag: 'U' was
    # removed in Python 3.11 (open() raises ValueError for it) and default
    # text mode already performs universal-newline translation.
    with open(uc_fp) as uc_f:
        # only the clusters are needed here; failures and seeds are unused
        derep_clusters, _failures, _seeds = clusters_from_uc_file(uc_f)
    # parse clusters in deblur file, set observation ID to be the sequence
    deblur_clusters = parse_deblur_output(seqs_fp, derep_clusters)
    # create sparse dictionary of observation and sample ID counts
    data, otu_ids, sample_ids = generate_biom_data(deblur_clusters, delim)
    # build BIOM table
    return deblur_clusters, Table(data,
                                  otu_ids,
                                  sample_ids,
                                  observation_metadata=None,
                                  sample_metadata=None,
                                  table_id=None,
                                  generated_by="deblur",
                                  create_date=datetime.now().isoformat())