Пример #1
0
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, silva_ssu_ref,
            silva_lsu_ref, ssu_blast_table, lsu_blast_table, output_dir):
        """Build tables relating SILVA accessions to GTDB taxonomy from SSU and LSU BLAST results.

        The merged bacterial and archaeal GTDB taxonomy and the per-sequence
        SILVA annotations are handed to _parse_blast_table() once for the SSU
        BLAST table and once for the LSU BLAST table. The table paths passed
        along are 'ssu_silva.tsv' and 'lsu_silva.tsv' inside output_dir, which
        is created if it does not exist.
        """

        def read_silva_annotations(ref_file):
            """Map each sequence ID in a SILVA reference file to its annotation."""
            return {
                seq_id: annotation
                for seq_id, _seq, annotation in seq_io.read_seq(
                    ref_file, keep_annotation=True)
            }

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # GTDB taxonomy: archaeal entries are layered over a copy of the
        # bacterial ones so a single lookup covers both domains
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        num_bac_genomes = len(gtdb_bac_taxonomy)
        num_ar_genomes = len(gtdb_ar_taxonomy)
        num_genomes = len(gtdb_taxonomy)
        print('Identified %d bacterial genomes to process.' % num_bac_genomes)
        print('Identified %d archaeal genomes to process.' % num_ar_genomes)
        print('Identified %d genomes to process.' % num_genomes)

        # SILVA taxonomy for both rRNA reference sets
        print('Reading SILVA 16S and 23S rRNA taxonomies.')
        silva_ssu_taxonomy = read_silva_annotations(silva_ssu_ref)
        silva_lsu_taxonomy = read_silva_annotations(silva_lsu_ref)

        # BLAST tables: SSU first, then LSU
        print('Parsing BLAST tables.')

        self._parse_blast_table(
            ssu_blast_table,
            gtdb_taxonomy,
            silva_ssu_taxonomy,
            self.min_ssu_len,
            os.path.join(output_dir, 'ssu_silva.tsv'))

        self._parse_blast_table(
            lsu_blast_table,
            gtdb_taxonomy,
            silva_lsu_taxonomy,
            self.min_lsu_len,
            os.path.join(output_dir, 'lsu_silva.tsv'))
Пример #2
0
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file,
            gtdb_path_file, gtdb_metadata_file, output_dir):
        """Create FASTA files with all 16S and 23S rRNA sequences from GTDB genomes.

        Writes 'ssu.fna' (16S) and 'lsu.fna' (23S) into output_dir, creating
        the directory if needed. Every record header has the form
        '>{genome ID}~{sequence ID} [ssu={gene length} bp] [contig={contig length} bp]'.

        The 16S and 23S genes of a genome are handled independently: a genome
        without a 16S sequence file is still searched for 23S genes, so both
        output files and both "missing" statistics cover every genome that has
        a genome path.

        Parameters
        ----------
        gtdb_bac_taxonomy_file : str
            Bacterial taxonomy file, read with Taxonomy().read().
        gtdb_ar_taxonomy_file : str
            Archaeal taxonomy file, read with Taxonomy().read().
        gtdb_path_file : str
            Tab-separated file with exactly two columns per line: genome ID
            and genome directory.
        gtdb_metadata_file : str
            Tab-separated file with a header line; column 0 is the genome ID
            and column 1 the organism name.
        output_dir : str
            Directory receiving 'ssu.fna' and 'lsu.fna'.

        Raises
        ------
        KeyError
            If a sequence in a genome's FASTA file has no entry in the
            matching '.hmm_summary.tsv' file.
        ValueError
            If a '.hmm_summary.tsv' header lacks a 'Sequence length' column or
            a line of gtdb_path_file does not have exactly two fields.
        """

        def write_rrna_seqs(gid, genome_path, file_prefix, fout):
            """Append a genome's rRNA genes to fout; return False if it has no sequence file."""
            rna_dir = os.path.join(genome_path, 'rna_silva')
            seq_file = os.path.join(rna_dir, file_prefix + '.fna')
            if not os.path.exists(seq_file):
                return False

            # contig length of each gene, keyed by the ID in the first column
            contig_lens = {}
            info_file = os.path.join(rna_dir, file_prefix + '.hmm_summary.tsv')
            with open(info_file) as f:
                header = f.readline().strip().split('\t')
                contig_len_index = header.index('Sequence length')

                for line in f:
                    line_split = line.strip().split('\t')
                    contig_lens[line_split[0]] = int(
                        line_split[contig_len_index])

            for seq_id, seq in seq_io.read_seq(seq_file):
                # NOTE(review): the length label reads 'ssu' for 23S records
                # as well; it is kept unchanged because code parsing these
                # headers may rely on it -- confirm before renaming.
                fout.write('>%s~%s [ssu=%d bp] [contig=%d bp]\n' %
                           (gid, seq_id, len(seq), contig_lens[seq_id]))
                fout.write('%s\n' % seq)

            return True

        # get User ID to UBA translation
        print('Reading GTDB metadata to translate User IDs to UBA IDs.')
        user_id_to_uba = {}
        with open(gtdb_metadata_file) as f:
            f.readline()  # skip header line

            for line in f:
                line_split = line.strip().split('\t')
                gid = line_split[0]
                org_name = line_split[1]
                if '(UBA' in org_name:
                    # UBA ID is the text inside the last parenthesised group
                    uba_id = org_name.split('(')[-1].replace(')', '')
                    user_id_to_uba[gid] = uba_id

        # read GTDB taxonomy
        print('Reading GTDB taxonomy.')
        gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
        gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
        gtdb_taxonomy = gtdb_bac_taxonomy.copy()
        gtdb_taxonomy.update(gtdb_ar_taxonomy)

        print('Identified %d bacterial genomes to process.' %
              len(gtdb_bac_taxonomy))
        print('Identified %d archaeal genomes to process.' %
              len(gtdb_ar_taxonomy))
        print('Identified %d genomes to process.' % len(gtdb_taxonomy))

        # read genome paths, keyed by UBA ID where a translation exists
        print('Reading path to genomes.')
        genome_paths = {}
        with open(gtdb_path_file) as f:
            for line in f:
                gid, gid_path = line.strip().split('\t')
                gid = user_id_to_uba.get(gid, gid)
                genome_paths[gid] = gid_path

        # sanity check data
        missing_paths = set(gtdb_taxonomy) - set(genome_paths)
        if missing_paths:
            print(
                '[WARNING] There are %d genomes in the taxonomy file without a specified genome path.'
                % len(missing_paths))

        # create FASTA file with 16S and 23S rRNA sequence files
        print('Parsing 16S and 23S rRNA sequence files.')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        ssu_fasta = os.path.join(output_dir, 'ssu.fna')
        lsu_fasta = os.path.join(output_dir, 'lsu.fna')
        missing_ssu = 0
        missing_lsu = 0

        # context managers guarantee both outputs are flushed and closed even
        # if a genome's files turn out to be malformed
        with open(ssu_fasta, 'w') as fout_16S, open(lsu_fasta, 'w') as fout_23S:
            for i, gid in enumerate(gtdb_taxonomy):
                if i % 1000 == 0:
                    print('Processed %d genomes.' % i)

                if gid not in genome_paths:
                    print(
                        '[WARNING] Genome %s does not have a specified genome path.'
                        % gid)
                    continue

                genome_path = genome_paths[gid]

                # a missing 16S file must not prevent the 23S genes of the
                # same genome from being extracted and counted
                if not write_rrna_seqs(gid, genome_path, 'ssu', fout_16S):
                    missing_ssu += 1

                if not write_rrna_seqs(gid, genome_path, 'lsu_23S', fout_23S):
                    missing_lsu += 1

        # report 0% rather than dividing by zero when no genomes were read
        num_genomes = len(gtdb_taxonomy)
        missing_ssu_perc = missing_ssu * 100.0 / num_genomes if num_genomes else 0.0
        missing_lsu_perc = missing_lsu * 100.0 / num_genomes if num_genomes else 0.0

        print(
            'There were %d of %d (%.2f%%) genomes without an identifier 16S rRNA gene.'
            % (missing_ssu, num_genomes, missing_ssu_perc))

        print(
            'There were %d of %d (%.2f%%) genomes without an identifier 23S rRNA gene.'
            % (missing_lsu, num_genomes, missing_lsu_perc))
Пример #3
0
    def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file,
            ssu_silva_table, lsu_silva_table, ssu_info_file, lsu_info_file,
            output_dir):
        """Parse tables with SILVA assignments to identify potentially erroneous 16S and 23S rRNA genes in GTDB genomes.

        Writes 'silva_incongruence_test.tsv' into output_dir (created if
        needed). After the header row, the file handle is passed to three
        tests that report into it: multiple 16S genes within a genome,
        multiple 23S genes within a genome, and 16S versus 23S genes.

        Parameters
        ----------
        gtdb_bac_taxonomy_file : str
            Bacterial taxonomy file, read with Taxonomy().read().
        gtdb_ar_taxonomy_file : str
            Archaeal taxonomy file, read with Taxonomy().read().
        ssu_silva_table : str
            Table of SILVA assignments for 16S genes, passed to the tests.
        lsu_silva_table : str
            Table of SILVA assignments for 23S genes, passed to the tests.
        ssu_info_file : str
            Tab-separated file with a header containing 'SSU gene length' and
            'Sequence length' columns describing the 16S genes.
        lsu_info_file : str
            Same layout as ssu_info_file, describing the 23S genes.
        output_dir : str
            Directory receiving the report.

        Raises
        ------
        ValueError
            If an info file header lacks one of the required columns or a
            length field is not an integer.
        """

        def read_gene_info(info_file):
            """Map '<genome part of column 0>~<column 1>' to (rRNA length, contig length)."""
            gene_info = {}
            with open(info_file) as f:
                header = f.readline().strip().split('\t')
                # NOTE(review): the same 'SSU gene length' column name is
                # looked up for the LSU info file too, exactly as before --
                # confirm that the LSU file really uses this header.
                rna_length_index = header.index('SSU gene length')
                contig_len_index = header.index('Sequence length')

                for line in f:
                    line_split = line.strip().split('\t')

                    # rebuild the gene ID from the text before the first '~'
                    # in column 0 and the contig ID in column 1
                    gene_id = line_split[0].split('~')[0] + '~' + line_split[1]

                    rna_length = int(line_split[rna_length_index])
                    contig_length = int(line_split[contig_len_index])
                    gene_info[gene_id] = (rna_length, contig_length)

            return gene_info

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        header_columns = [
            'Genome ID', 'Test', 'Incongruent rank',
            'SILVA taxon A', 'SILVA taxon B',
            'SILVA taxonomy A', 'SILVA taxonomy B',
            'GTDB taxonomy', 'Note',
            'In reference tree A', 'In reference tree B',
            'Gene ID A', 'rRNA length', 'Contig length',
            'Gene ID B', 'rRNA length', 'Contig length',
        ]

        # the context manager closes the report even if reading an input or
        # running one of the tests raises
        report_file = os.path.join(output_dir, 'silva_incongruence_test.tsv')
        with open(report_file, 'w') as fout:
            fout.write('\t'.join(header_columns) + '\n')

            # read GTDB taxonomy
            print('Reading GTDB taxonomy.')
            gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
            gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
            gtdb_taxonomy = gtdb_bac_taxonomy.copy()
            gtdb_taxonomy.update(gtdb_ar_taxonomy)

            # read genomes in SSU and LSU trees
            print('Reading genomes in 16S and 23S gene trees.')
            ssu_ref = read_gene_info(ssu_info_file)
            lsu_ref = read_gene_info(lsu_info_file)

            # run tests to find potentially incongruent 16S or 23S rRNA genes
            print(
                'Performing tests to identify potentially incongruent 16S or 23S rRNA genes.'
            )
            self._multigene_silva_assignment_test(
                ssu_silva_table, gtdb_taxonomy, ssu_ref, 'SSU',
                'Genome has multiple 16S rRNA genes with incongrent SILVA assignments.',
                fout)

            self._multigene_silva_assignment_test(
                lsu_silva_table, gtdb_taxonomy, lsu_ref, 'LSU',
                'Genome has multiple 23S rRNA genes with incongrent SILVA assignments.',
                fout)

            self._ssu_lsu_silva_assignment_test(
                ssu_silva_table, lsu_silva_table, gtdb_taxonomy, ssu_ref,
                lsu_ref, 'SSU/LSU',
                'Genome has a 16S and 23S rRNA gene with incongruent SILVA assignments.',
                fout)