def test_write(self):
        """Check that write() emits one tab-separated line per added genome."""
        tln = TlnTableSummaryFile(self.dir_tmp, 'tst')
        tln.add_genome('a', 4)
        tln.add_genome('b', 11)
        tln.write()

        # Line order is not part of the assertion, so collect the lines
        # straight into a set instead of appending through a comprehension.
        with open(tln.path) as fh:
            lines = set(fh)
        self.assertSetEqual({'a\t4\n', 'b\t11\n'}, lines)
# Example #2
# 0
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Summarise the marker genes identified for every genome.

        For each genome in ``gene_dict`` the Pfam and TIGRFAM top hit files
        are read and registered with the AR122 and BAC120 copy number
        summaries, and the genome's best translation table is recorded.
        The three summaries are then written and symlinked into ``outdir``.
        """
        marker_dir = os.path.join(outdir, DIR_MARKER_GENE)

        # Summary files that accumulate one entry per genome.
        tln_summary = TlnTableSummaryFile(outdir, prefix)
        ar122_summary = CopyNumberFileAR122(outdir, prefix)
        bac120_summary = CopyNumberFileBAC120(outdir, prefix)

        # Visit the genomes in sorted order.
        for genome_id, genome_info in sorted(gene_dict.items()):
            pfam_hits = TopHitPfamFile(marker_dir, genome_id)
            tigr_hits = TopHitTigrFile(marker_dir, genome_id)
            pfam_hits.read()
            tigr_hits.read()

            # Register this genome's markers with both domain summaries.
            aa_gene_path = genome_info.get("aa_gene_path")
            for copy_number_summary in (ar122_summary, bac120_summary):
                copy_number_summary.add_genome(genome_id, aa_gene_path,
                                               pfam_hits, tigr_hits)

            # Record the best translation table for this genome.
            tln_summary.add_genome(
                genome_id, genome_info.get("best_translation_table"))

        # Flush every summary to disk.
        for summary in (ar122_summary, bac120_summary, tln_summary):
            summary.write()

        # Symlink each summary file into the root of the output directory.
        for path_template in (PATH_BAC120_MARKER_SUMMARY,
                              PATH_AR122_MARKER_SUMMARY,
                              PATH_TLN_TABLE_SUMMARY):
            summary_path = path_template.format(prefix=prefix)
            link_path = os.path.join(outdir, os.path.basename(summary_path))
            symlink_f(summary_path, link_path)
# Example #3
# 0
# File: markers.py  Project: alienzj/GTDBTk
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix,
                                        write_single_copy_genes):
        """Report statistics for identified marker genes.

        Reads the Pfam/TIGRFAM top hit files of every genome, writes the
        AR53 and BAC120 copy number summaries and the translation table
        summary, symlinks the ``PATH_FAILS`` file into ``outdir`` and,
        when requested, writes one FASTA file per marker with the
        single-copy hits of each genome.

        Parameters
        ----------
        gene_dict
            Mapping of genome id to a per-genome mapping that is queried
            for ``aa_gene_path`` and ``best_translation_table``.
        outdir
            Directory the summary files, symlink and FASTA files go to.
        prefix
            Prefix passed to the summary files and path templates.
        write_single_copy_genes
            If truthy, also write the unaligned single-copy marker genes.
        """

        # Summarise the copy number of each AR53 and BAC120 markers.
        tln_summary_file = TlnTableSummaryFile(outdir, prefix)
        ar53_copy_number_file = CopyNumberFileAR53(outdir, prefix)
        bac120_copy_number_file = CopyNumberFileBAC120(outdir, prefix)

        # Process each genome.
        for db_genome_id, info in tqdm_log(sorted(gene_dict.items()),
                                           unit='genome'):
            cur_marker_dir = os.path.join(outdir, DIR_MARKER_GENE)
            pfam_tophit_file = TopHitPfamFile(cur_marker_dir, db_genome_id)
            tigr_tophit_file = TopHitTigrFile(cur_marker_dir, db_genome_id)
            pfam_tophit_file.read()
            tigr_tophit_file.read()

            # Summarise each of the markers for this genome.
            ar53_copy_number_file.add_genome(db_genome_id,
                                             info.get("aa_gene_path"),
                                             pfam_tophit_file,
                                             tigr_tophit_file)
            bac120_copy_number_file.add_genome(db_genome_id,
                                               info.get("aa_gene_path"),
                                               pfam_tophit_file,
                                               tigr_tophit_file)

            # Write the best translation table to disk for this genome.
            tln_summary_file.add_genome(db_genome_id,
                                        info.get("best_translation_table"))

        # Write each of the summary files to disk.
        ar53_copy_number_file.write()
        bac120_copy_number_file.write()
        tln_summary_file.write()

        # Create a symlink to store the summary files in the root.
        # symlink_f(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_BAC120_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_AR53_MARKER_SUMMARY.format(prefix=prefix))))
        # symlink_f(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix),
        #           os.path.join(outdir, os.path.basename(PATH_TLN_TABLE_SUMMARY.format(prefix=prefix))))
        symlink_f(
            PATH_FAILS.format(prefix=prefix),
            os.path.join(outdir,
                         os.path.basename(PATH_FAILS.format(prefix=prefix))))

        # Write the single copy AR53/BAC120 FASTA files to disk.
        if write_single_copy_genes:
            fasta_dir = os.path.join(outdir, DIR_IDENTIFY_FASTA)
            self.logger.info(
                f'Writing unaligned single-copy genes to: {fasta_dir}')

            # One entry per domain: (marker names, copy number file, subdir).
            marker_doms = (
                (Config.AR53_MARKERS['PFAM'] + Config.AR53_MARKERS['TIGRFAM'],
                 ar53_copy_number_file, 'ar53'),
                (Config.BAC120_MARKERS['PFAM'] +
                 Config.BAC120_MARKERS['TIGRFAM'],
                 bac120_copy_number_file, 'bac120'),
            )
            for marker_names, marker_file, marker_d in marker_doms:

                # Create the domain-specific subdirectory.
                fasta_d_dir = os.path.join(fasta_dir, marker_d)
                make_sure_path_exists(fasta_d_dir)

                # Iterate over each marker.
                for marker_name in marker_names:
                    # Drop a trailing ".hmm"/".HMM" extension only.
                    # str.rstrip() takes a set of characters, not a pattern,
                    # so it would also strip trailing '.', 'h', 'm', ...
                    # characters that belong to the marker name itself.
                    name_root, name_ext = os.path.splitext(marker_name)
                    if name_ext.lower() == '.hmm':
                        marker_name = name_root
                    marker_path = os.path.join(fasta_d_dir,
                                               f'{marker_name}.fa')

                    # Collect header/sequence pairs for the genomes that
                    # have a single-copy hit to this marker.
                    to_write = list()
                    for genome_id in sorted(gene_dict):
                        unq_hits = marker_file.get_single_copy_hits(genome_id)
                        if marker_name in unq_hits:
                            to_write.append(f'>{genome_id}')
                            to_write.append(unq_hits[marker_name]['seq'])

                    # Markers without any hit do not get a file.
                    if to_write:
                        with open(marker_path, 'w') as fh:
                            fh.write('\n'.join(to_write))
 def test_add_genome_raises_exception(self):
     """Adding a genome id that was already added raises GTDBTkExit."""
     summary = TlnTableSummaryFile(self.dir_tmp, 'tst')
     summary.add_genome('a', 4)
     # Second insert of the same id is expected to be rejected.
     with self.assertRaises(GTDBTkExit):
         summary.add_genome('a', 11)
 def test_add_genome(self):
     """Genomes passed to add_genome() show up in the ``genomes`` dict."""
     expected = {'a': 4, 'b': 11}
     summary = TlnTableSummaryFile(self.dir_tmp, 'tst')
     for genome_id, tln_table in expected.items():
         summary.add_genome(genome_id, tln_table)
     self.assertDictEqual(expected, summary.genomes)