Пример #1
0
    def call_genes(self, options):
        """Call genes command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_dir, options.genome_ext)
        if not genome_files:
            self.logger.warning('  [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        prodigal = Prodigal(options.cpus)
        summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

        # write gene calling summary
        fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        for genome_id, stats in summary_stats.iteritems():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))
        fout.close()

        self.logger.info('')
        self.logger.info('  Identified genes written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Пример #2
0
    def call_genes(self, options):
        """Call genes command"""

        make_sure_path_exists(options.output_dir)
        
        genome_files = self._input_files(options.input_genomes, options.file_ext)

        prodigal = Prodigal(options.cpus, not options.silent)
        summary_stats = prodigal.run(genome_files, 
                                        options.output_dir, 
                                        called_genes=False, 
                                        translation_table=options.force_table, 
                                        meta=False,
                                        closed_ends=True)

        # write gene calling summary
        fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        for genome_id, stats in summary_stats.items():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))
        fout.close()

        self.logger.info('Identified genes written to: %s' % options.output_dir)
Пример #3
0
    def call_genes(self, options):
        """Call genes command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # call genes in genomes
        prodigal = Prodigal(options.cpus)
        prodigal.run(genome_files, options.output_dir)
        self.logger.info('  Genes in genomes written to: %s' % options.output_dir)

        # call genes in unbinned scaffolds
        if options.unbinned_file:
            unbinned_output_dir = os.path.join(options.output_dir, 'unbinned')
            prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True)
            self.logger.info('  Genes in unbinned scaffolds written to: %s' % unbinned_output_dir)

        self.time_keeper.print_time_stamp()
Пример #4
0
    def run(self, input_dir, tmp_dir, threads):
        # get path to all unprocessed genome files
        print 'Reading genomes.'
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue
              
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                if False:
                    # for safety, I am just recalling genes for all genomes right now,
                    # but this is very efficient
                    aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                    if os.path.exists(aa_gene_file):
                        # verify checksum
                        checksum_file = aa_gene_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(aa_gene_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print '[Warning] Genome file appears to be empty: %s' % genome_file
                    else:
                        genome_files.append(genome_file)

        print '  Number of unprocessed genomes: %d' % len(genome_files)

        # run prodigal on each genome
        print 'Running prodigal.'
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print 'Moving files and calculating checksums.'
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            
            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()