def call_genes(self, options):
    """Call genes command.

    Runs Prodigal over every genome file found in ``options.genome_dir``
    (selected by ``options.genome_ext``) and writes a tab-separated summary,
    ``call_genes.summary.tsv``, into ``options.output_dir``.

    The summary has one row per genome with the selected translation table
    and the coding densities reported for tables 4 and 11.

    Attributes read from ``options``: ``output_dir``, ``genome_dir``,
    ``genome_ext``, ``cpus`` and ``force_table``.

    If no genome files are found a warning is logged and the process is
    terminated through ``sys.exit()``.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
    self.logger.info('*******************************************************************************')

    make_sure_path_exists(options.output_dir)

    genome_files = self._genome_files(options.genome_dir, options.genome_ext)
    if not genome_files:
        self.logger.warning(' [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.')
        sys.exit()

    prodigal = Prodigal(options.cpus)
    # NOTE(review): the two positional False flags are passed through to
    # Prodigal.run unchanged; their meaning is defined by that method's
    # signature, which is not visible here.
    summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

    # write gene calling summary; the context manager guarantees the handle
    # is closed even if formatting one of the rows raises
    summary_file = os.path.join(options.output_dir, 'call_genes.summary.tsv')
    with open(summary_file, 'w') as fout:
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        # items() works on both Python 2 and Python 3 mappings
        for genome_id, stats in summary_stats.items():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))

    self.logger.info('')
    self.logger.info(' Identified genes written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def call_genes(self, options):
    """Call genes command.

    Runs Prodigal over the genome files resolved from
    ``options.input_genomes`` / ``options.file_ext`` and writes a
    tab-separated summary, ``call_genes.summary.tsv``, into
    ``options.output_dir``.

    The summary has one row per genome with the selected translation table
    and the coding densities reported for tables 4 and 11.

    Attributes read from ``options``: ``output_dir``, ``input_genomes``,
    ``file_ext``, ``cpus``, ``silent`` and ``force_table``.
    """
    make_sure_path_exists(options.output_dir)

    genome_files = self._input_files(options.input_genomes, options.file_ext)

    prodigal = Prodigal(options.cpus, not options.silent)
    summary_stats = prodigal.run(genome_files,
                                 options.output_dir,
                                 called_genes=False,
                                 translation_table=options.force_table,
                                 meta=False,
                                 closed_ends=True)

    # write gene calling summary; the context manager guarantees the handle
    # is closed even if formatting one of the rows raises
    summary_file = os.path.join(options.output_dir, 'call_genes.summary.tsv')
    with open(summary_file, 'w') as fout:
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        for genome_id, stats in summary_stats.items():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))

    self.logger.info('Identified genes written to: %s' % options.output_dir)
def call_genes(self, options):
    """Call genes command.

    Runs Prodigal on the genomes in ``options.genome_nt_dir`` and, when
    ``options.unbinned_file`` is set, on that file as well (with
    ``meta=True``), writing the latter into an ``unbinned`` sub-directory of
    ``options.output_dir``.

    The process is terminated through ``sys.exit()`` after a warning when
    the nucleotide-sequence check on the genome files fails.
    """
    rule = '*******************************************************************************'
    banner = ('',
              rule,
              ' [RefineM - call_genes] Identifying genes within genomes.',
              rule)
    for banner_line in banner:
        self.logger.info(banner_line)

    # validate input location before creating anything on disk
    check_dir_exists(options.genome_nt_dir)
    make_sure_path_exists(options.output_dir)

    genomes = self._genome_files(options.genome_nt_dir, options.genome_ext)
    all_nucleotide = self._check_nuclotide_seqs(genomes)
    if not all_nucleotide:
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    gene_caller = Prodigal(options.cpus)

    # genes for the binned genomes
    gene_caller.run(genomes, options.output_dir)
    self.logger.info(' Genes in genomes written to: %s' % options.output_dir)

    # genes for the unbinned scaffolds, if any were supplied
    if options.unbinned_file:
        unbinned_dir = os.path.join(options.output_dir, 'unbinned')
        gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
        self.logger.info(' Genes in unbinned scaffolds written to: %s' % unbinned_dir)

    self.time_keeper.print_time_stamp()
def run(self, input_dir, tmp_dir, threads):
    """Call genes with Prodigal for every genome found under input_dir.

    The directory layout walked is ``input_dir/<genome_dir>/<assembly_id>/``;
    within each assembly directory the file ``<assembly_id>_genomic.fna`` is
    used as the genome. Empty genome files are reported and skipped.

    Prodigal writes its results to ``tmp_dir``. Afterwards, for each genome,
    the amino acid, nucleotide and GFF outputs are moved into a ``prodigal``
    directory beside the genome file, a ``prodigal_translation_table.tsv``
    file is written, and the SHA-256 of the protein file is stored in
    ``<protein file>.sha256``.

    Parameters
    ----------
    input_dir : str
        Root directory containing one sub-directory per genome.
    tmp_dir : str
        Directory handed to Prodigal as its output directory.
    threads : int
        Passed to Prodigal as ``cpus``.
    """
    # Local import so the block does not depend on the file's import header.
    import shutil

    # get path to all unprocessed genome files
    print('Reading genomes.')
    genome_files = []
    for genome_dir in os.listdir(input_dir):
        cur_genome_dir = os.path.join(input_dir, genome_dir)
        if not os.path.isdir(cur_genome_dir):
            continue

        for assembly_id in os.listdir(cur_genome_dir):
            assembly_dir = os.path.join(cur_genome_dir, assembly_id)
            # NOTE(review): assumes a second '_' exists at index >= 4
            # (e.g. 'GCF_xxx_yyy'); find() returning -1 would instead drop
            # the final character -- confirm against the directory naming.
            genome_id = assembly_id[0:assembly_id.find('_', 4)]

            # check if prodigal has already been called
            if False:
                # Intentionally disabled: for safety genes are currently
                # recalled for all genomes, although skipping genomes with
                # a matching checksum would be far more efficient.
                aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        with open(checksum_file) as fin:
                            cur_checksum = fin.readline().strip()
                        if checksum == cur_checksum:
                            continue

            genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    print('[Warning] Genome file appears to be empty: %s' % genome_file)
                else:
                    genome_files.append(genome_file)

    print(' Number of unprocessed genomes: %d' % len(genome_files))

    # run prodigal on each genome
    print('Running prodigal.')
    prodigal = Prodigal(cpus=threads)
    summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

    # move results into individual genome directories
    print('Moving files and calculating checksums.')
    for genome_file in genome_files:
        genome_path, genome_id = ntpath.split(genome_file)
        genome_id = remove_extension(genome_id)
        genome_root = genome_id[0:genome_id.find('_', 4)]

        prodigal_path = os.path.join(genome_path, 'prodigal')
        if not os.path.exists(prodigal_path):
            os.makedirs(prodigal_path)

        new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
        new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
        new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

        # shutil.move instead of shelling out to 'mv': paths containing
        # spaces or shell metacharacters are handled correctly and a failed
        # move raises instead of being silently ignored.
        relocations = (
            (os.path.join(tmp_dir, genome_id + '_genes.faa'), new_aa_gene_file),
            (os.path.join(tmp_dir, genome_id + '_genes.fna'), new_nt_gene_file),
            (os.path.join(tmp_dir, genome_id + '.gff'), new_gff_file),
        )
        for src_file, dst_file in relocations:
            shutil.move(src_file, dst_file)

        # save translation table information
        stats = summary_stats[genome_id]
        translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
        with open(translation_table_file, 'w') as fout:
            fout.write('%s\t%d\n' % ('best_translation_table', stats.best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4', stats.coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11', stats.coding_density_11 * 100))

        # store the checksum beside the protein file
        checksum = sha256(new_aa_gene_file)
        with open(new_aa_gene_file + '.sha256', 'w') as fout:
            fout.write(checksum)