def compatible(self, options):
    """Compatible command.

    Checks that the reference and scaffold statistics files exist, makes
    sure the output directory exists, reads the scaffold statistics, derives
    genome statistics from them, runs the reference homology check and then
    calls ``Outliers.compatible`` with the path ``compatible.tsv`` inside
    ``options.output_dir``.
    """
    for required_file in (options.reference_file, options.scaffold_stats_file):
        check_file_exists(required_file)
    make_sure_path_exists(options.output_dir)

    # scaffold statistics feed both the genome statistics and the final report
    self.logger.info('Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(stats)

    # putative homologs to the reference genomes
    homologs = Reference(1, None).homology_check(
        options.reference_file,
        options.min_genes,
        float(options.perc_genes))

    # scaffolds compatible with bins
    detector = Outliers()
    results_path = os.path.join(options.output_dir, 'compatible.tsv')
    detector.compatible(
        homologs,
        stats,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        results_path)

    self.logger.info('Results written to: ' + results_path)
def cluster(self, options):
    """Cluster command.

    Logs a banner, checks that the scaffold statistics file and the genome
    file exist, makes sure the output directory exists, reads the scaffold
    statistics and passes them to ``Cluster.run`` together with the
    clustering options. Finishes by calling
    ``self.time_keeper.print_time_stamp()``.
    """
    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        ' [RefineM - cluster] Partitioning bin into clusters.',
                        rule):
        self.logger.info(banner_line)

    for required_file in (options.scaffold_stats_file, options.genome_file):
        check_file_exists(required_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)

    partitioner = Cluster(options.cpus)
    partitioner.run(
        stats,
        options.num_clusters,
        options.num_components,
        options.K,
        options.no_coverage,
        options.no_pca,
        options.iterations,
        options.genome_file,
        options.output_dir)

    self.logger.info('')
    self.logger.info(' Partitioned sequences written to: ' + options.output_dir)

    self.time_keeper.print_time_stamp()
def outliers(self, options):
    """Outlier command.

    Reads the scaffold statistics, derives genome statistics from them and
    calls ``Outliers.identify`` with the path ``outliers.tsv`` inside
    ``options.output_dir``. Unless ``options.no_plots`` is set, it then
    calls ``Outliers.plot`` with a ``plots`` sub-directory of the output
    directory.
    """
    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(stats)

    # tabular outlier report
    detector = Outliers()
    report_path = os.path.join(options.output_dir, 'outliers.tsv')
    detector.identify(
        stats,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        report_path)
    self.logger.info('Outlier information written to: ' + report_path)

    # plotting is optional; nothing else happens after it
    if options.no_plots:
        return

    plots_path = os.path.join(options.output_dir, 'plots')
    make_sure_path_exists(plots_path)
    detector.plot(
        stats,
        per_genome,
        detector.gc_dist,
        detector.td_dist,
        options,
        options.highlight_file,
        options.links_file,
        options.individual_plots,
        plots_path)
    self.logger.info('Outlier plots written to: ' + plots_path)
def scaffold_stats(self, options):
    """Scaffold statistics command.

    Validates the scaffold file and the genome files, obtains coverage
    profiles and tetranucleotide signatures (using the files supplied in
    ``options`` when present, otherwise computing them into
    ``options.output_dir``), and then runs ``ScaffoldStats.run`` with the
    output path ``scaffold_stats.tsv`` inside ``options.output_dir``.

    Exits the process with a non-zero status if the scaffold file or any
    genome file fails the ``_check_nuclotide_seqs`` check.
    """
    check_file_exists(options.scaffold_file)

    if not self._check_nuclotide_seqs([options.scaffold_file]):
        self.logger.warning('Scaffold file must contain nucleotide sequences.')
        # a bare sys.exit() would report success to the calling shell
        sys.exit(1)

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit(1)

    make_sure_path_exists(options.output_dir)

    # get coverage information
    if options.coverage_file:
        check_file_exists(options.coverage_file)
        coverage_file = options.coverage_file
    elif options.bam_files:
        coverage = Coverage(options.cpus)
        coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
        coverage.run(options.bam_files,
                     coverage_file,
                     options.cov_all_reads,
                     options.cov_min_align,
                     options.cov_max_edit_dist)
        self.logger.info('Coverage profiles written to: %s' % coverage_file)
    else:
        # no coverage source at all: warn and continue without coverage
        self.logger.warning('One or more BAM files must be specified in order to calculate coverage profiles.')
        coverage_file = None

    # get tetranucleotide signatures
    if options.tetra_file:
        # validate the user-supplied file up front, as is done for coverage
        check_file_exists(options.tetra_file)
        tetra_file = options.tetra_file
    else:
        tetra = Tetranucleotide(options.cpus)
        tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
        signatures = tetra.run(options.scaffold_file)
        tetra.write(signatures, tetra_file)
        self.logger.info('Tetranucleotide signatures written to: %s' % tetra_file)

    # write out scaffold statistics
    stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
    stats = ScaffoldStats(options.cpus)
    stats.run(options.scaffold_file,
              genome_files,
              tetra_file,
              coverage_file,
              stats_output)

    self.logger.info('Scaffold statistic written to: %s' % stats_output)
def scaffold_stats(self, options):
    """Scaffold statistics command.

    Logs a banner, validates the scaffold file and the genome files, obtains
    coverage profiles and tetranucleotide signatures (using the files
    supplied in ``options`` when present, otherwise computing them into
    ``options.output_dir``), and then runs ``ScaffoldStats.run`` with the
    output path ``scaffold_stats.tsv`` inside ``options.output_dir``.
    Finishes by calling ``self.time_keeper.print_time_stamp()``.

    Exits the process with a non-zero status if the scaffold file or any
    genome file fails the ``_check_nuclotide_seqs`` check.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.scaffold_file)

    if not self._check_nuclotide_seqs([options.scaffold_file]):
        self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
        # a bare sys.exit() would report success to the calling shell
        sys.exit(1)

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit(1)

    make_sure_path_exists(options.output_dir)

    # get coverage information
    if options.coverage_file:
        coverage_file = options.coverage_file
    elif options.bam_files:
        coverage = Coverage(options.cpus)
        coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
        coverage.run(options.bam_files,
                     coverage_file,
                     options.cov_all_reads,
                     options.cov_min_align,
                     options.cov_max_edit_dist)
        self.logger.info('')
        self.logger.info(' Coverage profiles written to: %s' % coverage_file)
    else:
        # no coverage source at all: warn and continue without coverage
        self.logger.warning('\n [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
        coverage_file = None

    # get tetranucleotide signatures
    if options.tetra_file:
        tetra_file = options.tetra_file
    else:
        self.logger.info('')
        tetra = Tetranucleotide(options.cpus)
        tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
        signatures = tetra.run(options.scaffold_file)
        tetra.write(signatures, tetra_file)
        self.logger.info(' Tetranucleotide signatures written to: %s' % tetra_file)

    # write out scaffold statistics
    stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
    stats = ScaffoldStats(options.cpus)
    stats.run(options.scaffold_file,
              genome_files,
              tetra_file,
              coverage_file,
              stats_output)

    self.logger.info(' Scaffold statistic written to: %s' % stats_output)

    self.time_keeper.print_time_stamp()
def genome_stats(self, options):
    """Genome statistics command.

    Checks that the scaffold statistics file exists, reads it, runs
    ``GenomeStats.run`` on the result and calls ``GenomeStats.write`` with
    ``options.output_file``.
    """
    stats_path = options.scaffold_stats_file
    check_file_exists(stats_path)

    self.logger.info('Reading scaffold statistics.')
    per_scaffold = ScaffoldStats(options.cpus)
    per_scaffold.read(stats_path)

    # run() is called for its effect on the object; write() is then
    # invoked on that same object
    summary = GenomeStats()
    summary.run(per_scaffold)
    summary.write(options.output_file)

    self.logger.info('Genome statistic written to: %s' % options.output_file)
def split(self, options):
    """Split command.

    Checks that the scaffold statistics file and the genome file exist,
    makes sure the output directory exists, reads the scaffold statistics
    and calls ``Cluster.split`` with the two criteria from ``options``.
    """
    for required_file in (options.scaffold_stats_file, options.genome_file):
        check_file_exists(required_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)

    # the Cluster object is always constructed with the argument 1 here
    splitter = Cluster(1)
    splitter.split(
        stats,
        options.criteria1,
        options.criteria2,
        options.genome_file,
        options.output_dir)

    self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def dbscan(self, options):
    """dbscan command.

    Checks that the scaffold statistics file and the genome file exist,
    makes sure the output directory exists, reads the scaffold statistics
    and calls ``Cluster.dbscan`` with the clustering options.
    """
    for required_file in (options.scaffold_stats_file, options.genome_file):
        check_file_exists(required_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)

    partitioner = Cluster(options.cpus)
    partitioner.dbscan(
        stats,
        options.num_clusters,
        options.num_components,
        options.min_pts,
        options.dist_frac,
        options.no_coverage,
        options.no_pca,
        options.genome_file,
        options.output_dir)

    self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def genome_stats(self, options):
    """Genome statistics command.

    Logs a banner, checks that the scaffold statistics file exists, reads
    it, runs ``GenomeStats.run`` on the result and calls
    ``GenomeStats.write`` with ``options.output_file``. Finishes by calling
    ``self.time_keeper.print_time_stamp()``.
    """
    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        ' [RefineM - genome_stats] Calculating statistics for genomes.',
                        rule):
        self.logger.info(banner_line)

    stats_path = options.scaffold_stats_file
    check_file_exists(stats_path)

    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    per_scaffold = ScaffoldStats(options.cpus)
    per_scaffold.read(stats_path)

    # run() is called for its effect on the object; write() is then
    # invoked on that same object
    summary = GenomeStats()
    summary.run(per_scaffold)
    summary.write(options.output_file)

    self.logger.info(' Genome statistic written to: %s' % options.output_file)

    self.time_keeper.print_time_stamp()
def compatible(self, options):
    """Compatible command.

    Logs a banner, checks that the reference and scaffold statistics files
    exist, makes sure the output directory exists, reads the scaffold
    statistics, derives genome statistics from them, runs the reference
    homology check and then calls ``Outliers.compatible`` with the path
    ``compatible.tsv`` inside ``options.output_dir``. Finishes by calling
    ``self.time_keeper.print_time_stamp()``.
    """
    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        '[RefineM - compatible] Identify scaffolds with compatible genomic statistics.',
                        rule):
        self.logger.info(banner_line)

    for required_file in (options.reference_file, options.scaffold_stats_file):
        check_file_exists(required_file)
    make_sure_path_exists(options.output_dir)

    # scaffold statistics feed both the genome statistics and the final report
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)
    per_genome = GenomeStats().run(stats)

    # putative homologs to the reference genomes
    homologs = Reference(1, None).homology_check(
        options.reference_file,
        options.min_genes,
        float(options.perc_genes))

    # scaffolds compatible with bins
    detector = Outliers()
    results_path = os.path.join(options.output_dir, 'compatible.tsv')
    detector.compatible(
        homologs,
        stats,
        per_genome,
        options.gc_perc,
        options.td_perc,
        options.cov_corr,
        options.cov_perc,
        options.report_type,
        results_path)

    self.logger.info('')
    self.logger.info(' Results written to: ' + results_path)

    self.time_keeper.print_time_stamp()
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
    """Summarize, per scaffold, gene hits to reference genomes of interest.

    Builds a DIAMOND database from the reference genome genes, searches the
    scaffold genes against it and against the competing reference database,
    keeps the hits selected by ``_top_hits_to_reference`` and writes one
    summary row per scaffold with at least one such hit.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold of valid hits.
    per_identity : float
        Percent identity threshold of valid hits [0,100].
    per_aln_len : float
        Percent query coverage of valid hits [0, 100].

    Returns
    -------
    str
        Path to the summary table, ``references.tsv`` in ``self.output_dir``.
    """

    # read statistics file
    self.logger.info('Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('Creating DIAMOND database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.create_db(ref_gene_file, ref_diamond_db)

    self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

    self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; the scaffold id is the gene id
    # with its final '_'-separated field removed
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold; query ids carry a '~'-separated suffix
    # that is stripped before deriving the scaffold id
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold; the context manager closes
    # the file even if formatting a row raises
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            # per-hit values are kept in plural-named lists so the
            # `evalue` threshold parameter is not shadowed
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            # list subjects as 'id:count', most frequently hit first
            sorted_subject_bin_ids = sorted(subject_bin_ids.items(),
                                            key=operator.itemgetter(1),
                                            reverse=True)
            subject_bin_id_str = ','.join(bin_id + ':' + str(num_hits)
                                          for bin_id, num_hits in sorted_subject_bin_ids)

            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(),
                                                 key=operator.itemgetter(1),
                                                 reverse=True)
            subject_scaffold_id_str = ','.join(subject_id + ':' + str(num_hits)
                                               for subject_id, num_hits in sorted_subject_scaffold_ids)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_bin_id_str,
                subject_scaffold_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out
def outliers(self, options):
    """Outlier command.

    Logs a banner, reads the scaffold statistics, derives genome statistics
    and calls ``Outliers.identify`` with the path ``outliers.tsv`` inside
    ``options.output_dir``. It then creates plots for every genome in a
    ``plots`` sub-directory and calls ``Outliers.create_html_index`` on
    them. Finishes by calling ``self.time_keeper.print_time_stamp()``.

    Optional inputs read from ``options``:

    * ``highlight_file`` - tab-separated; column 1 is a scaffold id and the
      optional column 2 is a comma-separated list of numbers, each divided
      by 255.0 (presumably an RGB colour in 0-255 - confirm with the plot
      classes). Scaffolds without a second column get ``[1.0, 0, 0]``.
    * ``links_file`` - tab-separated; columns 1 and 3 are kept as strings
      and every other column is parsed with ``ast.literal_eval``.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')
    self.logger.info('*******************************************************************************')

    check_file_exists(options.scaffold_stats_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(options.scaffold_stats_file)
    genome_stats = GenomeStats()
    genome_stats = genome_stats.run(scaffold_stats)

    # identify outliers
    outliers = Outliers()
    outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
    outliers.identify(scaffold_stats, genome_stats,
                      options.gc_perc, options.td_perc,
                      options.cov_corr, options.cov_perc,
                      options.report_type, outlier_file)
    self.logger.info(' Outlier information written to: ' + outlier_file)

    # create outlier plots
    self.logger.info('')

    highlight_scaffolds_ids = {}
    if options.highlight_file:
        # context manager so the handle is closed once parsing is done
        with open(options.highlight_file) as highlight_file:
            for line in highlight_file:
                line_split = line.strip().split('\t')
                if len(line_split) > 1:
                    highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0
                                                              for x in line_split[1].split(',')]
                else:
                    highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

    link_scaffold_ids = []
    if options.links_file:
        with open(options.links_file) as links_file:
            for line in links_file:
                # fields 0 and 2 stay as strings; the rest are parsed as
                # Python literals
                link_scaffold_ids.append([ast.literal_eval(item) if i not in (0, 2) else item
                                          for i, item in enumerate(line.strip().split('\t'))])

    # create plots
    plot_dir = os.path.join(options.output_dir, 'plots')
    make_sure_path_exists(plot_dir)

    num_genomes = len(genome_stats)
    genome_plots = defaultdict(list)
    # .items() works on both Python 2 and Python 3 dictionaries
    for genomes_processed, (genome_id, gs) in enumerate(genome_stats.items(), 1):
        # '\r' keeps the progress report on a single console line
        sys.stdout.write(' Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r'
                         % (genomes_processed, num_genomes, genomes_processed * 100.0 / num_genomes))
        sys.stdout.flush()

        genome_scaffold_stats = {}
        for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
            genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

        # NOTE: only the TD plot and the tetranucleotide PCA plot are
        # produced. The GC, coverage percentage, coverage correlation,
        # combined, distribution and GC vs. coverage plots were disabled
        # (commented out) in this version.
        if options.individual_plots:
            # TD plot
            td_plots = TdPlots(options)
            td_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids,
                          gs.mean_signature, outliers.td_dist, [options.td_perc])

            output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
            td_plots.save_plot(output_plot, dpi=options.dpi)
            td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

        # tetranucleotide signature PCA plot
        tetra = TetraPcaPlot(options)
        tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

        output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
        tetra.save_plot(output_plot, dpi=options.dpi)
        tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

        genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

    sys.stdout.write('\n')

    outliers.create_html_index(plot_dir, genome_plots)

    self.logger.info(' Outlier plots written to: ' + plot_dir)

    self.time_keeper.print_time_stamp()
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
    """Summarize, per scaffold, gene hits to reference genomes of interest.

    Builds a diamond database from the reference genome genes, searches the
    scaffold genes against it and against the competing reference database,
    keeps the hits selected by ``_top_hits_to_reference`` and writes one
    summary row per scaffold with at least one such hit.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.

    Returns
    -------
    str
        Path to the summary table, ``references.tsv`` in ``self.output_dir``.
    """

    # read statistics file
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('')
    self.logger.info(' Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    self.logger.info(' Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

    # the search result is referenced with a '.daa' suffix and converted
    # to a table with diamond.view
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

    self.logger.info(' Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold; the scaffold id is the gene id
    # with its final '_'-separated field removed
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold; query ids carry a '~'-separated suffix
    # that is stripped before deriving the scaffold id
    # (.items() works on both Python 2 and Python 3 dictionaries)
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold; the context manager closes
    # the file even if formatting a row raises
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    with open(reference_out, 'w') as fout:
        fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
        fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            # per-hit values are kept in plural-named lists so the
            # `evalue` threshold parameter is not shadowed
            aln_lens = []
            perc_idens = []
            evalues = []
            bitscores = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_lens.append(hit.aln_length)
                perc_idens.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscores.append(hit.bitscore)

                subject_id, subject_bin_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_id[0:subject_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            # list subjects as comma-separated 'id:count' pairs
            subject_scaffold_id_str = ','.join(subject_id + ':' + str(num_hits)
                                               for subject_id, num_hits in subject_scaffold_ids.items())
            subject_bin_id_str = ','.join(bin_id + ':' + str(num_hits)
                                          for bin_id, num_hits in subject_bin_ids.items())

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                scaffold_id,
                subject_scaffold_id_str,
                subject_bin_id_str,
                scaffold_stats.print_stats(scaffold_id),
                mean(scaffold_stats.coverage(scaffold_id)),
                num_genes_on_scaffold[scaffold_id],
                len(hits),
                len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                mean(aln_lens),
                mean(perc_idens),
                mean(evalues),
                mean(bitscores)))

    return reference_out