def output_rpkm(sample, output_dir, settings_info, rna_base, logger):
    """
    Output RPKM tables for the sample.

    One ``<table_name>.rpkm`` file is produced in ``output_dir`` for every
    entry of ``rna_base.tables_to_const_exons``. A table whose output file
    already exists is skipped.

    Takes as input:

    - sample: a sample object (``label``, ``ribosub_bam_filename`` and
      ``qc.qc_results`` are read)
    - output_dir: output directory
    - settings_info: settings information (the ``"readlen"`` entry is read)
    - rna_base: an RNABase object
    - logger: logger receiving progress and error messages

    Returns the RPKM filename of the last table visited, or None when the
    RNA base has no constitutive exon tables.

    Exits the process with status 1 if the sample has 0 mapped reads.
    """
    # Output RPKM information for all constitutive exon tables in the
    # RNA Base.
    print("Outputting RPKM for: %s" % (sample.label))
    # Bound up front so that an RNA base without any tables yields None
    # instead of failing at the return statement.
    rpkm_output_filename = None
    for table_name, const_exons in rna_base.tables_to_const_exons.items():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info(" - Skipping RPKM output, found %s",
                        rpkm_output_filename)
            print(" - Skipping RPKM output, %s exists"
                  % (rpkm_output_filename))
            continue
        # Directory where the BAM containing the mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir, "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons.
        # Use the rRNA subtracted BAM file.
        print("Using constitutive exons GFF -> %s"
              % (const_exons.gff_filename))
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample.
        # NOTE(review): reads come from the rRNA-subtracted BAM while the
        # normaliser below is the total mapped count -- confirm this
        # pairing is intended.
        num_mapped = int(sample.qc.qc_results["num_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 "
                            "mapped reads.", sample.label)
            print("Error: Cannot compute RPKMs since sample %s has 0 "
                  "mapped reads." % (sample.label))
            sys.exit(1)
        print("Sample %s has %s mapped reads" % (sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)",
                    table_name)
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
        logger.info("Finished outputting RPKM for %s to %s",
                    sample.label, rpkm_output_filename)
    return rpkm_output_filename
def output_rpkm(sample, output_dir, settings_info, rna_base, logger):
    """
    Output RPKM tables for the sample.

    One ``<table_name>.rpkm`` file is produced in ``output_dir`` for every
    entry of ``rna_base.tables_to_const_exons``. A table whose output file
    already exists is skipped.

    Takes as input:

    - sample: a sample object (``label``, ``ribosub_bam_filename`` and
      ``qc.qc_results`` are read)
    - output_dir: output directory
    - settings_info: settings information (the ``"readlen"`` entry is read)
    - rna_base: an RNABase object
    - logger: logger receiving progress and error messages

    Returns the RPKM filename of the last table visited, or None when the
    RNA base has no constitutive exon tables.

    Exits the process with status 1 if the sample has 0 ribosub mapped
    reads.
    """
    # Output RPKM information for all constitutive exon tables in the
    # RNA Base.
    print("Outputting RPKM for: %s" % (sample.label))
    # Bound up front so that an RNA base without any tables yields None
    # instead of failing at the return statement.
    rpkm_output_filename = None
    for table_name, const_exons in rna_base.tables_to_const_exons.items():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info(" - Skipping RPKM output, found %s",
                        rpkm_output_filename)
            continue
        # Directory where the BAM containing the mapping to constitutive
        # exons will be stored
        bam2gff_outdir = os.path.join(output_dir, "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons.
        # Use the rRNA subtracted BAM file.
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample: use number of ribosub mapped reads
        num_mapped = int(sample.qc.qc_results["num_ribosub_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 "
                            "mapped reads.", sample.label)
            sys.exit(1)
        logger.info("Sample %s has %s mapped reads",
                    sample.label, num_mapped)
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)",
                    table_name)
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
        logger.info("Finished outputting RPKM for %s to %s",
                    sample.label, rpkm_output_filename)
    return rpkm_output_filename
def compute_insert_len(bams_to_process,
                       const_exons_gff_filename,
                       output_dir,
                       min_exon_size,
                       no_bam_filter=False,
                       sd_max=2):
    """
    Compute insert length distribution and output it to the given
    directory.

    For each BAM file, the reads are mapped to the constitutive exons GFF,
    paired, and the resulting distribution is summarized into
    ``<output_dir>/<BAM basename>.insert_len``.

    Arguments:

    - bams_to_process: a list of BAM files to process
    - const_exons_gff_filename: GFF with constitutive exons
    - output_dir: output directory (created if it does not exist)
    - min_exon_size: minimum exon size, passed on as ``min_size`` to
      ``exon_utils.get_const_exons_by_gene``
    - no_bam_filter: if True, reads are paired with ``filter_reads=False``
    - sd_max: passed through to ``summarize_insert_len_dist``

    Exits the process with status 1 if a BAM file cannot be found and
    raises Exception if mapping the BAM to the GFF returns None. A BAM
    without any paired mates is skipped with a warning.
    """
    bams_str = "\n ".join(bams_to_process)
    num_bams = len(bams_to_process)
    print("Computing insert length distribution of %d files:\n %s"
          % (num_bams, bams_str))
    print(" - Using const. exons from: %s" % (const_exons_gff_filename))
    print(" - Outputting to: %s" % (output_dir))
    print(" - Minimum exon size used: %d" % (min_exon_size))
    if not os.path.isdir(output_dir):
        print("Making directory: %s" % (output_dir))
        os.makedirs(output_dir)
    # The return value is not used below; the call is kept as-is.
    # NOTE(review): presumably needed for what it writes into
    # output_dir -- confirm before removing.
    exon_utils.get_const_exons_by_gene(const_exons_gff_filename,
                                       output_dir,
                                       # Treat all exons as constitutive
                                       all_constitutive=True,
                                       min_size=min_exon_size)
    filter_reads = not no_bam_filter
    if filter_reads:
        print("Filtering BAM reads")
    else:
        print("Turning off filtering of BAM reads")
    for bam_filename in bams_to_process:
        t1 = time.time()
        output_filename = os.path.join(output_dir,
                                       "%s.insert_len"
                                       % (os.path.basename(bam_filename)))
        if not os.path.isfile(bam_filename):
            print("Cannot find BAM file %s" % (bam_filename))
            print("Quitting...")
            sys.exit(1)
        print("Fetching reads in constitutive exons")
        mapped_bam_filename = exon_utils.map_bam2gff(bam_filename,
                                                     const_exons_gff_filename,
                                                     output_dir)
        if mapped_bam_filename is None:
            raise Exception("Error: Insert length computation failed.")
        # Load mapped BAM filename
        mapped_bam = pysam.Samfile(mapped_bam_filename, "rb")
        # The handle stays open until this BAM is fully summarized and is
        # then always closed, including on the skip path and on errors.
        try:
            ###
            ### TODO: Rewrite this so that you only pair reads within an
            ### interval
            ###
            paired_reads = sam_utils.pair_sam_reads(mapped_bam,
                                                    filter_reads=filter_reads)
            num_paired_reads = len(paired_reads)
            if num_paired_reads == 0:
                print("WARNING: no paired mates in %s. Skipping...\n"
                      "Are you sure the read IDs match? "
                      "If your BAM paired flags are "
                      "unset, try using --no-bam-filter."
                      % (bam_filename))
                continue
            print("Using %d paired mates" % (num_paired_reads))
            interval_to_paired_dists = \
                compute_inserts_from_paired_mates(paired_reads)
            summarize_insert_len_dist(interval_to_paired_dists,
                                      output_filename,
                                      sd_max=sd_max)
            t2 = time.time()
            print("Insert length computation took %.2f seconds."
                  % (t2 - t1))
        finally:
            mapped_bam.close()
def compute_insert_len(bams_to_process,
                       const_exons_gff_filename,
                       output_dir,
                       min_exon_size,
                       no_bam_filter=False,
                       sd_max=2):
    """
    Compute insert length distribution and output it to the given
    directory.

    For each BAM file, the reads are mapped to the constitutive exons GFF,
    paired, and the resulting distribution is summarized into
    ``<output_dir>/<BAM basename>.insert_len``.

    Arguments:

    - bams_to_process: a list of BAM files to process
    - const_exons_gff_filename: GFF with constitutive exons
    - output_dir: output directory (created if it does not exist)
    - min_exon_size: minimum exon size, passed on as ``min_size`` to
      ``exon_utils.get_const_exons_by_gene``
    - no_bam_filter: if True, reads are paired with ``filter_reads=False``
    - sd_max: passed through to ``summarize_insert_len_dist``

    Exits the process with status 1 if a BAM file cannot be found and
    raises Exception if mapping the BAM to the GFF returns None. A BAM
    without any paired mates is skipped with a warning.
    """
    bams_str = "\n ".join(bams_to_process)
    num_bams = len(bams_to_process)
    print("Computing insert length distribution of %d files:\n %s"
          % (num_bams, bams_str))
    print(" - Using const. exons from: %s" % (const_exons_gff_filename))
    print(" - Outputting to: %s" % (output_dir))
    print(" - Minimum exon size used: %d" % (min_exon_size))
    if not os.path.isdir(output_dir):
        print("Making directory: %s" % (output_dir))
        os.makedirs(output_dir)
    # The return value is not used below; the call is kept as-is.
    # NOTE(review): presumably needed for what it writes into
    # output_dir -- confirm before removing.
    exon_utils.get_const_exons_by_gene(const_exons_gff_filename,
                                       output_dir,
                                       # Treat all exons as constitutive
                                       all_constitutive=True,
                                       min_size=min_exon_size)
    filter_reads = not no_bam_filter
    if filter_reads:
        print("Filtering BAM reads")
    else:
        print("Turning off filtering of BAM reads")
    for bam_filename in bams_to_process:
        t1 = time.time()
        output_filename = os.path.join(output_dir,
                                       "%s.insert_len"
                                       % (os.path.basename(bam_filename)))
        if not os.path.isfile(bam_filename):
            print("Cannot find BAM file %s" % (bam_filename))
            print("Quitting...")
            sys.exit(1)
        print("Fetching reads in constitutive exons")
        mapped_bam_filename = exon_utils.map_bam2gff(bam_filename,
                                                     const_exons_gff_filename,
                                                     output_dir)
        if mapped_bam_filename is None:
            raise Exception("Error: Insert length computation failed.")
        # Load mapped BAM filename
        mapped_bam = pysam.Samfile(mapped_bam_filename, "rb")
        # The handle stays open until this BAM is fully summarized and is
        # then always closed, including on the skip path and on errors.
        try:
            ###
            ### TODO: Rewrite this so that you only pair reads within an
            ### interval
            ###
            paired_reads = sam_utils.pair_sam_reads(mapped_bam,
                                                    filter_reads=filter_reads)
            num_paired_reads = len(paired_reads)
            if num_paired_reads == 0:
                print("WARNING: no paired mates in %s. Skipping...\n"
                      "Are you sure the read IDs match? "
                      "If your BAM paired flags are "
                      "unset, try using --no-bam-filter."
                      % (bam_filename))
                continue
            print("Using %d paired mates" % (num_paired_reads))
            interval_to_paired_dists = \
                compute_inserts_from_paired_mates(paired_reads)
            summarize_insert_len_dist(interval_to_paired_dists,
                                      output_filename,
                                      sd_max=sd_max)
            t2 = time.time()
            print("Insert length computation took %.2f seconds."
                  % (t2 - t1))
        finally:
            mapped_bam.close()