# NOTE: these excerpts come from several chimerascan modules and assume the
# usual module-level imports (logging, os, sys, shutil, collections, pysam)
# plus the package's config module and library helpers.

def main():
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # usage string corrected: this tool reads one BAM and writes the insert
    # size distribution via '-o' (or stdout), not a BEDPE file
    parser = OptionParser("usage: %prog [options] <in.bam>")
    parser.add_option('--min-fragment-length', dest="min_fragment_length",
                      type="int", default=0)
    parser.add_option('--max-fragment-length', dest="max_fragment_length",
                      type="int", default=1000)
    parser.add_option('--max-samples', dest="max_samples",
                      type="int", default=None)
    parser.add_option('-o', dest="output_file", default=None)
    options, args = parser.parse_args()
    input_bam_file = args[0]
    bamfh = pysam.Samfile(input_bam_file, "rb")
    isizedist = InsertSizeDistribution.from_bam(bamfh,
                                                options.min_fragment_length,
                                                options.max_fragment_length,
                                                options.max_samples)
    bamfh.close()
    if options.output_file is not None:
        f = open(options.output_file, "w")
    else:
        f = sys.stdout
    isizedist.to_file(f)
    if options.output_file is not None:
        f.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isizedist.n, isizedist.mean(), isizedist.std(),
                  isizedist.percentile(50.0), isizedist.mode()))
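# A hedged usage sketch of the InsertSizeDistribution API as exercised by
# main() above (from_bam / to_file / summary statistics). The file name and
# cutoffs below are illustrative only, not part of the original source.

def _profile_insert_sizes_example():
    import sys
    bamfh = pysam.Samfile("example.bam", "rb")  # hypothetical input BAM
    isizedist = InsertSizeDistribution.from_bam(bamfh, 0, 1000, None)
    bamfh.close()
    # dump the distribution, then the same summary statistics main() logs
    isizedist.to_file(sys.stdout)
    print isizedist.n, isizedist.mean(), isizedist.std()
    print isizedist.percentile(50.0), isizedist.mode()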
def main():
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.txt> <out.txt> <isizedist.txt>")
    parser.add_option("--min-isize-prob", dest="min_isize_prob",
                      type="float", default=0.01)
    options, args = parser.parse_args()
    input_file = args[0]
    output_file = args[1]
    isize_dist_file = args[2]
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    resolve_discordant_reads(input_file, output_file, isize_dist,
                             options.min_isize_prob, tmp_dir=".")
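# main() passes --min-isize-prob through to resolve_discordant_reads(), whose
# internals are not shown here. A hedged sketch of the kind of gate such a
# threshold implies, built only from API seen elsewhere in these excerpts
# (isize_at_percentile); this helper is hypothetical, not chimerascan's.

def _within_isize_bounds(isize_dist, insert_size, min_isize_prob):
    # accept an implied insert size only if it falls between the
    # min_isize_prob and (1 - min_isize_prob) percentiles of the
    # empirical distribution
    lo = isize_dist.isize_at_percentile(100.0 * min_isize_prob)
    hi = isize_dist.isize_at_percentile(100.0 * (1.0 - min_isize_prob))
    return lo <= insert_size <= hi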
def main():
    import sys
    calc_chimera_pvalues(sys.argv[1], sys.argv[2], int(sys.argv[3]),
                         int(sys.argv[4]))
    return
    # NOTE: everything below the bare 'return' above is unreachable dead
    # code, apparently left behind from an earlier command-line interface
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.txt> <bowtie.log> <isizedist.txt>")
    parser.add_option("--min-isize-prob", dest="min_isize_prob",
                      type="float", default=0.01)
    options, args = parser.parse_args()
    input_file = args[0]
    bowtie_log_file = args[1]
    isize_dist_file = args[2]
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    calc_percent_discordant_reads(input_file, bowtie_log_file, isize_dist,
                                  min_isize_prob=options.min_isize_prob,
                                  tmp_dir=".")
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file, trim_bp,
                                    max_read_length, homology_mismatches):
    """
    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                  rname_prefix=config.GENE_REF_PREFIX)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # iterate through read pairs
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for r5p, r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in lightweight structure called
        # DiscordantRead object. this departs from SAM format into a
        # custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # given the insert size find the highest probability
        # exon junction breakpoint between the two transcripts
        isize_prob, breakpoints = \
            choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                    trim_bp, isize_dist)
        # extract the sequence of the breakpoint along with the
        # number of homologous bases at the breakpoint between
        # chimera and wildtype genes
        for breakpoint in breakpoints:
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(
                    config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                    config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                    ref_fa, max_read_length, homology_mismatches)
            # write breakpoint information for each read to a file
            fields = [tx5p.tx_name, 0, tx_end_5p,
                      tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                      r5p.rname,  # name
                      isize_prob,  # score
                      tx5p.strand, tx3p.strand,  # strand 1, strand 2
                      # user defined fields
                      exon_num_5p, exon_num_3p,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right]
            fields.append('|'.join(map(str, dr5p.to_list())))
            fields.append('|'.join(map(str, dr3p.to_list())))
            print >>outfh, '\t'.join(map(str, fields))
    # cleanup
    ref_fa.close()
    outfh.close()
    bamfh.close()
    return config.JOB_SUCCESS
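# The docstring above describes tolerating a fixed number of mismatches while
# measuring homology between the chimeric breakpoint sequence and the
# wildtype sequence. As an illustration only (the real logic lives inside
# extract_breakpoint_sequence, which is not shown here), a homology count
# under that definition might walk outward from the junction until the
# mismatch budget is exhausted:

def _homology_length(chimera_seq, wildtype_seq, max_mismatches):
    """Hypothetical sketch: number of bases from the start of 'chimera_seq'
    that match 'wildtype_seq', tolerating up to 'max_mismatches'
    mismatches along the way."""
    mismatches = 0
    length = 0
    for c, w in zip(chimera_seq, wildtype_seq):
        if c != w:
            mismatches += 1
            if mismatches > max_mismatches:
                break
        length += 1
    return length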
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >>fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update([line.strip() for line in
                              open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" %
                     (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update([line.strip() for line in
                            open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" %
                     (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
    converted_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq) for cfq, fq in
               zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and
        all(up_to_date(a, b) for a, b in
            zip(transcriptome_unaligned_fastq_files,
                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = \
            os.path.splitext(sorted_transcriptome_bam_file)[0]
        # samtools-style positional sort arguments (legacy pysam API)
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = \
        sorted_transcriptome_bam_file + ".bai"
    if up_to_date(sorted_transcriptome_bam_index_file,
                  sorted_transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(
            open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh, transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically. Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean, runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug("\tAfter adjusting for min %d and read length %d, "
                  "final segment length is %d" %
                  (config.MIN_SEGMENT_LENGTH, trimmed_read_length,
                   segment_length))
    if runconfig.segment_length is not None:
        logging.debug("\tOverriding auto segment length and using segment "
                      "length of %d" % (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir,
                                         config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq)
            for fq in converted_fastq_files) and
        all(up_to_date(a, b) for a, b in
            zip(genome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files) and
        up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(
            index=transcriptome_index,
            transcript_file=transcript_file,
            fastq_files=genome_unaligned_fastq_files,
            bam_file=realigned_bam_file,
            log_file=realigned_log_file,
            tmp_dir=tmp_dir,
            segment_length=segment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file,
                    unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if all(up_to_date(f, realigned_bam_file) for f in output_files):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if up_to_date(discordant_genome_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if up_to_date(sorted_discordant_genome_bam_file,
                  discordant_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file,
                   bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = \
        sorted_discordant_genome_bam_file + ".bai"
    if up_to_date(sorted_discordant_bam_index_file,
                  sorted_discordant_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if up_to_date(unpaired_genome_bam_file, unpaired_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=unpaired_bam_file,
            output_file=unpaired_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if up_to_date(sorted_unpaired_genome_bam_file,
                  unpaired_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file,
                   bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = \
        sorted_unpaired_genome_bam_file + ".bai"
    if up_to_date(sorted_unpaired_bam_index_file,
                  sorted_unpaired_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file,
                   cluster_shelve_file, cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file, cluster_shelve_file,
                   cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort spanning reads by position
    # (comment corrected: this step sorts the spanning BAM, not the
    # unpaired BAM)
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE)
    if up_to_date(sorted_spanning_bam_file, spanning_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if up_to_date(sorted_spanning_bam_index_file,
                  sorted_spanning_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file,
                   spanning_cluster_pair_file) and
        up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(
            transcripts,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=spanning_cluster_pair_file,
            read_name_file=read_name_file,
            output_file=unfiltered_chimera_bedpe_file,
            annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
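# run_chimerascan() skips a step whenever its outputs are newer than its
# inputs, via the repeated up_to_date() checks above. up_to_date() is
# defined elsewhere in the package; a minimal sketch of the assumed
# mtime-based semantics (hypothetical, for illustration) looks like this:

def _up_to_date(output_file, input_file):
    """Hypothetical sketch: True if 'output_file' exists, is non-empty,
    and is at least as new as 'input_file', so the step that produces
    it can be skipped."""
    if not os.path.exists(output_file):
        return False
    if os.path.getsize(output_file) == 0:
        return False
    return os.path.getmtime(output_file) >= os.path.getmtime(input_file)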
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tx_name_gene_map = build_tx_name_gene_map(gene_file, rname_prefix=None)
    #genome_tx_trees = build_genome_tx_trees(gene_file)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive. TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_name_5p, tx_name_3p, frags in \
            parse_discordant_bedpe_by_transcript_pair(open(input_file)):
        # get gene information
        tx5p = tx_name_gene_map[tx_name_5p]
        tx3p = tx_name_gene_map[tx_name_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(
                    config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                    config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                    ref_fa, max_read_length, homology_mismatches)
            tx3p_length = sum((end - start) for start, end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            gene_name_5p = '_'.join(tx5p.gene_name.split())
            gene_name_3p = '_'.join(tx3p.gene_name.split())
            fields = [tx5p.tx_name, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_name, tx_start_3p, tx3p_length,  # chrom2, start2, end2
                      "C%07d" % (chimera_num),  # name
                      1.0,  # pvalue
                      tx5p.strand, tx3p.strand,  # strand1, strand2
                      gene_name_5p, gene_name_3p,  # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right,
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads
                      None]
            print >>outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive. TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_id_5p, tx_id_3p, frags in \
            parse_discordant_bedpe_by_transcript_pair(open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            tx3p_length = sum((end - start) for start, end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            gene_names_5p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx5p.gene_names])))
            gene_names_3p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx3p.gene_names])))
            fields = [tx5p.tx_id, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_id, tx_start_3p, tx3p_length,  # chrom2, start2, end2
                      "C%07d" % (chimera_num),  # name
                      1.0,  # pvalue
                      tx5p.strand, tx3p.strand,  # strand1, strand2
                      gene_names_5p, gene_names_3p,  # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name,
                      breakpoint_seq_5p, breakpoint_seq_3p,
                      homology_left, homology_right,
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads
                      None]
            print >>outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
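# The rows written by nominate_chimeras() follow an extended BEDPE layout
# mirroring the 'fields' list above. A hedged sketch of reading one row
# back; the dictionary keys are illustrative names, not part of the
# original source:

def _parse_chimera_bedpe_line(line):
    fields = line.rstrip('\n').split('\t')
    return {'tx_id_5p': fields[0],
            'start_5p': int(fields[1]),
            'end_5p': int(fields[2]),
            'tx_id_3p': fields[3],
            'start_3p': int(fields[4]),
            'end_3p': int(fields[5]),
            'name': fields[6],
            'pvalue': float(fields[7]),
            'strand_5p': fields[8],
            'strand_3p': fields[9],
            'gene_names_5p': fields[10],
            'gene_names_3p': fields[11],
            'exons_5p': fields[12],
            'exons_3p': fields[13],
            'breakpoint_name': fields[14],
            'breakpoint_seq_5p': fields[15],
            'breakpoint_seq_3p': fields[16],
            'homology_left': int(fields[17]),
            'homology_right': int(fields[18]),
            'encompassing_frags': fields[19],
            # written as the string "None" until spanning reads are added
            'spanning_reads': fields[20]}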