def create_chimerascan_index(output_dir, genome_fasta_file, gene_feature_file,
                             bowtie_build_bin):
    """Build the chimerascan alignment index inside *output_dir*.

    The index consists of a combined FASTA file (reference genome followed
    by the transcript sequences yielded by ``genepred_to_fasta``), a copy of
    the gene features, and a bowtie index built from the combined FASTA.
    Steps whose outputs ``up_to_date`` reports as current are skipped.

    Args:
        output_dir: directory receiving the index files (created if missing).
        genome_fasta_file: path to the reference genome FASTA.
        gene_feature_file: path to the gene feature file passed to
            ``genepred_to_fasta``.
        bowtie_build_bin: path to the bowtie-build executable.

    Returns:
        JOB_SUCCESS, or JOB_ERROR when bowtie-build exits with a status
        other than ``os.EX_OK``.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    index_fai_file = index_fasta_file + ".fai"

    def install_genome_fasta():
        # Start the combined FASTA over from the bare reference genome.
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # A ".fai" left over from a previous build describes the old file
        # contents; drop it so opening the FASTA builds a fresh index.
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()

    # copy reference fasta file to output dir and index it
    msg = "Adding reference genome to index"
    genome_current = up_to_date(index_fasta_file, genome_fasta_file)
    if genome_current:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        install_genome_fasta()
    # add gene sequences to index
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    msg = "Building transcriptome sequences and gene features"
    # A freshly copied genome contains no transcripts yet, so the transcript
    # step may only be skipped when the genome step was skipped as well.
    transcripts_current = (
        genome_current
        and up_to_date(index_fasta_file, gene_feature_file)
        and up_to_date(dst_gene_feature_file, gene_feature_file))
    if transcripts_current:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        if genome_current:
            # The existing FASTA already holds transcripts from an earlier
            # build; appending again would duplicate them.
            logging.info("Resetting index FASTA to the reference genome")
            install_genome_fasta()
        # write sequences from gene feature file
        logging.info("Adding transcript sequences and gene features to index")
        # The gene feature file is the inner context so it is closed before
        # the FASTA file, matching the original close order.
        with open(index_fasta_file, "a") as fasta_fh:
            with open(dst_gene_feature_file, "w") as gene_fh:
                for g, fa_record in genepred_to_fasta(gene_feature_file,
                                                      index_fasta_file):
                    gene_fh.write(str(g) + "\n")
                    fasta_fh.write(str(fa_record) + "\n")
        # remove old fasta index
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        # index the combined fasta file
        logging.info("Reindexing the FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            # do not leave a partial index behind that a rerun would trust
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return JOB_SUCCESS
def _remove_files(paths):
    """Delete every path in *paths* that currently exists."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


def run_chimerascan(runconfig):
    """Run the chimerascan pipeline described by *runconfig*.

    Each stage is skipped when ``up_to_date`` reports that its outputs are
    newer than its inputs, so an interrupted run can be resumed. When a
    stage fails, its (possibly partial) outputs are removed and the run
    stops.

    Args:
        runconfig: run configuration object; must provide ``check_config()``,
            ``to_xml()`` and the attributes read below (``output_dir``,
            ``index_dir``, ``fastq_files``, trimming, fragment-length and
            filtering options, ...).

    Returns:
        ``config.JOB_SUCCESS`` when every stage succeeded, otherwise
        ``config.JOB_ERROR``.
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    with open(runconfig_xml_file, "w") as fh:
        fh.write(str(xmlstring) + "\n")
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        with open(runconfig.mask_biotypes_file) as fh:
            mask_biotypes.update(line.strip() for line in fh)
        logging.info("\tread biotypes: %s" %
                     (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        with open(runconfig.mask_rnames_file) as fh:
            mask_rnames.update(line.strip() for line in fh)
        logging.info("\tread references: %s" %
                     (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as fh:
        transcripts = list(TranscriptFeature.parse(fh))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    # the first line of this file holds the multimapping limit
    with open(max_transcriptome_hits_file) as fh:
        max_transcriptome_hits = int(next(fh).strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
    converted_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq)
               for cfq, fq in zip(converted_fastq_files,
                                  runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = os.path.join(tmp_dir,
                                              config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
        except Exception as e:
            # Treat an exception like a failed return code: continuing
            # would run the aligner on missing or partial FASTQ files.
            logging.error("Cleaning up after error %s" % (str(e)))
            retcode = config.JOB_ERROR
        if retcode != config.JOB_SUCCESS:
            logging.error("%s step failed" % (msg))
            _remove_files(converted_fastq_files)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    skip = (all(up_to_date(transcriptome_bam_file, fq)
                for fq in converted_fastq_files) and
            all(up_to_date(a, b)
                for a, b in zip(transcriptome_unaligned_fastq_files,
                                converted_fastq_files)))
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((transcriptome_bam_file,))
            _remove_files(transcriptome_unaligned_fastq_files)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(
            sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = \
        sorted_transcriptome_bam_file + ".bai"
    if up_to_date(sorted_transcriptome_bam_index_file,
                  sorted_transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        with open(isize_dist_file, "r") as fh:
            isize_dist = InsertSizeDistribution.from_file(fh)
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        try:
            isize_dist = InsertSizeDistribution.from_genome_bam(
                bamfh, transcripts,
                min_isize=min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                max_samples=config.ISIZE_MAX_SAMPLES)
        finally:
            bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically. Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean, runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        with open(isize_dist_file, "w") as fh:
            isize_dist.to_file(fh)
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    # clamp to [config.MIN_SEGMENT_LENGTH, trimmed_read_length]; the
    # minimum wins if the two bounds conflict
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug(
        "\tAfter adjusting for min %d and read length %d, final segment length is %d"
        % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug(
            "\tOverriding auto segment length and using segment length of %d"
            % (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir,
                                         config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    skip = (all(up_to_date(genome_bam_file, fq)
                for fq in converted_fastq_files) and
            all(up_to_date(a, b)
                for a, b in zip(genome_unaligned_fastq_files,
                                converted_fastq_files)))
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((genome_bam_file,))
            _remove_files(genome_unaligned_fastq_files)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    skip = (all(up_to_date(realigned_bam_file, fq)
                for fq in genome_unaligned_fastq_files) and
            up_to_date(realigned_bam_file, isize_dist_file))
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(
            index=transcriptome_index,
            transcript_file=transcript_file,
            fastq_files=genome_unaligned_fastq_files,
            bam_file=realigned_bam_file,
            log_file=realigned_log_file,
            tmp_dir=tmp_dir,
            segment_length=segment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((realigned_bam_file,))
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file,
                    unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if all(up_to_date(f, realigned_bam_file) for f in output_files):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files(output_files)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if up_to_date(discordant_genome_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((discordant_genome_sam_file,))
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((discordant_genome_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_files((discordant_genome_sam_file,))
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if up_to_date(sorted_discordant_genome_bam_file,
                  discordant_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file,
                   bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = \
        sorted_discordant_genome_bam_file + ".bai"
    if up_to_date(sorted_discordant_bam_index_file,
                  sorted_discordant_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir,
                                            config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if up_to_date(unpaired_genome_bam_file, unpaired_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=unpaired_bam_file,
            output_file=unpaired_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((unpaired_genome_sam_file,))
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((unpaired_genome_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_files((unpaired_genome_sam_file,))
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = \
        sorted_unpaired_genome_bam_file + ".bai"
    if up_to_date(sorted_unpaired_bam_index_file,
                  sorted_unpaired_genome_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = os.path.join(
        tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = os.path.join(
        runconfig.output_dir,
        config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    # every output must be newer than every input
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files(output_files)
            # stop here: later stages need the files just removed
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = os.path.join(tmp_dir,
                                     config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files(output_files)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file,
                   cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files(output_files)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file,
                   cluster_shelve_file,
                   cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files(output_files)
            _remove_files((spanning_sam_file,))
            # do not convert a SAM file from a failed step
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((spanning_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_files((spanning_sam_file,))
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if up_to_date(sorted_spanning_bam_file, spanning_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file,
                   spanning_cluster_pair_file) and
            up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((unfiltered_chimera_bedpe_file,))
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_files((chimera_bedpe_file,))
            return config.JOB_ERROR
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
def create_chimerascan_index(output_dir, genome_fasta_file,
                             transcript_feature_file):
    """Build the chimerascan index (genome + transcriptome) in *output_dir*.

    Copies and indexes the reference genome, writes the transcript features
    and transcript sequences yielded by ``transcript_features_to_fasta``,
    records the maximum transcript overlap, and builds bowtie2 indexes for
    the transcriptome and the genome. Steps whose outputs ``up_to_date``
    reports as current are skipped.

    Args:
        output_dir: directory receiving the index files (created if missing).
        genome_fasta_file: path to the reference genome FASTA.
        transcript_feature_file: path to the transcript feature file.

    Returns:
        ``config.JOB_SUCCESS``, or ``config.JOB_ERROR`` when a bowtie2 build
        exits with a status other than ``os.EX_OK``.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    dst_genome_fasta_file = os.path.join(output_dir, config.GENOME_FASTA_FILE)
    msg = "Adding reference genome"
    if up_to_date(dst_genome_fasta_file, genome_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, dst_genome_fasta_file)
        # an index left over from the previous genome copy would describe
        # the old file; remove it so a fresh one is built below
        if os.path.exists(dst_genome_fasta_file + ".fai"):
            os.remove(dst_genome_fasta_file + ".fai")
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(dst_genome_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_transcript_feature_file = os.path.join(output_dir,
                                               config.TRANSCRIPT_FEATURE_FILE)
    transcript_fasta_file = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_FASTA_FILE)
    multimapping_file = os.path.join(output_dir, config.MAX_MULTIMAPPING_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(dst_transcript_feature_file, transcript_feature_file) and
            up_to_date(transcript_fasta_file, dst_transcript_feature_file) and
            up_to_date(multimapping_file, transcript_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences")
        chrom_transcript_dict = collections.defaultdict(list)
        # The feature file is the inner context so it is closed before the
        # FASTA file, matching the original close order (the skip check
        # above compares the two files' timestamps).
        with open(transcript_fasta_file, "w") as fasta_fh:
            with open(dst_transcript_feature_file, "w") as tx_fh:
                for t, fa_record in transcript_features_to_fasta(
                        transcript_feature_file, dst_genome_fasta_file):
                    tx_fh.write(str(t) + "\n")
                    fasta_fh.write(str(fa_record) + "\n")
                    chrom_transcript_dict[t.chrom].append(t)
        # find maximum transcript overlap as this informs alignment
        # parameters controlling multi-mapping read handling
        max_overlap = 0
        for transcripts in chrom_transcript_dict.values():
            overlap = find_maximum_feature_overlap(transcripts)
            max_overlap = max(max_overlap, overlap)
        logging.info("Maximum transcript overlap is %d" % (max_overlap))
        with open(multimapping_file, "w") as fh:
            fh.write(str(max_overlap) + "\n")
        # the transcript FASTA was rewritten, so any existing index is stale
        if os.path.exists(transcript_fasta_file + ".fai"):
            os.remove(transcript_fasta_file + ".fai")
        # index the transcript fasta file
        logging.info("Indexing the Transcriptome FASTA file")
        fh = pysam.Fastafile(transcript_fasta_file)
        fh.close()
    #
    # Build Transcriptome alignment index
    #
    # Must be a list, not a generator: it is iterated once for the skip
    # check and again for cleanup when the build fails.
    index_files = [os.path.join(output_dir, f)
                   for f in config.TRANSCRIPTOME_BOWTIE2_FILES]
    skip = all(up_to_date(f, transcript_fasta_file) for f in index_files)
    msg = "Building transcriptome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, transcript_fasta_file,
                bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Build Genome alignment index
    #
    index_files = [os.path.join(output_dir, f)
                   for f in config.GENOME_BOWTIE2_FILES]
    skip = all(up_to_date(f, dst_genome_fasta_file) for f in index_files)
    msg = "Building genome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, config.GENOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, dst_genome_fasta_file,
                bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return config.JOB_SUCCESS
def create_chimerascan_index(output_dir, genome_fasta_file, gene_feature_file,
                             bowtie_build_bin):
    """Build the chimerascan alignment index inside *output_dir*.

    Creates a combined FASTA file (reference genome followed by the
    transcript sequences yielded by ``bed12_to_fasta``), builds a bowtie
    index from it, and copies the gene feature file into the index
    directory. Steps whose outputs ``up_to_date`` reports as current are
    skipped.

    Args:
        output_dir: directory receiving the index files (created if missing).
        genome_fasta_file: path to the reference genome FASTA.
        gene_feature_file: path to the gene feature file passed to
            ``bed12_to_fasta``.
        bowtie_build_bin: path to the bowtie-build executable.

    Returns:
        JOB_SUCCESS, or JOB_ERROR when bowtie-build exits with a status
        other than ``os.EX_OK``.
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    index_fai_file = index_fasta_file + ".fai"
    if (up_to_date(index_fasta_file, genome_fasta_file) and
            up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # An index left over from a previous build describes the old
        # combined file; remove it so the genome is indexed afresh before
        # transcript sequences are extracted from it.
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
        # append sequences from gene feature file
        logging.info("Adding transcript sequences to index...")
        with open(index_fasta_file, "a") as fh:
            for fa_record in bed12_to_fasta(gene_feature_file,
                                            index_fasta_file):
                fh.write(str(fa_record) + "\n")
        # remove old fasta index (it covers the genome only)
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            # do not leave a partial index behind that a rerun would trust
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
def _remove_existing_files(paths):
    """Delete every path in *paths* that currently exists."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


def run_chimerascan(runconfig):
    """
    Main function for running the chimerascan pipeline.

    Runs the pipeline stages in order (FASTQ conversion, transcriptome and
    genome alignment, insert size profiling, realignment, discordant read
    classification, clustering, breakpoint realignment, chimera output and
    filtering).  A stage is skipped when ``up_to_date`` reports that its
    outputs are current with respect to its inputs.  When a stage fails,
    its partial outputs are removed and the run stops.

    :param runconfig: run configuration object; this function calls its
        ``check_config()`` and ``to_xml()`` methods and reads its
        attributes (``output_dir``, ``index_dir``, ``fastq_files``, ...)
    :returns: ``config.JOB_SUCCESS`` when every stage completed or was
        skipped, ``config.JOB_ERROR`` otherwise
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" %
                     (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    with open(runconfig_xml_file, "w") as fh:
        print >>fh, xmlstring
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        with open(runconfig.mask_biotypes_file) as fh:
            mask_biotypes.update([line.strip() for line in fh])
        logging.info("\tread biotypes: %s" %
                     (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        with open(runconfig.mask_rnames_file) as fh:
            mask_rnames.update([line.strip() for line in fh])
        logging.info("\tread references: %s" %
                     (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as fh:
        transcripts = list(TranscriptFeature.parse(fh))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    # the first line of the file holds the value
    with open(max_transcriptome_hits_file) as fh:
        max_transcriptome_hits = int(next(fh).strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
    converted_fastq_files = [os.path.join(tmp_dir, fq)
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq) for cfq, fq in
               zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            # the converted reads are gone, so no later stage can run
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = \
        os.path.join(tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = \
        tuple(os.path.join(tmp_dir, fq)
              for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and
        all(up_to_date(a, b) for a, b in
            zip(transcriptome_unaligned_fastq_files,
                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((transcriptome_bam_file,) +
                                   transcriptome_unaligned_fastq_files)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = \
            os.path.splitext(sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = \
        sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file,
                   sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        with open(isize_dist_file, "r") as fh:
            isize_dist = InsertSizeDistribution.from_file(fh)
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh, transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically. Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean,
                             runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean,
                runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        with open(isize_dist_file, "w") as fh:
            isize_dist.to_file(fh)
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = \
        isize_dist.isize_at_percentile(DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    # clamp to [config.MIN_SEGMENT_LENGTH, trimmed_read_length]; the
    # minimum wins if the two bounds conflict
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug("\tAfter adjusting for min %d and read length %d, final segment length is %d" %
                  (config.MIN_SEGMENT_LENGTH, trimmed_read_length,
                   segment_length))
    if runconfig.segment_length is not None:
        logging.debug("\tOverriding auto segment length and using segment length of %d" %
                      (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir,
                                         config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = \
        tuple(os.path.join(tmp_dir, fq)
              for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq)
            for fq in converted_fastq_files) and
        all(up_to_date(a, b) for a, b in
            zip(genome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((genome_bam_file,) +
                                   genome_unaligned_fastq_files)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files) and
        up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(
            index=transcriptome_index,
            transcript_file=transcript_file,
            fastq_files=genome_unaligned_fastq_files,
            bam_file=realigned_bam_file,
            log_file=realigned_log_file,
            tmp_dir=tmp_dir,
            segment_length=segment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            _remove_existing_files((realigned_bam_file,))
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file,
                    unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files(output_files)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = \
            os.path.join(tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((discordant_genome_sam_file,))
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((discordant_genome_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_existing_files((discordant_genome_sam_file,))
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = \
        os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file,
                   discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file,
                   bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = \
        sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file,
                   sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = \
        os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = \
            os.path.join(tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index, transcripts,
            input_file=unpaired_bam_file,
            output_file=unpaired_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((unpaired_genome_sam_file,))
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((unpaired_genome_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_existing_files((unpaired_genome_sam_file,))
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = \
        os.path.join(tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file,
                   unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file,
                   bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = \
        sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file,
                   sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    # every output must be newer than every input for the step to be skipped
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files(output_files)
            # stop here: the next stages need the files just removed
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files(output_files)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file,
                   cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files(output_files)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = \
        os.path.join(tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file,
                   cluster_shelve_file,
                   cluster_pair_file)
    output_files = (spanning_bam_file,
                    spanning_cluster_pair_file)
    skip = all(up_to_date(outp, inp)
               for inp in input_files for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files(output_files)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((spanning_bam_file,))
            return config.JOB_ERROR
        # the intermediate SAM file is no longer needed
        _remove_existing_files((spanning_sam_file,))
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file,
                   sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = \
        os.path.join(runconfig.output_dir,
                     config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file,
                   spanning_cluster_pair_file) and
        up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((unfiltered_chimera_bedpe_file,))
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            _remove_existing_files((chimera_bedpe_file,))
            # return before cleanup so tmp files survive for a rerun
            return config.JOB_ERROR
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
def create_chimerascan_index(output_dir, genome_fasta_file,
                             gene_feature_file, bowtie_build_bin):
    """Build the chimerascan alignment index inside *output_dir*.

    The reference genome is copied into the index directory, the
    transcript sequences produced by ``bed12_to_fasta`` are appended to
    it, the combined FASTA is indexed, a bowtie index is built from it and
    the gene feature file is copied alongside.  Each stage is skipped when
    ``up_to_date`` reports that its output is current.

    :param output_dir: directory that receives the index files (created
        if it does not exist)
    :param genome_fasta_file: path to the reference genome FASTA file
    :param gene_feature_file: path to the gene feature (BED12) file
    :param bowtie_build_bin: path to the ``bowtie-build`` executable
    :returns: ``JOB_SUCCESS``, or ``JOB_ERROR`` when bowtie-build exits
        with a non-zero status
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir
    index_fasta_file = os.path.join(output_dir, ALIGN_INDEX + ".fa")
    index_fai_file = index_fasta_file + ".fai"
    if (up_to_date(index_fasta_file, genome_fasta_file) and
        up_to_date(index_fasta_file, gene_feature_file)):
        logging.info("[SKIPPED] Adding reference genome to index")
    else:
        logging.info("Adding reference genome to index")
        shutil.copyfile(genome_fasta_file, index_fasta_file)
        # a .fai left over from an earlier build describes the previous
        # combined file, so drop it to force a rebuild for the fresh copy
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
        # append sequences from gene feature file; the context manager
        # closes the handle even if sequence extraction raises
        logging.info("Adding transcript sequences to index...")
        with open(index_fasta_file, "a") as fh:
            for fa_record in bed12_to_fasta(gene_feature_file,
                                            index_fasta_file):
                print >> fh, fa_record
        # remove old fasta index (it only covers the genome sequences)
        if os.path.exists(index_fai_file):
            os.remove(index_fai_file)
        # re-index the combined fasta file
        logging.info("Re-indexing FASTA file...")
        fh = pysam.Fastafile(index_fasta_file)
        fh.close()
    # build bowtie index on the reference sequence file
    bowtie_index_file = os.path.join(output_dir, BOWTIE_INDEX_FILE)
    msg = "Building bowtie index"
    if up_to_date(bowtie_index_file, index_fasta_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, ALIGN_INDEX)
        args = [bowtie_build_bin, index_fasta_file, bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("bowtie-build failed to create alignment index")
            # do not leave a partial index that would look up to date
            if os.path.exists(bowtie_index_file):
                os.remove(bowtie_index_file)
            return JOB_ERROR
    # copy gene bed file to index directory
    dst_gene_feature_file = os.path.join(output_dir, GENE_FEATURE_FILE)
    if up_to_date(dst_gene_feature_file, gene_feature_file):
        logging.info("[SKIPPED] Adding transcript features to index...")
    else:
        logging.info("Adding transcript features to index...")
        shutil.copyfile(gene_feature_file, dst_gene_feature_file)
    # NOTE: the tophat splice-junction file and the fragment-size index
    # are no longer built by this function
    logging.info("chimerascan index created successfully")
    return JOB_SUCCESS
def create_chimerascan_index(output_dir, genome_fasta_file,
                             transcript_feature_file):
    """Build the chimerascan genome and transcriptome indexes.

    Copies and indexes the reference genome, writes the transcript
    feature file and transcriptome FASTA produced by
    ``transcript_features_to_fasta``, records the maximum transcript
    overlap, and builds one alignment index for the transcriptome and one
    for the genome with ``config.BOWTIE2_BUILD_BIN``.  Each stage is
    skipped when ``up_to_date`` reports that its outputs are current.

    :param output_dir: directory that receives the index files (created
        if it does not exist)
    :param genome_fasta_file: path to the reference genome FASTA file
    :param transcript_feature_file: path to the transcript feature file
    :returns: ``config.JOB_SUCCESS``, or ``config.JOB_ERROR`` when an
        index build command exits with a non-zero status
    """
    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        logging.info("Created index directory: %s" % (output_dir))
    # copy reference fasta file to output dir and index it
    dst_genome_fasta_file = os.path.join(output_dir,
                                         config.GENOME_FASTA_FILE)
    msg = "Adding reference genome"
    if (up_to_date(dst_genome_fasta_file, genome_fasta_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        shutil.copyfile(genome_fasta_file, dst_genome_fasta_file)
        # drop a .fai left from an earlier build so that the index is
        # regenerated for the file that was just copied
        if os.path.exists(dst_genome_fasta_file + ".fai"):
            os.remove(dst_genome_fasta_file + ".fai")
        # index the genome fasta file
        logging.info("Indexing FASTA file")
        fh = pysam.Fastafile(dst_genome_fasta_file)
        fh.close()
    # add gene sequences to index
    dst_transcript_feature_file = \
        os.path.join(output_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcript_fasta_file = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_FASTA_FILE)
    multimapping_file = os.path.join(output_dir,
                                     config.MAX_MULTIMAPPING_FILE)
    msg = "Building transcriptome sequences and gene features"
    if (up_to_date(dst_transcript_feature_file, transcript_feature_file) and
        up_to_date(transcript_fasta_file, dst_transcript_feature_file) and
        up_to_date(multimapping_file, transcript_feature_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        # write sequences from gene feature file
        logging.info("Adding transcript sequences")
        chrom_transcript_dict = collections.defaultdict(list)
        # nested "with" blocks close both files even if the conversion
        # raises part-way through
        with open(transcript_fasta_file, "w") as fasta_fh:
            with open(dst_transcript_feature_file, "w") as tx_fh:
                for t, fa_record in \
                        transcript_features_to_fasta(transcript_feature_file,
                                                     dst_genome_fasta_file):
                    print >>tx_fh, str(t)
                    print >>fasta_fh, fa_record
                    chrom_transcript_dict[t.chrom].append(t)
        # find maximum transcript overlap as this informs alignment
        # parameters controlling multi-mapping read handling
        max_overlap = 0
        for transcripts in chrom_transcript_dict.values():
            overlap = find_maximum_feature_overlap(transcripts)
            max_overlap = max(max_overlap, overlap)
        logging.info("Maximum transcript overlap is %d" % (max_overlap))
        with open(multimapping_file, "w") as fh:
            print >>fh, max_overlap
        # drop a .fai left from an earlier build before re-indexing
        if os.path.exists(transcript_fasta_file + ".fai"):
            os.remove(transcript_fasta_file + ".fai")
        # index the transcript fasta file
        logging.info("Indexing the Transcriptome FASTA file")
        fh = pysam.Fastafile(transcript_fasta_file)
        fh.close()
    #
    # Build Transcriptome alignment index
    #
    # a list, not a generator: the paths are walked once for the
    # up-to-date check and again for cleanup after a failed build
    index_files = [os.path.join(output_dir, f)
                   for f in config.TRANSCRIPTOME_BOWTIE2_FILES]
    skip = all(up_to_date(f, transcript_fasta_file) for f in index_files)
    msg = "Building transcriptome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir,
                                         config.TRANSCRIPTOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, transcript_fasta_file,
                bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            # remove partial index files so a rerun rebuilds them
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Build Genome alignment index
    #
    index_files = [os.path.join(output_dir, f)
                   for f in config.GENOME_BOWTIE2_FILES]
    skip = all(up_to_date(f, dst_genome_fasta_file) for f in index_files)
    msg = "Building genome index"
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bowtie_index_name = os.path.join(output_dir, config.GENOME_INDEX)
        args = [config.BOWTIE2_BUILD_BIN, dst_genome_fasta_file,
                bowtie_index_name]
        if subprocess.call(args) != os.EX_OK:
            logging.error("Failed to create alignment index")
            # remove partial index files so a rerun rebuilds them
            for f in index_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    logging.info("Chimerascan index created successfully")
    return config.JOB_SUCCESS