def main():
    import logging
    import sys
    import pysam
    from optparse import OptionParser
    # InsertSizeDistribution is provided by the chimerascan library;
    # its import is omitted in this excerpt
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <bam> <out.bedpe>")
    parser.add_option('--min-fragment-length', dest="min_fragment_length", 
                      type="int", default=0)
    parser.add_option('--max-fragment-length', dest="max_fragment_length", 
                      type="int", default=1000)
    parser.add_option('--max-samples', dest="max_samples", 
                      type="int", default=None)
    parser.add_option('-o', dest="output_file", default=None) 
    options, args = parser.parse_args()
    input_bam_file = args[0]
    bamfh = pysam.Samfile(input_bam_file, "rb")
    isizedist = InsertSizeDistribution.from_bam(bamfh, options.min_fragment_length, 
                                                options.max_fragment_length, 
                                                options.max_samples)
    bamfh.close()
    if options.output_file is not None:
        f = open(options.output_file, "w")
    else:
        f = sys.stdout
    isizedist.to_file(f)
    if options.output_file is not None:
        f.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % 
                 (isizedist.n, isizedist.mean(), isizedist.std(), 
                  isizedist.percentile(50.0), isizedist.mode()))
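For reference, here is a minimal sketch of the InsertSizeDistribution interface the scripts in this listing rely on, assuming a simple histogram-backed store. This is an illustration only, not the chimerascan implementation (which also provides the from_bam/from_file constructors used above).

import math

# Hypothetical, histogram-backed sketch of the InsertSizeDistribution
# interface used in these scripts (illustration only).
class InsertSizeDistributionSketch(object):
    def __init__(self, counts):
        # counts: dict mapping insert size -> number of fragments observed
        self.counts = dict(counts)
        self.n = sum(self.counts.values())

    def mean(self):
        return sum(k * v for k, v in self.counts.items()) / float(self.n)

    def std(self):
        mu = self.mean()
        var = sum(v * (k - mu) ** 2
                  for k, v in self.counts.items()) / float(self.n)
        return math.sqrt(var)

    def mode(self):
        # insert size observed most often
        return max(self.counts, key=self.counts.get)

    def percentile(self, per):
        # smallest insert size at which the cumulative count reaches
        # 'per' percent of all samples
        target = self.n * per / 100.0
        total = 0
        for isize in sorted(self.counts):
            total += self.counts[isize]
            if total >= target:
                return isize

    def to_file(self, fileh):
        # plain two-column text format: insert_size <tab> count
        for isize in sorted(self.counts):
            fileh.write("%d\t%d\n" % (isize, self.counts[isize]))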
def main():
    import logging
    from optparse import OptionParser
    # InsertSizeDistribution and resolve_discordant_reads come from the
    # chimerascan library; their imports are omitted in this excerpt
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.txt> <out.txt> <isizedist.txt>")
    parser.add_option("--min-isize-prob", dest="min_isize_prob", 
                      type="float", default=0.01)
    options, args = parser.parse_args()
    input_file = args[0]
    output_file = args[1]
    isize_dist_file = args[2]
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    resolve_discordant_reads(input_file, output_file, isize_dist, 
                             options.min_isize_prob,
                             tmp_dir=".")
def main():
    import sys
    # NOTE: this short-circuits directly into calc_chimera_pvalues using
    # positional command-line arguments; everything below the early
    # return (the OptionParser path) is unreachable as written
    calc_chimera_pvalues(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4]))
    return
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.txt> <bowtie.log> <isizedist.txt>")
    parser.add_option("--min-isize-prob", dest="min_isize_prob", 
                      type="float", default=0.01)
    options, args = parser.parse_args()
    input_file = args[0]
    bowtie_log_file = args[1]
    isize_dist_file = args[2]
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    calc_percent_discordant_reads(input_file, bowtie_log_file, isize_dist,
                                  min_isize_prob=options.min_isize_prob, 
                                  tmp_dir=".")
def discordant_reads_to_breakpoints(index_dir, isize_dist_file, input_bam_file,
                                    output_file, trim_bp, max_read_length,
                                    homology_mismatches):
    """
    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence
    
    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tid_tx_map = build_tid_tx_map(bamfh,
                                  gene_file,
                                  rname_prefix=config.GENE_REF_PREFIX)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # iterate through read pairs
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for r5p, r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in a lightweight DiscordantRead
        # object; this departs from the SAM format into a custom read
        # format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # given the insert size find the highest probability
        # exon junction breakpoint between the two transcripts
        isize_prob, breakpoints = \
            choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                    trim_bp, isize_dist)
        # extract the sequence of the breakpoint along with the
        # number of homologous bases at the breakpoint between
        # chimera and wildtype genes
        for breakpoint in breakpoints:
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                            config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            # write breakpoint information for each read to a file
            fields = [
                tx5p.tx_name,
                0,
                tx_end_5p,
                tx3p.tx_name,
                tx_start_3p,
                tx3p.tx_end,
                r5p.rname,  # name
                isize_prob,  # score
                tx5p.strand,
                tx3p.strand,  # strand 1, strand 2
                # user defined fields
                exon_num_5p,
                exon_num_3p,
                breakpoint_seq_5p,
                breakpoint_seq_3p,
                homology_left,
                homology_right
            ]
            fields.append('|'.join(map(str, dr5p.to_list())))
            fields.append('|'.join(map(str, dr3p.to_list())))
            print >> outfh, '\t'.join(map(str, fields))
    # cleanup
    ref_fa.close()
    outfh.close()
    bamfh.close()
    return config.JOB_SUCCESS
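A hypothetical invocation of the function above; every path and parameter value here is a placeholder, not a default from the source:

if __name__ == "__main__":
    retcode = discordant_reads_to_breakpoints(
        index_dir="chimerascan_index",          # placeholder
        isize_dist_file="isize_dist.txt",       # placeholder
        input_bam_file="discordant_reads.bam",  # placeholder
        output_file="breakpoints.txt",          # placeholder
        trim_bp=3,
        max_read_length=100,
        homology_mismatches=2)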
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update(
            [line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update(
            [line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(
        open(max_transcriptome_hits_file).next().strip())
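    # (file.next() is the Python 2 iterator protocol: this reads just the
    # first line of the max-multimapping file)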
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
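    # illustrative example (hypothetical read name): the pair
    #   @HWI-ST123:4:1101:10000:20000/1 and .../2
    # would be rewritten as @37/1 and @37/2, with the mapping
    # "37 -> HWI-ST123:4:1101:10000:20000" stored in read_name_file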
    converted_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES
    ]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(
        up_to_date(cfq, fq)
        for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(
            up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and all(
                up_to_date(a, b)
                for a, b in zip(transcriptome_unaligned_fastq_files,
                                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(
            sorted_transcriptome_bam_file)[0]
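        # legacy pysam.sort call style: samtools-like positional arguments
        # (-m sets memory), the input BAM, then an output *prefix* to which
        # the ".bam" extension is appended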
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file,
                   sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(
            open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh,
            transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean,
                runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug(
        "\tAfter adjusting for min %d and read length %d, final segment length is %d"
        % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug(
            "\tOverriding auto segment length and using segment length of %d" %
            (runconfig.segment_length))
        segment_length = runconfig.segment_length
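    # worked example with illustrative numbers: an insert size of 210 at
    # the chosen percentile gives round(210 / 3.0) = 70; a trimmed read
    # length of 50 caps this at 50, and a MIN_SEGMENT_LENGTH of 25 leaves
    # the final segment length at 50 (unless overridden above)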
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files)
            and all(
                up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files,
                                                 converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(
            up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files)
            and up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index,
            transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file,
                   discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file,
                   sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir,
                                            config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index,
                                          transcripts,
                                          input_file=unpaired_bam_file,
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file,
                   sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file, )
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file, cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file, )
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file)
            and up_to_date(unfiltered_chimera_bedpe_file,
                           cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
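Every stage above is gated on up_to_date() checks so that a rerun skips work whose outputs are newer than their inputs. A minimal sketch of such a helper, assuming a plain mtime comparison; the actual chimerascan helper may differ:

import os

def up_to_date(output_file, input_file):
    # an output is considered current if it exists, is non-empty, and is
    # no older than the input it was derived from
    if not os.path.exists(output_file):
        return False
    if os.path.getsize(output_file) == 0:
        return False
    return os.path.getmtime(output_file) >= os.path.getmtime(input_file)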
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file, 
                      trim_bp, max_read_length, homology_mismatches):
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)    
    tx_name_gene_map = build_tx_name_gene_map(gene_file, rname_prefix=None)
    #genome_tx_trees = build_genome_tx_trees(gene_file)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")    
    for tx_name_5p, tx_name_3p, frags in parse_discordant_bedpe_by_transcript_pair(open(input_file)):
        # get gene information
        tx5p = tx_name_gene_map[tx_name_5p]
        tx3p = tx_name_gene_map[tx_name_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(list)
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability 
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p, 
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))        
        # iterate through breakpoints and build chimera candidates
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                            config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)                
            tx3p_length = sum((end - start) for start,end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            gene_name_5p = '_'.join(tx5p.gene_name.split())
            gene_name_3p = '_'.join(tx3p.gene_name.split())
            fields = [tx5p.tx_name, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_name, tx_start_3p, tx3p_length, # chrom2, start2, end2
                      "C%07d" % (chimera_num), # name
                      1.0, # pvalue
                      tx5p.strand, tx3p.strand, # strand1, strand2
                      gene_name_5p, gene_name_3p, # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name, 
                      breakpoint_seq_5p, breakpoint_seq_3p, 
                      homology_left, homology_right, 
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads
                      None]
            print >>outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
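The fields list above follows an extended BEDPE layout, as annotated in the code comments. A hypothetical output row with purely illustrative values:

# chrom1 start1 end1   chrom2 start2 end2   name      pvalue  strand1 strand2 ...
# TX_A   0      1420   TX_B   89     5610   C0000001  1.0     +       -
#   GENE_A GENE_B 0-4 2-12 B0000001 <seq5p> <seq3p> 2 3 <encompassing frags> None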
示例#9
0
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))        
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >>fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update([line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update([line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir, 
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length)
    # 
    # Process and inspect the FASTQ files, performing several alterations 
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names 
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    # 
    converted_fastq_files = [os.path.join(tmp_dir, fq) 
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq) for cfq,fq in 
               zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files, 
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many 
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(transcriptome_index=transcriptome_index,
                                                 genome_index=genome_index,
                                                 transcript_file=transcript_file,     
                                                 fastq_files=converted_fastq_files,
                                                 unaligned_path=transcriptome_unaligned_path,
                                                 bam_file=transcriptome_bam_file,
                                                 log_file=log_file,
                                                 library_type=runconfig.library_type,
                                                 min_fragment_length=min_fragment_length,
                                                 max_fragment_length=runconfig.max_fragment_length,
                                                 max_transcriptome_hits=max_transcriptome_hits,
                                                 num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(runconfig.output_dir, 
                                                 config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir, 
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(bamfh, transcripts, 
                                                            min_isize=min_fragment_length, 
                                                            max_isize=runconfig.max_fragment_length, 
                                                            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" % 
                            (runconfig.isize_mean, 
                             runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(runconfig.isize_mean, 
                                                            runconfig.isize_stdev, 
                                                            min_isize=runconfig.min_fragment_length,
                                                            max_isize=runconfig.max_fragment_length,
                                                            samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % 
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(), 
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))    
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" % 
                 (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug("\tAfter adjusting for min %d and read length %d, final segment length is %d" % 
                 (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug("\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(genome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(index=genome_index,
                                   fastq_files=transcriptome_unaligned_fastq_files,
                                   unaligned_path=genome_unaligned_path,
                                   bam_file=genome_bam_file,
                                   log_file=log_file,
                                   library_type=runconfig.library_type,
                                   min_fragment_length=min_fragment_length,
                                   max_fragment_length=runconfig.max_fragment_length,
                                   max_hits=max_transcriptome_hits,
                                   num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and
        up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
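    # one BAM per fragment category; the step is skipped only if every
    # output is newer than the realigned BAM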
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(transcripts=transcripts,
                                            input_bam_file=realigned_bam_file,
                                            paired_bam_file=paired_bam_file,
                                            discordant_bam_file=discordant_bam_file,
                                            unpaired_bam_file=unpaired_bam_file,
                                            unmapped_bam_file=unmapped_bam_file,
                                            multimap_bam_file=multimap_bam_file,
                                            unresolved_bam_file=unresolved_bam_file,
                                            max_isize=runconfig.max_fragment_length,
                                            max_multihits=runconfig.max_multihits,
                                            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        discordant_genome_sam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=discordant_bam_file, 
                                          output_file=discordant_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
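        # writes the '.bai' index file checked for above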
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        unpaired_genome_sam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=unpaired_bam_file, 
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)        
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir, 
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,                      
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = all(up_to_date(output_file, input_file)
               for input_file in input_files
               for output_file in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = cluster_discordant_reads(discordant_bam_file=sorted_discordant_genome_bam_file, 
                                           unpaired_bam_file=sorted_unpaired_genome_bam_file, 
                                           concordant_bam_file=sorted_transcriptome_bam_file, 
                                           output_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_file=cluster_file,
                                           cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = pair_discordant_clusters(discordant_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_pair_file=cluster_pair_file, 
                                           tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file, 
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = all(up_to_date(outp, inp)
               for inp in input_files
               for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = realign_across_breakpoints(index_dir=runconfig.index_dir,
                                             discordant_bam_file=sorted_discordant_genome_bam_file,
                                             unpaired_bam_file=sorted_unpaired_genome_bam_file,
                                             cluster_shelve_file=cluster_shelve_file,
                                             cluster_pair_file=cluster_pair_file,
                                             breakpoint_bam_file=breakpoint_bam_file,
                                             log_dir=log_dir,
                                             tmp_dir=tmp_dir,
                                             num_processors=runconfig.num_processors,
                                             local_anchor_length=runconfig.local_anchor_length,
                                             local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file,
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (spanning_bam_file,
                    spanning_cluster_pair_file)
    skip = all(up_to_date(outp, inp)
               for inp in input_files
               for outp in output_files)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(cluster_shelve_file=cluster_shelve_file,
                                              cluster_pair_file=cluster_pair_file,
                                              bam_file=breakpoint_bam_file,                                              
                                              output_sam_file=spanning_sam_file,
                                              output_cluster_pair_file=spanning_cluster_pair_file,
                                              local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    # 
    unfiltered_chimera_bedpe_file = os.path.join(runconfig.output_dir, 
                                                 config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and
        up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):                
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts, 
                               cluster_shelve_file=cluster_shelve_file, 
                               cluster_pair_file=spanning_cluster_pair_file, 
                               read_name_file=read_name_file, 
                               output_file=unfiltered_chimera_bedpe_file, 
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:        
        logging.info(msg)
        retcode = filter_chimeras(input_file=unfiltered_chimera_bedpe_file, 
                                  output_file=chimera_bedpe_file,
                                  filter_num_frags=runconfig.filter_num_frags,
                                  filter_allele_fraction=runconfig.filter_allele_fraction,
                                  mask_biotypes=mask_biotypes,
                                  mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    # 
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    # 
    logging.info("Finished run.")
    return config.JOB_SUCCESS
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
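    """
    Group discordant fragments by transcript pair, choose the most likely
    exon junction breakpoints given the insert size distribution, and
    write one BEDPE record per chimera candidate to 'output_file'.
    """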
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_id_5p, tx_id_3p, frags in parse_discordant_bedpe_by_transcript_pair(
            open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(list)
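        # maps breakpoint -> list of (5' read, 3' read) pairs supporting it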
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            tx3p_length = sum((end - start) for start, end in tx3p.exons)
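            # total exonic length of the 3' transcript, used as end2 below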
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
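            # collapse internal whitespace in gene names to underscores so
            # the tab-delimited BEDPE columns stay intact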
            gene_names_5p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx5p.gene_names])))
            gene_names_3p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx3p.gene_names])))
            fields = [
                tx5p.tx_id,
                0,
                tx_end_5p,  # chrom1, start1, end1
                tx3p.tx_id,
                tx_start_3p,
                tx3p_length,  # chrom2, start2, end2
                "C%07d" % (chimera_num),  # name
                1.0,  # pvalue
                tx5p.strand,
                tx3p.strand,  # strand1, strand2
                gene_names_5p,
                gene_names_3p,  # gene names
                # exon interval information
                '%d-%d' % (0, exon_num_5p),
                '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                # breakpoint information
                breakpoint_name,
                breakpoint_seq_5p,
                breakpoint_seq_3p,
                homology_left,
                homology_right,
                # fragments
                frags_to_encomp_string(frags),
                # spanning reads
                None
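                # placeholder; spanning read support is presumably filled
                # in by a later stage of the pipeline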
            ]
            print >> outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
def discordant_reads_to_breakpoints(index_dir, isize_dist_file, 
                                    input_bam_file, output_file, 
                                    trim_bp, max_read_length,
                                    homology_mismatches):                      
    """
    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence
    
    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'
    """   
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                  rname_prefix=config.GENE_REF_PREFIX)
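    # tid_tx_map: BAM reference id (tid) -> transcript feature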
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # iterate through read pairs
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for r5p,r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in lightweight structure called
        # DiscordantRead object. this departs from SAM format into a 
        # custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # given the insert size find the highest probability 
        # exon junction breakpoint between the two transcripts
        isize_prob, breakpoints = \
            choose_best_breakpoints(r5p, r3p, tx5p, tx3p, 
                                    trim_bp, isize_dist)
        # extract the sequence of the breakpoint along with the
        # number of homologous bases at the breakpoint between 
        # chimera and wildtype genes
        for breakpoint in breakpoints:
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                            config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            # write breakpoint information for each read to a file
            fields = [tx5p.tx_name, 0, tx_end_5p,
                      tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                      r5p.rname,  # name
                      isize_prob, # score
                      tx5p.strand, tx3p.strand, # strand 1, strand 2
                      # user defined fields
                      exon_num_5p, exon_num_3p,
                      breakpoint_seq_5p, breakpoint_seq_3p, 
                      homology_left, homology_right] 
            fields.append('|'.join(map(str, dr5p.to_list())))
            fields.append('|'.join(map(str, dr3p.to_list())))  
            print >>outfh, '\t'.join(map(str, fields))        
    # cleanup
    ref_fa.close()
    outfh.close()
    bamfh.close()
    return config.JOB_SUCCESS