def _bin_frags_by_breakpoint(frags, tx5p, tx3p, trim_bp, isize_dist):
    """Group discordant fragment pairs by their candidate breakpoints.

    Each (dr5p, dr3p) pair in `frags` is passed to
    choose_best_breakpoints() and appended to the list of every
    breakpoint that call returns.

    Returns a dict mapping breakpoint -> list of (dr5p, dr3p) tuples.
    """
    breakpoint_dict = collections.defaultdict(list)
    for dr5p, dr3p in frags:
        # given the insert size find the highest probability exon
        # junction breakpoint(s) between the two transcripts; the
        # probability itself (first return value) is not used here
        _, breakpoints = choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                                 trim_bp, isize_dist)
        for bp in breakpoints:
            breakpoint_dict[bp].append((dr5p, dr3p))
    return breakpoint_dict


def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file, 
                      trim_bp, max_read_length, homology_mismatches):
    """Nominate chimera candidates from discordant read pairs.

    Reads discordant fragments grouped by transcript pair from
    `input_file`, bins them into putative breakpoints, extracts the
    breakpoint sequence for each one, and writes one tab-delimited
    BEDPE-style line per (transcript pair, breakpoint) to `output_file`.

    Parameters:
      index_dir           directory containing config.GENE_FEATURE_FILE
                          and the config.ALIGN_INDEX + ".fa" reference
      isize_dist_file     file loaded with InsertSizeDistribution.from_file
      input_file          discordant BEDPE file, consumed by
                          parse_discordant_bedpe_by_transcript_pair
      output_file         path of the file to write (truncated on open)
      trim_bp             forwarded to choose_best_breakpoints
      max_read_length     forwarded to extract_breakpoint_sequence
      homology_mismatches forwarded to extract_breakpoint_sequence

    Returns config.JOB_SUCCESS.  All files opened here are closed even
    when an exception propagates out of the function.
    """
    # read insert size distribution
    with open(isize_dist_file) as isize_fh:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tx_name_gene_map = build_tx_name_gene_map(gene_file, rname_prefix=None)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    chimera_num = 1
    try:
        # group discordant read pairs by gene
        logging.debug("Parsing discordant reads")
        # nested (rather than comma-joined) 'with' keeps Python 2.6 working
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for tx_name_5p, tx_name_3p, frags in \
                        parse_discordant_bedpe_by_transcript_pair(infh):
                    # get gene information
                    tx5p = tx_name_gene_map[tx_name_5p]
                    tx3p = tx_name_gene_map[tx_name_3p]
                    # values that depend only on the transcript pair are
                    # computed once instead of once per breakpoint
                    ref_name_5p = config.GENE_REF_PREFIX + tx5p.tx_name
                    ref_name_3p = config.GENE_REF_PREFIX + tx3p.tx_name
                    tx3p_length = sum((end - start) 
                                      for start, end in tx3p.exons)
                    # collapse any whitespace in gene names to '_' so the
                    # names cannot break the tab-delimited layout
                    gene_name_5p = '_'.join(tx5p.gene_name.split())
                    gene_name_3p = '_'.join(tx3p.gene_name.split())
                    # bin fragments into putative breakpoints
                    breakpoint_dict = _bin_frags_by_breakpoint(
                        frags, tx5p, tx3p, trim_bp, isize_dist)
                    # iterate through breakpoints and build chimera candidates
                    for bp, bp_frags in breakpoint_dict.items():
                        exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = bp
                        (breakpoint_seq_5p, breakpoint_seq_3p,
                         homology_left, homology_right) = \
                            extract_breakpoint_sequence(ref_name_5p, tx_end_5p,
                                                        ref_name_3p, tx_start_3p,
                                                        ref_fa, max_read_length,
                                                        homology_mismatches)
                        # get unique breakpoint id based on sequence
                        breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
                        breakpoint_name = \
                            breakpoint_seq_name_map.get(breakpoint_seq)
                        if breakpoint_name is None:
                            breakpoint_name = "B%07d" % (breakpoint_num)
                            breakpoint_seq_name_map[breakpoint_seq] = \
                                breakpoint_name
                            breakpoint_num += 1
                        # write gene, breakpoint, and raw reads to a file
                        # and follow the BEDPE format
                        fields = [
                            # chrom1, start1, end1
                            tx5p.tx_name, 0, tx_end_5p,
                            # chrom2, start2, end2
                            tx3p.tx_name, tx_start_3p, tx3p_length,
                            "C%07d" % (chimera_num),  # name
                            1.0,  # pvalue
                            tx5p.strand, tx3p.strand,  # strand1, strand2
                            gene_name_5p, gene_name_3p,  # gene names
                            # exon interval information
                            '%d-%d' % (0, exon_num_5p),
                            '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                            # breakpoint information
                            breakpoint_name,
                            breakpoint_seq_5p, breakpoint_seq_3p,
                            homology_left, homology_right,
                            # fragments
                            frags_to_encomp_string(bp_frags),
                            # spanning reads: none known at this stage, the
                            # placeholder is written out as the text "None"
                            None,
                        ]
                        outfh.write('\t'.join(map(str, fields)) + '\n')
                        chimera_num += 1
    finally:
        ref_fa.close()
    return config.JOB_SUCCESS
def _transcript_interval_to_genome(tx_name, tx_start, tx_end, 
                                   transcript_genome_map):
    """Convert the ends of a transcript interval to genome positions.

    Looks up the first position (tx_start) and the last position
    (tx_end - 1) of the interval with transcript_to_genome_pos().

    Returns a (chrom, strand, start, end) tuple; chrom and strand are
    taken from the lookup of the last position.
    """
    _, _, start = transcript_to_genome_pos(tx_name, tx_start, 
                                           transcript_genome_map)
    chrom, strand, end = transcript_to_genome_pos(tx_name, tx_end - 1, 
                                                  transcript_genome_map)
    # strand 1 is the one reported as "-" in the output; the two
    # positions are swapped for it, presumably so that start <= end in
    # genome coordinates -- TODO confirm
    if strand == 1:
        start, end = end, start
    return chrom, strand, start, end


def write_output(input_file, bam_file, output_file, index_dir):
    """Cluster chimera isoforms and write the final tab-delimited report.

    For every group yielded by get_chimera_groups() the chimera chosen
    by get_best_coverage_chimera() supplies the coordinates, counts and
    spanning reads, while transcript ids, gene names and chimera names
    are pooled from all members of the group.  Rows are sorted in
    descending order by unique alignment positions, then spanning
    fragments, then total fragments, and written under a header line.

    Parameters:
      input_file   chimera file passed to get_chimera_groups
      bam_file     BAM file opened with pysam and passed to
                   get_wildtype_frags to count wild-type fragments
      output_file  path of the report to write
      index_dir    directory containing config.GENE_FEATURE_FILE

    Returns config.JOB_SUCCESS.  The gene file, BAM file and output file
    are closed even when an exception propagates out of the function.
    """
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    # build a lookup table to get genome coordinates from transcript 
    # coordinates
    with open(gene_file) as gene_fh:
        transcript_genome_map = build_transcript_genome_map(gene_fh)
    tx_name_gene_map = build_tx_name_gene_map(gene_file)
    genome_tx_trees = build_genome_tx_trees(gene_file)
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    lines = []
    chimera_clusters = 0
    try:
        # group chimera isoforms together
        for key, chimeras in get_chimera_groups(input_file, gene_file):
            txs5p = set()
            txs3p = set()
            genes5p = set()
            genes3p = set()
            names = set()
            for member in chimeras:
                txs5p.add("%s:%d-%d" % (member.tx_name_5p, 
                                        member.tx_start_5p, 
                                        member.tx_end_5p-1))
                txs3p.add("%s:%d-%d" % (member.tx_name_3p, 
                                        member.tx_start_3p, 
                                        member.tx_end_3p-1))
                genes5p.add(member.gene_name_5p)
                genes3p.add(member.gene_name_3p)
                names.add(member.name)
            # the best-coverage chimera represents the whole cluster
            best = get_best_coverage_chimera(chimeras)
            # get chimera type and distance between genes
            chimera_type, distance = \
                get_chimera_type(tx_name_gene_map[best.tx_name_5p],
                                 tx_name_gene_map[best.tx_name_3p],
                                 genome_tx_trees)
            # get genomic positions of chimera
            chrom5p, strand5p, start5p, end5p = \
                _transcript_interval_to_genome(best.tx_name_5p, 
                                               best.tx_start_5p, 
                                               best.tx_end_5p, 
                                               transcript_genome_map)
            chrom3p, strand3p, start3p, end3p = \
                _transcript_interval_to_genome(best.tx_name_3p, 
                                               best.tx_start_3p, 
                                               best.tx_end_3p, 
                                               transcript_genome_map)
            # get breakpoint spanning sequences, keeping only the first
            # read seen for each distinct sequence
            spanning_seqs = set()
            spanning_fasta_lines = []
            for dr in best.get_spanning_reads():
                if dr.seq in spanning_seqs:
                    continue
                spanning_seqs.add(dr.seq)
                spanning_fasta_lines.extend([">%s/%d;pos=%d;strand=%s" % 
                                             (dr.qname, dr.readnum+1, dr.pos, 
                                              "-" if dr.is_reverse else "+"), 
                                             dr.seq])
            # get isoform fraction: chimeric / (chimeric + wild-type)
            num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(best, bamfh)
            num_chimeric_frags = best.get_num_frags()
            frac5p = (float(num_chimeric_frags) / 
                      (num_chimeric_frags + num_wt_frags_5p))
            frac3p = (float(num_chimeric_frags) / 
                      (num_chimeric_frags + num_wt_frags_3p))
            # setup fields of BEDPE file; the sort below relies on the
            # positions of columns 16-18, keep them in sync with it
            fields = [chrom5p, start5p, end5p,
                      chrom3p, start3p, end3p,
                      "CLUSTER%d" % (chimera_clusters),
                      num_chimeric_frags,  # score
                      "+" if (strand5p == 0) else "-",
                      "+" if (strand3p == 0) else "-",
                      ','.join(txs5p),
                      ','.join(txs3p),
                      ','.join(genes5p),
                      ','.join(genes3p),
                      chimera_type, distance,
                      num_chimeric_frags,  # total_frags
                      best.get_num_spanning_frags(),
                      best.get_num_unique_positions(),
                      frac5p, frac3p,
                      ','.join(spanning_fasta_lines),
                      ','.join(names)]
            lines.append(fields)
            chimera_clusters += 1
    finally:
        bamfh.close()
    logging.debug("Clustered chimeras: %d", chimera_clusters)
    # sort by unique_alignment_positions (18), spanning_frags (17) and
    # total_frags (16), largest first
    lines.sort(key=operator.itemgetter(18, 17, 16), reverse=True)
    header = ['#chrom5p', 'start5p', 'end5p', 
              'chrom3p', 'start3p', 'end3p',
              'chimera_cluster_id', 'score', 
              'strand5p', 'strand3p',
              'transcript_ids_5p', 'transcript_ids_3p',
              'genes5p', 'genes3p',
              'type', 'distance',
              'total_frags', 
              'spanning_frags',
              'unique_alignment_positions',
              'isoform_fraction_5p',
              'isoform_fraction_3p',
              'breakpoint_spanning_reads',
              'chimera_ids']
    with open(output_file, "w") as f:
        f.write('\t'.join(header) + '\n')
        for fields in lines:
            f.write('\t'.join(map(str, fields)) + '\n')
    return config.JOB_SUCCESS