Пример #1
0
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """
    Filter candidate chimeras and write the survivors to output_file.

    Every chimera parsed from input_file has to pass, in this order:
    filter_weighted_frags, filter_inner_dist, absence from the false
    positive list, and filter_chimeric_isoform_fraction (which is given
    the open BAM handle).  Survivors are staged in a temporary file in
    the directory of output_file; only the chimeras whose names are
    returned by get_highest_coverage_isoforms are copied to output_file.

    false_pos_file may be None or an empty string, in which case no
    false positive list is applied.

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; truthiness covers both None and ""
    # (the previous 'is not ""' compared object identity, not value)
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            logging.debug("Checking chimeras")
            with open(input_file) as infh:
                with open(tmp_file, "w") as f:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        # checked last because it needs the BAM file
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        print >> f, '\t'.join(map(str, c.to_list()))
                        num_filtered_chimeras += 1
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(tmp_file) as infh:
            with open(output_file, "w") as f:
                for c in Chimera.parse(infh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        print >> f, '\t'.join(map(str, c.to_list()))
        logging.debug("\tAfter choosing best isoform: %d" %
                      num_filtered_chimeras)
    finally:
        # remove the staging file on failure as well as on success
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
Пример #2
0
def resolve_chimeric_reads(input_file, output_file, isize_dist):
    """
    Assign each encompassing read to its best-ranked chimera(s) and
    write the edited chimeras to output_file.

    A read (identified by qname) can support several chimeras.  For each
    read the candidate chimeras are ranked by the tuple
    (total unique reads, unique spanning reads, merged multimap
    histogram..., insert size probability, -mismatches); only the
    chimeras sharing the highest tuple keep the read.  input_file is
    parsed twice: once to gather statistics and once to rewrite each
    chimera's encomp_read_pairs.
    """
    # gather statistics on read alignments and chimeras
    # that will be used to associate reads with chimeras
    read_chimera_dict = collections.defaultdict(list)
    chimera_stats_dict = {}
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # combine multimap hists
            mmap_hist = merge_mmap_hists(c.partner5p.multimap_hist,
                                         c.partner3p.multimap_hist)
            chimera_stats_dict[c.name] = (c.get_total_unique_reads(),
                                          c.get_unique_spanning_reads(),
                                          mmap_hist)
            # get statistics for reads
            for dpair in c.encomp_read_pairs:
                qname = dpair[0].qname
                # find insert size probability of read
                isize_prob = calc_isize_prob(dpair, c, isize_dist)
                # find number of mismatches
                mismatches = dpair[0].mismatches + dpair[1].mismatches
                read_chimera_dict[qname].append(
                    (c.name, isize_prob, mismatches))
    # now process one read at a time, looking at all its alignments and
    # choose the most likely set of alignments.  The values must be sets:
    # qnames are inserted with .add() and later used for membership tests.
    encomp_qnames_dict = collections.defaultdict(set)
    for qname, read_stats_list in read_chimera_dict.iteritems():
        stats_chimera_dict = collections.defaultdict(list)
        for chimera_name, isize_prob, mismatches in read_stats_list:
            # get chimera stats
            unique_reads, spanning_reads, mmap_hist = chimera_stats_dict[
                chimera_name]
            # make a key to sort on
            key = (unique_reads, spanning_reads) + tuple(mmap_hist) + (
                isize_prob, -mismatches)
            stats_chimera_dict[key].append(chimera_name)
        # use only the best (largest) key
        best_key = max(stats_chimera_dict)
        for chimera_name in stats_chimera_dict[best_key]:
            encomp_qnames_dict[chimera_name].add(qname)
    # now edit the chimeras using the modified qnames
    with open(input_file) as infh:
        with open(output_file, "w") as f:
            for c in Chimera.parse(infh):
                qnames = encomp_qnames_dict[c.name]
                # update encompassing reads
                c.encomp_read_pairs = [pair for pair in c.encomp_read_pairs
                                       if pair[0].qname in qnames]
                # NOTE(review): the previous code opened output_file but
                # never wrote to it; records are emitted in the same
                # tab-delimited to_list() form used elsewhere in this
                # file -- confirm that chimeras left without encompassing
                # reads should still be written.
                print >> f, '\t'.join(map(str, c.to_list()))
def resolve_chimeric_reads(input_file, output_file, isize_dist):
    """
    Resolve reads that support more than one chimera, then write the
    edited chimeras to output_file.

    Pass 1 over input_file records, per chimera, its total unique reads,
    unique spanning reads and merged multimap histogram, and, per read
    qname, the (chimera name, insert size probability, mismatches) of
    every encompassing pair.  Each read is then kept only by the
    chimera(s) with the largest ranking tuple.  Pass 2 over input_file
    filters each chimera's encomp_read_pairs accordingly and writes it.
    """
    # gather statistics on read alignments and chimeras
    # that will be used to associate reads with chimeras
    read_chimera_dict = collections.defaultdict(list)
    chimera_stats_dict = {}
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # combine multimap hists
            mmap_hist = merge_mmap_hists(c.partner5p.multimap_hist,
                                         c.partner3p.multimap_hist)
            chimera_stats_dict[c.name] = (c.get_total_unique_reads(),
                                          c.get_unique_spanning_reads(),
                                          mmap_hist)
            # get statistics for reads
            for dpair in c.encomp_read_pairs:
                qname = dpair[0].qname
                # find insert size probability of read
                isize_prob = calc_isize_prob(dpair, c, isize_dist)
                # find number of mismatches
                mismatches = dpair[0].mismatches + dpair[1].mismatches
                read_chimera_dict[qname].append(
                    (c.name, isize_prob, mismatches))
    # now process one read at a time, looking at all its alignments and
    # choose the most likely set of alignments.  A set is required here:
    # the original list default had no .add() method and raised
    # AttributeError on the first read.
    encomp_qnames_dict = collections.defaultdict(set)
    for qname, read_stats_list in read_chimera_dict.iteritems():
        stats_chimera_dict = collections.defaultdict(list)
        for chimera_name, isize_prob, mismatches in read_stats_list:
            # get chimera stats
            unique_reads, spanning_reads, mmap_hist = \
                chimera_stats_dict[chimera_name]
            # make a key to sort on
            key = ((unique_reads, spanning_reads) + tuple(mmap_hist) +
                   (isize_prob, -mismatches))
            stats_chimera_dict[key].append(chimera_name)
        # use only the best (largest) key
        best_key = max(stats_chimera_dict)
        for chimera_name in stats_chimera_dict[best_key]:
            encomp_qnames_dict[chimera_name].add(qname)
    # now edit the chimeras using the modified qnames
    with open(input_file) as infh:
        with open(output_file, "w") as f:
            for c in Chimera.parse(infh):
                qnames = encomp_qnames_dict[c.name]
                # update encompassing reads
                c.encomp_read_pairs = [pair for pair in c.encomp_read_pairs
                                       if pair[0].qname in qnames]
                # NOTE(review): output_file used to be left empty; the
                # record format below mirrors the other writers in this
                # file -- confirm it is what downstream steps expect.
                print >> f, '\t'.join(map(str, c.to_list()))
Пример #4
0
def get_highest_coverage_isoforms(input_file, gene_file):
    """
    Return the set of chimera names to keep after isoform selection.

    Chimeras parsed from input_file are grouped by (5' cluster,
    3' cluster, 5' genomic breakpoint, 3' genomic breakpoint), where the
    clusters and genomic coordinates come from lookup tables built from
    gene_file.  Within each group, the chimeras whose
    (unique spanning positions, weighted coverage, fragment count) tuple
    is the largest are kept; ties keep every tied chimera.
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as fh:
        tx_cluster_map = build_tx_cluster_map(fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as fh:
        tx_genome_map = build_gene_to_genome_map(fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as fh:
        for c in Chimera.parse(fh):
            key = (c.name,
                   c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(),
                   c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1, tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start, tx_genome_map)
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p,
                                  coord5p, coord3p)].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for stats_list in cluster_chimera_dict.itervalues():
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            # index chimera names by their score tuple
            stats_dict[stats_info[1:]].add(stats_info[0])
        # every group holds at least one chimera, so max() is safe
        kept_chimeras.update(stats_dict[max(stats_dict)])
    return kept_chimeras
Пример #5
0
def get_highest_coverage_isoforms(input_file, gene_file):
    """
    Pick, for every breakpoint group, the best-supported chimera(s).

    A group is identified by the cluster ids of both partner transcripts
    plus the genomic coordinates of both breakpoints (lookup tables are
    built from gene_file).  Chimeras are scored by the tuple
    (unique spanning positions, weighted coverage, fragment count) and
    the names carrying the highest score in each group are returned as a
    set; tied chimeras are all kept.
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as fh:
        tx_cluster_map = build_tx_cluster_map(fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as fh:
        tx_genome_map = build_gene_to_genome_map(fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as fh:
        for c in Chimera.parse(fh):
            key = (c.name, c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(), c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p, coord5p,
                                  coord3p)].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for stats_list in cluster_chimera_dict.itervalues():
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            # index chimera names by their score tuple
            stats_dict[stats_info[1:]].add(stats_info[0])
        # groups are never empty, so the maximum always exists
        best_score = max(stats_dict)
        kept_chimeras.update(stats_dict[best_score])
    return kept_chimeras
def main():
    """
    Command-line entry point that builds a false positive chimera list.

    Counts, over all chimera files given as positional arguments, how
    many records share the same (5' transcript, 5' end, 3' transcript,
    3' start) key, then writes every key seen at least -n times as a
    tab-delimited line to the -o file (stdout when -o is omitted).
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.txt> [<chimeras2.txt> <chimeras3.txt> ...]")
    parser.add_option("-o", dest="output_file", default=None,
                      help="output file [default=stdout]")
    parser.add_option("-n", dest="num_files", type="int", default=1,
                      help="chimera must be recurrent in N samples "
                      "to make considered a false positive "
                      "[default=%default]")
    options, args = parser.parse_args()
    input_files = args
    false_pos_chimeras = collections.defaultdict(int)
    for input_file in input_files:
        logging.info("Processing file %s" % (input_file))
        num_chimeras = 0
        # close each input as soon as it has been consumed
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                key = (c.partner5p.tx_name, c.partner5p.end,
                       c.partner3p.tx_name, c.partner3p.start)
                false_pos_chimeras[key] += 1
                num_chimeras += 1
        logging.info("\tchimeras in file: %d" % (num_chimeras))
        logging.info("\tcurrent false positive candidates: %d" % (len(false_pos_chimeras)))
    if options.output_file is None:
        fileh = sys.stdout
    else:
        fileh = open(options.output_file, "w")
    try:
        for key, recurrence in false_pos_chimeras.iteritems():
            if recurrence >= options.num_files:
                print >>fileh, '\t'.join(map(str, key))
    finally:
        # never close stdout; only close a file we opened ourselves
        if options.output_file is not None:
            fileh.close()
def parse_sync_by_breakpoint(chimera_file, bam_file):
    """
    Generator pairing each breakpoint's chimeras with its BAM reads.

    Walks two grouped streams in step: reads from bam_file grouped by
    "rname" (translated to a reference name through the BAM header) and
    chimeras from chimera_file grouped by "breakpoint_name".  For every
    chimera group it yields (chimeras, reads); an empty list is yielded
    in place of reads when no read group with the same name is found.
    The walk only advances forward through the reads -- presumably both
    inputs are sorted by breakpoint name; verify against the caller.
    """
    bamfh = pysam.Samfile(bam_file, "rb")
    # reference names indexed by the integer id stored on each read
    ref_names = list(bamfh.references)
    read_groups = parse_group_by_attr(bamfh, "rname")
    # prime the read stream with its first group
    have_reads = True
    try:
        rname, reads = read_groups.next()
        current_read_name = ref_names[rname]
    except StopIteration:
        # no reads at all: use a placeholder name and an empty group
        bamfh.close()
        have_reads = False
        reads = []
        current_read_name = "ZZZZZZZZZZZZZZ"
    chimera_groups = parse_group_by_attr(Chimera.parse(open(chimera_file)),
                                         "breakpoint_name")
    for chimera_name, chimeras in chimera_groups:
        # skip read groups that sort before this chimera group
        while have_reads and (chimera_name > current_read_name):
            try:
                rname, reads = read_groups.next()
                current_read_name = ref_names[rname]
            except StopIteration:
                have_reads = False
                reads = []
        # reads are handed over only when the names line up (or the
        # stream ran dry, in which case reads is already empty)
        yield chimeras, ([] if chimera_name < current_read_name else reads)
    bamfh.close()
def nominate_encomp_spanning_reads(chimera_file, output_fastq_file):
    """
    Write to output_fastq_file the encompassing reads that should be
    remapped to see if they span the breakpoint junction.

    A read is nominated when its partner's breakpoint (5' end or
    3' start) lies strictly between the read's clipstart and clipend.
    Each (qname, readnum) is written at most once, with a constant "I"
    quality string.  Returns config.JOB_SUCCESS.
    """
    fastq_out = open(output_fastq_file, "w")
    already_written = set()
    for chimera in Chimera.parse(open(chimera_file)):
        # breakpoint coords of the chimera
        bp5p = chimera.partner5p.end
        bp3p = chimera.partner3p.start
        for r5p, r3p in chimera.encomp_read_pairs:
            # 5' read is tested first, then the 3' read
            for read, breakpoint in ((r5p, bp5p), (r3p, bp3p)):
                if not (read.clipstart < breakpoint < read.clipend):
                    continue
                read_key = (read.qname, read.readnum)
                if read_key in already_written:
                    continue
                already_written.add(read_key)
                print >> fastq_out, to_fastq(read.qname, read.readnum,
                                             read.seq, "I" * len(read.seq))
    fastq_out.close()
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """
    Collect every read that needs to be remapped to test whether it
    spans a breakpoint junction, and write them to output_fastq_file.

    Two sources are written:
      * encompassing reads from chimera_file whose clipped interval
        strictly contains their partner's breakpoint (written once per
        (qname, readnum), with a constant "I" quality string);
      * both mates of every pair returned by parse_pe_reads for
        unmapped_bam_file, with their original qualities.

    Returns config.JOB_SUCCESS.
    """
    fqfh = open(output_fastq_file, "w")
    try:
        remap_qnames = set()
        breaks5p = collections.defaultdict(list)
        breaks3p = collections.defaultdict(list)
        with open(chimera_file) as chimera_fh:
            for c in Chimera.parse(chimera_fh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints; the 3' index is keyed by
                # the 3' transcript (it was keyed by the 5' name before)
                breaks5p[c.partner5p.tx_name].append(end5p)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # a read overlapping its breakpoint should be remapped
                    for r, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if not (r.clipstart < breakpoint < r.clipend):
                            continue
                        key = (r.qname, r.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        print >> fqfh, to_fastq(r.qname, r.readnum, r.seq,
                                                "I" * len(r.seq))
        # sort breakpoint positions within each gene
        # TODO: these sorted indexes are not consumed yet; the planned
        # step is to bisect them for mapped reads overlapping a breakpoint
        for positions in breaks5p.itervalues():
            positions.sort()
        for positions in breaks3p.itervalues():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # NOTE(review): readnum is passed as 0/1 here -- confirm
                # it matches the readnum convention of the reads above
                for readnum in xrange(0, 2):
                    print >> fqfh, to_fastq(pe_reads[readnum][0].qname,
                                            readnum,
                                            pe_reads[readnum][0].seq,
                                            pe_reads[readnum][0].qual)
        finally:
            bamfh.close()
    finally:
        fqfh.close()
    return config.JOB_SUCCESS
Пример #10
0
def get_highest_coverage_isoforms(input_file, transcripts):
    """
    Return the names of the chimeras with the most fragments in each
    breakpoint group.

    Chimeras from input_file are grouped by the cluster ids of their
    5' and 3' transcripts together with the genomic coordinates of both
    breakpoints.  Inside a group, every chimera whose get_num_frags()
    equals the group maximum is kept.
    """
    # transcript id (as a string) -> cluster id
    cluster_of = dict((str(tx.tx_id), tx.cluster_id) for tx in transcripts)
    # lookup table from transcript coordinates to genome coordinates
    genome_map = build_transcript_genome_map(transcripts)
    groups = collections.defaultdict(list)
    for chimera in Chimera.parse(open(input_file)):
        # TODO: adjust this to score chimeras differently!
        entry = (chimera.name, chimera.get_num_frags())
        # clusters of overlapping genes
        cluster5p = cluster_of[chimera.tx_name_5p]
        cluster3p = cluster_of[chimera.tx_name_3p]
        # genomic positions of the breakpoints
        coord5p = transcript_to_genome_pos(
            chimera.tx_name_5p, chimera.tx_end_5p - 1, genome_map)
        coord3p = transcript_to_genome_pos(
            chimera.tx_name_3p, chimera.tx_start_3p, genome_map)
        group_key = (cluster5p, cluster3p, coord5p, coord3p)
        groups[group_key].append(entry)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    winners = set()
    for entries in groups.itervalues():
        names_by_score = collections.defaultdict(set)
        for entry in entries:
            # everything after the name forms the score tuple
            names_by_score[entry[1:]].add(entry[0])
        ranked_scores = sorted(names_by_score, reverse=True)
        winners.update(names_by_score[ranked_scores[0]])
    return winners
def make_discordant_read_stats_file(chimera_file, output_file, isize_dist):
    """
    Write one tab-delimited ChimeraStats row per encompassing fragment
    pair of every chimera in chimera_file.

    Each row carries the pair's qname, tid and pos of both mates, the
    chimera name, three per-chimera counts (spanning, unambiguous and
    uniquely aligning fragments), the negated mismatch total, and the
    probability calc_isize_prob assigns to the putative insert size.
    """
    out = open(output_file, "w")
    for chimera in Chimera.parse(open(chimera_file)):
        # per-chimera counts, shared by every row of this chimera
        n_unique_positions = chimera.get_num_unique_positions()
        n_unambiguous = chimera.get_num_frags(maxnumhits=1)
        n_spanning = chimera.get_num_spanning_frags()
        for dpair in chimera.encomp_frags:
            mate5p = dpair[0]
            mate3p = dpair[1]
            # putative insert size: distance of each mate to its breakpoint
            isize = ((chimera.tx_end_5p - mate5p.pos) +
                     (mate3p.pos - chimera.tx_start_3p))
            prob = calc_isize_prob(isize, isize_dist)
            # make ChimeraStats object
            stats = ChimeraStats()
            stats.qname = mate5p.qname
            stats.tid5p = mate5p.tid
            stats.pos5p = mate5p.pos
            stats.tid3p = mate3p.tid
            stats.pos3p = mate3p.pos
            stats.chimera_name = chimera.name
            stats.num_spanning_frags = n_spanning
            stats.num_unambiguous_frags = n_unambiguous
            stats.num_uniquely_aligning_frags = n_unique_positions
            stats.neg_mismatches = -(mate5p.mismatches + mate3p.mismatches)
            stats.isize_prob = prob
            # output to file
            fields = map(str, stats.to_list())
            print >>out, '\t'.join(fields)
    out.close()
def nominate_spanning_reads2(discordant_reads_fh, chimeras_fh, fastq_fh):
    """
    Nominate spanning read candidates from discordant fragments.

    Breakpoint positions of all chimeras in chimeras_fh are indexed by
    transcript name (5' ends and 3' starts separately).  Discordant
    fragments not flagged as genome-type are passed to check_fragment;
    for each run of fragments sharing a qname, the first non-None
    record returned for read 1 and for read 2 is printed to fastq_fh.
    """
    def flush(first, second):
        # print whichever of the two pending records exist
        if first is not None:
            print >> fastq_fh, first
        if second is not None:
            print >> fastq_fh, second

    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    ends5p = collections.defaultdict(list)
    starts3p = collections.defaultdict(list)
    for candidate in Chimera.parse(chimeras_fh):
        ends5p[candidate.mate5p.tx_name].append(candidate.mate5p.end)
        starts3p[candidate.mate3p.tx_name].append(candidate.mate3p.start)
    # parse discordant reads
    logging.info("Nominating spanning reads")
    pending1 = None
    pending2 = None
    last_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        qname = frag.qname
        # a new qname closes the previous run
        if (last_qname is not None) and (qname != last_qname):
            flush(pending1, pending2)
            pending1 = None
            pending2 = None
        # skip if reads already found
        if not ((pending1 is None) or (pending2 is None)):
            continue
        # update read fastq
        found1, found2 = check_fragment(frag, ends5p, starts3p)
        if pending1 is None:
            pending1 = found1
        if pending2 is None:
            pending2 = found2
        last_qname = qname
    # emit the final run
    flush(pending1, pending2)
Пример #13
0
def make_discordant_read_stats_file(chimera_file, output_file, isize_dist):
    """
    Dump per-fragment statistics for all chimeras to output_file.

    For every pair in a chimera's encomp_frags a ChimeraStats record is
    filled in (mate identifiers and positions, chimera name, chimera
    level fragment counts, negated mismatches, insert size probability)
    and written as one tab-delimited line.
    """
    stats_fh = open(output_file, "w")
    for chim in Chimera.parse(open(chimera_file)):
        # counts computed once per chimera and copied onto each record
        uniquely_aligning = chim.get_num_unique_positions()
        unambiguous = chim.get_num_frags(maxnumhits=1)
        spanning = chim.get_num_spanning_frags()
        for dpair in chim.encomp_frags:
            left, right = dpair[0], dpair[1]
            # putative insert size is the sum of both breakpoint offsets
            offset5p = chim.tx_end_5p - left.pos
            offset3p = right.pos - chim.tx_start_3p
            isize_prob = calc_isize_prob(offset5p + offset3p, isize_dist)
            # make ChimeraStats object
            record = ChimeraStats()
            record.qname = left.qname
            record.tid5p = left.tid
            record.pos5p = left.pos
            record.tid3p = right.tid
            record.pos3p = right.pos
            record.chimera_name = chim.name
            record.num_spanning_frags = spanning
            record.num_unambiguous_frags = unambiguous
            record.num_uniquely_aligning_frags = uniquely_aligning
            record.neg_mismatches = -(left.mismatches + right.mismatches)
            record.isize_prob = isize_prob
            # output to file
            line = '\t'.join(map(str, record.to_list()))
            print >> stats_fh, line
    stats_fh.close()
Пример #14
0
def get_highest_coverage_isoforms(input_file, transcripts):
    """
    Select the best-covered chimera(s) for each breakpoint group.

    The group of a chimera is the tuple (5' cluster id, 3' cluster id,
    5' genomic breakpoint, 3' genomic breakpoint).  The returned set
    holds the names of the chimeras whose get_num_frags() is the
    highest within their group (all tied chimeras are included).
    """
    # build lookup from transcript name to cluster id
    tx_to_cluster = dict((str(t.tx_id), t.cluster_id) for t in transcripts)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    tx_to_genome = build_transcript_genome_map(transcripts)
    grouped = collections.defaultdict(list)
    for c in Chimera.parse(open(input_file)):
        # TODO: adjust this to score chimeras differently!
        name_and_score = (c.name, c.get_num_frags())
        # cluster ids of the overlapping genes on each side
        clusters = (tx_to_cluster[c.tx_name_5p], tx_to_cluster[c.tx_name_3p])
        # genomic positions of breakpoints
        pos5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_end_5p - 1,
                                         tx_to_genome)
        pos3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_start_3p,
                                         tx_to_genome)
        grouped[clusters + (pos5p, pos3p)].append(name_and_score)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept = set()
    for members in grouped.itervalues():
        by_score = collections.defaultdict(set)
        for member in members:
            # key on the score part, collect the names
            score = member[1:]
            by_score[score].add(member[0])
        # highest scoring key comes first after a descending sort
        descending = sorted(by_score, reverse=True)
        kept.update(by_score[descending[0]])
    return kept
def nominate_encomp_spanning_reads(chimera_file, output_fastq_file):
    """
    Find all encompassing reads that should be remapped to see if they
    span the breakpoint junction, and write them as FASTQ.

    The 5' read of a pair is written when the chimera's 5' end falls
    strictly inside the read's clipstart..clipend range; the 3' read
    likewise with the chimera's 3' start.  No (qname, readnum) is
    written twice.  Returns config.JOB_SUCCESS.
    """
    fq = open(output_fastq_file, "w")
    emitted = set()
    for chim in Chimera.parse(open(chimera_file)):
        # find breakpoint coords of chimera
        junction = {"5p": chim.partner5p.end, "3p": chim.partner3p.start}
        for pair in chim.encomp_read_pairs:
            read5p, read3p = pair
            # the 5' read is always examined before the 3' read
            for side, read in (("5p", read5p), ("3p", read3p)):
                overlaps = read.clipstart < junction[side] < read.clipend
                if overlaps:
                    ident = (read.qname, read.readnum)
                    if ident not in emitted:
                        emitted.add(ident)
                        print >>fq, to_fastq(read.qname, read.readnum,
                                             read.seq, "I" * len(read.seq))
    fq.close()
    return config.JOB_SUCCESS
def parse_sync_by_breakpoint(chimera_file, bam_file):
    """
    Yield (chimeras, reads) for every breakpoint group in chimera_file.

    Reads from bam_file are grouped by "rname" and named through the
    BAM header's reference list; chimeras are grouped by
    "breakpoint_name".  The read stream is advanced until its name is
    no longer smaller than the chimera group's name.  reads is an empty
    list when the read stream has no group for that breakpoint.
    NOTE(review): this merge looks like it expects both inputs in the
    same name order -- confirm with the producers of the two files.
    """
    bamfh = pysam.Samfile(bam_file, "rb")
    # group reads by reference name (matches breakpoint name)
    names_by_tid = list(bamfh.references)
    # initialize iterator through reads
    reads_by_ref = parse_group_by_attr(bamfh, "rname")
    stream_open = True
    try:
        rname, reads = reads_by_ref.next()
        read_name = names_by_tid[rname]
    except StopIteration:
        bamfh.close()
        stream_open = False
        reads = []
        read_name = "ZZZZZZZZZZZZZZ"
    # group chimeras by breakpoint name
    chimeras_by_name = parse_group_by_attr(
        Chimera.parse(open(chimera_file)), "breakpoint_name")
    for breakpoint_name, chimeras in chimeras_by_name:
        # catch the read stream up to this breakpoint
        while stream_open and (breakpoint_name > read_name):
            try:
                rname, reads = reads_by_ref.next()
                read_name = names_by_tid[rname]
            except StopIteration:
                stream_open = False
                reads = []
        if breakpoint_name < read_name:
            # read stream is already past this breakpoint
            matched = []
        else:
            matched = reads
        yield chimeras, matched
    bamfh.close()
def nominate_spanning_reads(chimera_file, unmapped_bam_file, output_fastq_file):
    """
    Find all reads that need to be remapped to see if they span the
    breakpoint junction and write them to output_fastq_file.

    Written reads are (a) encompassing reads of the chimeras in
    chimera_file whose clipped interval strictly contains their
    partner's breakpoint, once per (qname, readnum) and with an "I"
    quality string, followed by (b) both mates of each pair returned by
    parse_pe_reads for unmapped_bam_file, with their own qualities.

    Returns config.JOB_SUCCESS.
    """
    fqfh = open(output_fastq_file, "w")
    try:
        remap_qnames = set()
        breaks5p = collections.defaultdict(list)
        breaks3p = collections.defaultdict(list)
        with open(chimera_file) as chimera_fh:
            for c in Chimera.parse(chimera_fh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints (3' breakpoints must be
                # filed under the 3' transcript, not the 5' one)
                breaks5p[c.partner5p.tx_name].append(end5p)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # remap a read when it overlaps its own breakpoint
                    for r, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if r.clipstart < breakpoint < r.clipend:
                            key = (r.qname, r.readnum)
                            if key not in remap_qnames:
                                remap_qnames.add(key)
                                print >> fqfh, to_fastq(r.qname, r.readnum,
                                                        r.seq,
                                                        "I" * len(r.seq))
        # sort breakpoint positions within each gene
        # TODO: not used yet; intended for a bisect-based check of
        # mapped reads that overlap a breakpoint
        for positions in breaks5p.itervalues():
            positions.sort()
        for positions in breaks3p.itervalues():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # NOTE(review): readnum is 0/1 here -- confirm it agrees
                # with the readnum values used for the reads above
                for readnum in xrange(0, 2):
                    print >> fqfh, to_fastq(
                        pe_reads[readnum][0].qname, readnum,
                        pe_reads[readnum][0].seq, pe_reads[readnum][0].qual
                    )
        finally:
            bamfh.close()
    finally:
        fqfh.close()
    return config.JOB_SUCCESS
Пример #18
0
def get_chimera_groups(input_file, tx_id_map):
    """
    Yield ((cluster5p, cluster3p), chimeras) for each pair of genomic
    clusters found in input_file.

    The cluster ids are read from tx_id_map entries looked up by the
    chimera's 5' and 3' transcript names.  The whole file is read
    before the first group is yielded.
    """
    by_cluster_pair = collections.defaultdict(list)
    for chimera in Chimera.parse(open(input_file)):
        # cluster of overlapping genes on each side
        pair_key = (tx_id_map[chimera.tx_name_5p].cluster_id,
                    tx_id_map[chimera.tx_name_3p].cluster_id)
        by_cluster_pair[pair_key].append(chimera)
    for pair_key, members in by_cluster_pair.iteritems():
        yield pair_key, members
def parse_chimeras_by_gene(chimera_file, orientation):
    """Yield runs of consecutive chimeras that share a transcript name.

    chimera_file -- path of a chimera file readable by ``Chimera.parse``
    orientation  -- ``OrientationTags.FIVEPRIME`` selects ``tx_name_5p``
                    as the grouping name; any other value selects
                    ``tx_name_3p``

    Yields ``(tx_name, chimeras)`` tuples.  Only adjacent records are
    grouped, so a name that reappears later in the file starts a new
    group.
    """
    current_name = None
    batch = []
    for chimera in Chimera.parse(open(chimera_file)):
        # pick the transcript name on the requested side
        if orientation == OrientationTags.FIVEPRIME:
            name = chimera.tx_name_5p
        else:
            name = chimera.tx_name_3p
        if current_name != name:
            # a new run starts: hand back the finished one first
            if batch:
                yield current_name, batch
                batch = []
            current_name = name
        batch.append(chimera)
    # flush the last run
    if batch:
        yield current_name, batch
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Write only the chimeras chosen by ``get_highest_coverage_isoforms``.

    The input is read twice: once by ``get_highest_coverage_isoforms``
    to decide which chimera names to keep, and once here to copy the
    kept records to the output.

    index_dir   -- directory that contains ``config.GENE_FEATURE_FILE``
    input_file  -- path of the chimera file to filter
    output_file -- path that receives the kept chimeras, one
                   tab-delimited record per line

    Returns ``config.JOB_SUCCESS``.
    """
    # find highest coverage chimeras among isoforms
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    kept_chimeras = get_highest_coverage_isoforms(input_file, gene_file)
    num_filtered_chimeras = 0
    # 'with' closes both handles even when parsing or writing fails
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    print >>outfh, '\t'.join(map(str, c.to_list()))
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def filter_encompassing_chimeras(input_file, output_file, min_frags):
    """Copy chimeras that have at least ``min_frags`` fragments.

    input_file  -- path of the chimera file to filter
    output_file -- path that receives the kept chimeras, one
                   tab-delimited record per line
    min_frags   -- chimeras whose ``get_num_frags()`` is below this
                   value are dropped

    Returns ``config.JOB_SUCCESS``.
    """
    num_chimeras = 0
    num_filtered_chimeras = 0
    # 'with' closes both handles even when parsing or writing fails
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                if c.get_num_frags() < min_frags:
                    continue
                num_filtered_chimeras += 1
                print >>outfh, '\t'.join(map(str, c.to_list()))
    logging.debug("\tchimeras: %d" % (num_chimeras))
    logging.debug("\tfiltered chimeras: %d" % (num_filtered_chimeras))
    return config.JOB_SUCCESS
Пример #22
0
def parse_chimeras_by_gene(chimera_file, orientation):
    """Yield runs of consecutive chimeras that share a transcript name.

    chimera_file -- path of a chimera file readable by ``Chimera.parse``
    orientation  -- ``OrientationTags.FIVEPRIME`` groups by
                    ``tx_name_5p``; any other value groups by
                    ``tx_name_3p``

    Yields ``(tx_name, chimeras)`` tuples.  Only adjacent records are
    grouped.  The input file stays open until the generator is
    exhausted or closed, and is then closed by the ``with`` block.
    """
    clist = []
    prev_tx_name = None
    with open(chimera_file) as chimerafh:
        for c in Chimera.parse(chimerafh):
            tx_name = c.tx_name_5p if (
                orientation == OrientationTags.FIVEPRIME) else c.tx_name_3p
            if prev_tx_name != tx_name:
                # transcript changed: emit the finished group
                if len(clist) > 0:
                    yield prev_tx_name, clist
                    clist = []
                prev_tx_name = tx_name
            clist.append(c)
    # emit the trailing group
    if len(clist) > 0:
        yield prev_tx_name, clist
Пример #23
0
def filter_encompassing_chimeras(input_file, output_file, min_frags):
    """Drop chimeras that have fewer than ``min_frags`` fragments.

    Every record of ``input_file`` whose ``get_num_frags()`` is not
    below ``min_frags`` is written to ``output_file`` as one
    tab-delimited line.  The total and kept counts are logged at debug
    level.  Returns ``config.JOB_SUCCESS``.
    """
    total = 0
    kept = 0
    out = open(output_file, "w")
    for chimera in Chimera.parse(open(input_file)):
        total += 1
        if chimera.get_num_frags() < min_frags:
            # not enough supporting fragments
            continue
        kept += 1
        out.write('\t'.join(map(str, chimera.to_list())) + "\n")
    out.close()
    logging.debug("\tchimeras: %d" % (total))
    logging.debug("\tfiltered chimeras: %d" % (kept))
    return config.JOB_SUCCESS
Пример #24
0
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Keep only the chimeras named by ``get_highest_coverage_isoforms``.

    Transcripts are loaded from ``config.TRANSCRIPT_FEATURE_FILE``
    inside ``index_dir`` and handed, together with ``input_file``, to
    ``get_highest_coverage_isoforms``.  Records of ``input_file`` whose
    name is in the returned collection are written to ``output_file``
    as tab-delimited lines.  Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Reading transcripts")
    transcript_path = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcript_list = list(TranscriptFeature.parse(open(transcript_path)))
    # names of the chimeras that survive isoform selection
    keep_names = get_highest_coverage_isoforms(input_file, transcript_list)
    kept_count = 0
    out = open(output_file, "w")
    for chimera in Chimera.parse(open(input_file)):
        if chimera.name not in keep_names:
            continue
        kept_count += 1
        out.write('\t'.join(map(str, chimera.to_list())) + "\n")
    out.close()
    logging.debug("\tAfter choosing best isoform: %d" % kept_count)
    return config.JOB_SUCCESS
Пример #25
0
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Write only the chimeras chosen by ``get_highest_coverage_isoforms``.

    index_dir   -- directory containing ``config.TRANSCRIPT_FEATURE_FILE``
    input_file  -- path of the chimera file to filter; it is read once
                   by ``get_highest_coverage_isoforms`` and once here
    output_file -- path that receives the kept chimeras, one
                   tab-delimited record per line

    Returns ``config.JOB_SUCCESS``.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as transcriptfh:
        transcripts = list(TranscriptFeature.parse(transcriptfh))
    # find highest coverage chimeras among isoforms
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    # 'with' closes both handles even when parsing or writing fails
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    print >>outfh, '\t'.join(map(str, c.to_list()))
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def read_pairs_to_chimera(chimera_name, tid5p, tid3p, readpairs, tid_tx_map,
                          genome_tx_trees, trim_bp):
    """Assemble a ``Chimera`` from a collection of discordant read pairs.

    chimera_name    -- value stored as the chimera's ``name``
    tid5p, tid3p    -- keys into ``tid_tx_map`` for the 5' and 3' sides
    readpairs       -- sequence of pairs; element 0 of each pair feeds
                       the 5' partner and element 1 the 3' partner
    tid_tx_map      -- lookup from tid to transcript object
    genome_tx_trees -- passed through to ``get_chimera_type``
    trim_bp         -- passed through to
                       ``ChimeraPartner.from_discordant_reads``

    Returns the populated ``Chimera``; ``readpairs`` itself is kept on
    it as ``encomp_read_pairs``.
    """
    transcript5p = tid_tx_map[tid5p]
    transcript3p = tid_tx_map[tid3p]
    # classify the fusion before building the object
    kind, dist = get_chimera_type(transcript5p, transcript3p, genome_tx_trees)
    chimera = Chimera()
    # lazily split each pair into its 5' and 3' read
    reads5p = (pair[0] for pair in readpairs)
    reads3p = (pair[1] for pair in readpairs)
    chimera.partner5p = ChimeraPartner.from_discordant_reads(
        reads5p, transcript5p, trim_bp)
    chimera.partner3p = ChimeraPartner.from_discordant_reads(
        reads3p, transcript3p, trim_bp)
    chimera.name = chimera_name
    chimera.chimera_type = kind
    chimera.distance = dist
    chimera.encomp_read_pairs = readpairs
    return chimera
def filter_chimeras(input_file, output_file,
                    index_dir, bam_file,
                    unique_frags,
                    isoform_fraction,
                    false_pos_file):
    """Apply the fragment, false-positive and isoform-fraction filters.

    A chimera is written to ``output_file`` only if it passes, in
    order: ``filter_unique_frags``, absence from the false positive
    set, and ``filter_chimeric_isoform_fraction``.

    input_file       -- path of the chimera file to filter
    output_file      -- path that receives the kept chimeras
    index_dir        -- not used by this function
    bam_file         -- BAM file handed to the isoform-fraction filter
    unique_frags     -- threshold passed to ``filter_unique_frags``
    isoform_fraction -- threshold passed to
                        ``filter_chimeric_isoform_fraction``
    false_pos_file   -- path of a false positive list, or ``None``/``""``
                        to skip that filter

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\tunique fragments: %f" % (unique_frags))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; a truthiness test covers both
    # None and the empty string without relying on string identity
    if false_pos_file:
        logging.debug("Loading false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # filter chimeras
    logging.debug("Filtering chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    try:
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    num_chimeras += 1
                    if not filter_unique_frags(c, unique_frags):
                        continue
                    false_pos_key = (c.tx_name_5p, c.tx_end_5p,
                                     c.tx_name_3p, c.tx_start_3p)
                    if false_pos_key in false_pos_pairs:
                        continue
                    if not filter_chimeric_isoform_fraction(
                            c, isoform_fraction, bamfh):
                        continue
                    print >>outfh, '\t'.join(map(str, c.to_list()))
                    num_filtered_chimeras += 1
    finally:
        # release the BAM handle even when a filter raises
        bamfh.close()
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
    # cleanup memory for false positive chimeras
    del false_pos_pairs
    return config.JOB_SUCCESS
def calc_chimera_pvalues(input_file,
                         bam_file, 
                         num_mapped_reads, 
                         num_discordant_reads_within_isize_range):
    # calc discordant reads per million
    percent_discordant = num_discordant_reads_within_isize_range / float(num_mapped_reads)
    # open BAM file for checking wild-type isoforms
    bamfh = pysam.Samfile(bam_file, "rb")
    for c in Chimera.parse(open(input_file)):        
        # count 5' and 3' reads
        rname5p = config.GENE_REF_PREFIX + c.tx_name_5p
        rname3p = config.GENE_REF_PREFIX + c.tx_name_3p        
        num_reads_5p = len(set(r.qname for r in bamfh.fetch(rname5p, c.tx_start_5p, c.tx_end_5p)))
        num_reads_3p = len(set(r.qname for r in bamfh.fetch(rname3p, c.tx_start_3p, c.tx_end_3p)))
        # expected number of discordant reads
        exp_discordant_5p = num_reads_5p * percent_discordant
        exp_discordant_3p = num_reads_3p * percent_discordant
        print c.gene_name_5p, c.gene_name_3p, num_reads_5p, num_reads_3p, exp_discordant_5p, exp_discordant_3p
    bamfh.close()    
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Sort chimeras by breakpoint and write one record per breakpoint.

    input_file                     -- chimera file to read
    breakpoint_sorted_chimera_file -- receives ``input_file`` sorted by
                                      its breakpoint name field
    breakpoint_map_file            -- receives tab-delimited lines of
                                      breakpoint name, sequence and the
                                      comma-joined sorted chimera names
    breakpoint_fasta_file          -- receives one FASTA record per
                                      breakpoint, the sequence being
                                      formatted by ``split_seq``
    tmp_dir                        -- scratch directory for ``batch_sort``

    The sequence recorded for a breakpoint is the 5' plus 3' breakpoint
    sequence of the first chimera seen with that breakpoint name.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]

    tempdirs = [tmp_dir]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=tempdirs)
    # parse and build breakpoint -> chimera map
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:

            def write_breakpoint(name, seq, names):
                # names are always sorted so that every record,
                # including the last one, is written deterministically
                print >>fastafh, ">%s\n%s" % (name, split_seq(seq))
                print >>mapfh, "%s\t%s\t%s" % (name, seq,
                                               ",".join(sorted(names)))

            prev_breakpoint_name = None
            prev_seq = None
            chimera_names = set()
            with open(breakpoint_sorted_chimera_file) as chimerafh:
                for c in Chimera.parse(chimerafh):
                    seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                    if c.breakpoint_name != prev_breakpoint_name:
                        # breakpoint changed: flush the previous one
                        if len(chimera_names) > 0:
                            write_breakpoint(prev_breakpoint_name, prev_seq,
                                             chimera_names)
                            chimera_names = set()
                        prev_seq = seq
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
            # flush the final breakpoint
            if len(chimera_names) > 0:
                write_breakpoint(prev_breakpoint_name, prev_seq,
                                 chimera_names)
Пример #30
0
def filter_chimeras(input_file, output_file, index_dir, bam_file, unique_frags,
                    isoform_fraction, false_pos_file):
    """Apply the fragment, false-positive and isoform-fraction filters.

    A chimera is written to ``output_file`` only if it passes, in
    order: ``filter_unique_frags``, absence from the false positive
    set, and ``filter_chimeric_isoform_fraction``.

    input_file       -- path of the chimera file to filter
    output_file      -- path that receives the kept chimeras
    index_dir        -- not used by this function
    bam_file         -- BAM file handed to the isoform-fraction filter
    unique_frags     -- threshold passed to ``filter_unique_frags``
    isoform_fraction -- threshold passed to
                        ``filter_chimeric_isoform_fraction``
    false_pos_file   -- path of a false positive list, or ``None``/``""``
                        to skip that filter

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\tunique fragments: %f" % (unique_frags))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; comparing with `is not ""`
    # depended on string interning, so test truthiness instead
    if false_pos_file:
        logging.debug("Loading false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # filter chimeras
    logging.debug("Filtering chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    try:
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    num_chimeras += 1
                    if not filter_unique_frags(c, unique_frags):
                        continue
                    false_pos_key = (c.tx_name_5p, c.tx_end_5p, c.tx_name_3p,
                                     c.tx_start_3p)
                    if false_pos_key in false_pos_pairs:
                        continue
                    if not filter_chimeric_isoform_fraction(
                            c, isoform_fraction, bamfh):
                        continue
                    print >>outfh, '\t'.join(map(str, c.to_list()))
                    num_filtered_chimeras += 1
    finally:
        # release the BAM handle even when a filter raises
        bamfh.close()
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
    # cleanup memory for false positive chimeras
    del false_pos_pairs
    return config.JOB_SUCCESS
def read_pairs_to_chimera(chimera_name, tid5p, tid3p, readpairs, 
                          tid_tx_map, genome_tx_trees, trim_bp):
    """Build a ``Chimera`` describing a 5'/3' transcript pair.

    The transcripts are looked up in ``tid_tx_map`` by ``tid5p`` and
    ``tid3p``; ``get_chimera_type`` supplies the type and distance.
    Element 0 of each entry in ``readpairs`` is fed to the 5' partner
    and element 1 to the 3' partner, both built with
    ``ChimeraPartner.from_discordant_reads`` and ``trim_bp``.

    Returns the new ``Chimera``, which also keeps ``readpairs`` as
    ``encomp_read_pairs``.
    """
    first_of_pair = operator.itemgetter(0)
    second_of_pair = operator.itemgetter(1)
    gene5p = tid_tx_map[tid5p]
    gene3p = tid_tx_map[tid3p]
    # work out what kind of fusion this is
    fusion_type, fusion_distance = get_chimera_type(gene5p, gene3p,
                                                    genome_tx_trees)
    result = Chimera()
    mates5p = itertools.imap(first_of_pair, readpairs)
    mates3p = itertools.imap(second_of_pair, readpairs)
    result.partner5p = ChimeraPartner.from_discordant_reads(mates5p, gene5p,
                                                            trim_bp)
    result.partner3p = ChimeraPartner.from_discordant_reads(mates3p, gene3p,
                                                            trim_bp)
    result.name = chimera_name
    result.chimera_type = fusion_type
    result.distance = fusion_distance
    # keep the raw reads alongside the summary
    result.encomp_read_pairs = readpairs
    return result
Пример #32
0
def filter_chimeras(input_file, output_file,
                    filter_num_frags,
                    filter_allele_fraction,
                    mask_biotypes,
                    mask_rnames):
    """Filter chimeras by fragment count, allele fraction and masks.

    The output starts with a ``#``-prefixed header line built from
    ``Chimera._fields``; each kept chimera follows as ``str(c)``.

    input_file             -- path of the chimera file to filter
    output_file            -- path that receives header and kept records
    filter_num_frags       -- chimeras with fewer ``num_frags`` are dropped
    filter_allele_fraction -- chimeras whose smaller (5' or 3') allele
                              fraction is below this value are dropped
    mask_biotypes          -- set; a chimera sharing any biotype with it
                              on either side is dropped
    mask_rnames            -- container of reference names; a chimera
                              whose ``rname5p`` or ``rname3p`` is in it
                              is dropped

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("\tfragments: %f" % (filter_num_frags))
    logging.debug("\tallele fraction: %f" % (filter_allele_fraction))
    logging.debug("\tmask biotypes: %s" % (','.join(sorted(mask_biotypes))))
    logging.debug("\tmask references: %s" % (','.join(sorted(mask_rnames))))
    # filter chimeras
    num_chimeras = 0
    num_kept_chimeras = 0
    # 'with' closes both handles even when parsing or writing fails
    with open(output_file, "w") as outfh:
        print >>outfh, '#' + '\t'.join(Chimera._fields)
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                # number of fragments
                if c.num_frags < filter_num_frags:
                    continue
                # allele fraction
                # NOTE(review): a side with zero discordant plus
                # concordant fragments raises ZeroDivisionError here;
                # confirm whether upstream guarantees a non-zero total
                allele_fraction_5p = float(c.num_frags) / (c.num_discordant_frags_5p + c.num_concordant_frags_5p)
                allele_fraction_3p = float(c.num_frags) / (c.num_discordant_frags_3p + c.num_concordant_frags_3p)
                allele_fraction = min(allele_fraction_5p, allele_fraction_3p)
                if allele_fraction < filter_allele_fraction:
                    continue
                # masked biotypes and references
                if len(mask_biotypes.intersection(c.biotypes_5p)) > 0:
                    continue
                if len(mask_biotypes.intersection(c.biotypes_3p)) > 0:
                    continue
                if c.rname5p in mask_rnames:
                    continue
                if c.rname3p in mask_rnames:
                    continue
                print >>outfh, str(c)
                num_kept_chimeras += 1
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Kept chimeras: %d" % num_kept_chimeras)
    return config.JOB_SUCCESS
Пример #33
0
def filter_chimeras(input_file, output_file, filter_num_frags,
                    filter_allele_fraction, mask_biotypes, mask_rnames):
    """Write the chimeras that survive the fragment, allele and mask filters.

    A ``#``-prefixed header built from ``Chimera._fields`` is written
    first.  A chimera is then dropped when its ``num_frags`` is below
    ``filter_num_frags``, when the smaller of its 5'/3' allele
    fractions is below ``filter_allele_fraction``, when either side
    shares a biotype with ``mask_biotypes``, or when either reference
    name is in ``mask_rnames``.  Survivors are written as ``str(c)``.
    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("\tfragments: %f" % (filter_num_frags))
    logging.debug("\tallele fraction: %f" % (filter_allele_fraction))
    logging.debug("\tmask biotypes: %s" % (','.join(sorted(mask_biotypes))))
    logging.debug("\tmask references: %s" % (','.join(sorted(mask_rnames))))
    total = 0
    kept = 0
    out = open(output_file, "w")
    out.write('#' + '\t'.join(Chimera._fields) + "\n")
    for chimera in Chimera.parse(open(input_file)):
        total += 1
        # too few supporting fragments
        if chimera.num_frags < filter_num_frags:
            continue
        # fraction of each side's fragments that support the chimera
        fraction5p = float(chimera.num_frags) / (
            chimera.num_discordant_frags_5p + chimera.num_concordant_frags_5p)
        fraction3p = float(chimera.num_frags) / (
            chimera.num_discordant_frags_3p + chimera.num_concordant_frags_3p)
        if min(fraction5p, fraction3p) < filter_allele_fraction:
            continue
        # masked biotypes, then masked references
        if (mask_biotypes.intersection(chimera.biotypes_5p) or
                mask_biotypes.intersection(chimera.biotypes_3p)):
            continue
        if chimera.rname5p in mask_rnames or chimera.rname3p in mask_rnames:
            continue
        out.write(str(chimera) + "\n")
        kept += 1
    out.close()
    logging.debug("Total chimeras: %d" % total)
    logging.debug("Kept chimeras: %d" % kept)
    return config.JOB_SUCCESS
def parse_sync_chimeras_read_stats(chimera_file, read_stats_file):
    """Pair each chimera with the read statistics that carry its name.

    Both files are walked in step, advancing the statistics whenever
    the chimera name compares greater than the current statistics
    name.  NOTE(review): this only lines up if both files are ordered
    by chimera name -- confirm against the callers.

    chimera_file    -- path of a chimera file readable by ``Chimera.parse``
    read_stats_file -- path of a file readable by ``ChimeraStats.parse``

    Yields ``(chimera, stats)`` tuples; ``stats`` is an empty list when
    no statistics group matches the chimera name, including when the
    statistics file has no records at all.
    """
    with open(read_stats_file) as statsfh:
        # group reads by chimera name
        read_stats_iter = group_by_attr(ChimeraStats.parse(statsfh),
                                        'chimera_name')
        # bind the name up front so an empty statistics file cannot
        # leave it undefined
        read_chimera_name = None
        stats = []
        iter_valid = True
        try:
            read_chimera_name, stats = next(read_stats_iter)
        except StopIteration:
            iter_valid = False
        # group chimeras by name
        with open(chimera_file) as chimerafh:
            for c in Chimera.parse(chimerafh):
                # skip statistics groups that sort before this chimera
                while iter_valid and (c.name > read_chimera_name):
                    try:
                        read_chimera_name, stats = next(read_stats_iter)
                    except StopIteration:
                        iter_valid = False
                        stats = []
                if iter_valid and (c.name == read_chimera_name):
                    yield c, stats
                else:
                    yield c, []
Пример #35
0
def parse_sync_chimeras_read_stats(chimera_file, read_stats_file):
    """Yield every chimera together with its matching read statistics.

    The statistics are grouped by ``chimera_name`` and consumed in
    step with the chimera file: while the chimera name compares
    greater than the current group name, the next group is fetched.
    NOTE(review): both inputs are presumably sorted by chimera name --
    verify against the callers.

    chimera_file    -- path of a chimera file readable by ``Chimera.parse``
    read_stats_file -- path of a file readable by ``ChimeraStats.parse``

    Yields ``(chimera, stats)``; ``stats`` is ``[]`` when no group has
    the chimera's name or when the statistics file is empty.
    """
    with open(read_stats_file) as statsfh:
        # group reads by chimera name
        read_stats_iter = group_by_attr(ChimeraStats.parse(statsfh),
                                        'chimera_name')
        # defaults used when the statistics file holds no groups; the
        # name was previously left unbound in that case
        read_chimera_name, stats = None, []
        try:
            read_chimera_name, stats = next(read_stats_iter)
            iter_valid = True
        except StopIteration:
            iter_valid = False
        # group chimeras by name
        with open(chimera_file) as chimerafh:
            for c in Chimera.parse(chimerafh):
                while iter_valid and (c.name > read_chimera_name):
                    try:
                        read_chimera_name, stats = next(read_stats_iter)
                    except StopIteration:
                        # statistics exhausted: nothing left to match
                        iter_valid = False
                        stats = []
                matched = iter_valid and (c.name == read_chimera_name)
                yield c, (stats if matched else [])
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file, 
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Sort chimeras by breakpoint and write one record per breakpoint.

    input_file                     -- chimera file to read
    breakpoint_sorted_chimera_file -- receives ``input_file`` sorted by
                                      its breakpoint name field
    breakpoint_map_file            -- receives tab-delimited lines of
                                      breakpoint name, sequence and the
                                      comma-joined sorted chimera names
    breakpoint_fasta_file          -- receives one FASTA record per
                                      breakpoint, the sequence being
                                      formatted by ``split_seq``
    tmp_dir                        -- scratch directory for ``batch_sort``

    The sequence recorded for a breakpoint is the 5' plus 3' breakpoint
    sequence of the first chimera seen with that breakpoint name.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]
    tempdirs = [tmp_dir]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=tempdirs)
    # parse and build breakpoint -> chimera map
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:

            def flush(name, seq, names):
                # the last breakpoint used to be written with unsorted
                # names; sorting here keeps every map line consistent
                print >>fastafh, ">%s\n%s" % (name, split_seq(seq))
                print >>mapfh, "%s\t%s\t%s" % (name, seq,
                                               ",".join(sorted(names)))

            prev_breakpoint_name = None
            prev_seq = None
            chimera_names = set()
            with open(breakpoint_sorted_chimera_file) as chimerafh:
                for c in Chimera.parse(chimerafh):
                    seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                    if c.breakpoint_name != prev_breakpoint_name:
                        if len(chimera_names) > 0:
                            flush(prev_breakpoint_name, prev_seq,
                                  chimera_names)
                            chimera_names = set()
                        prev_seq = seq
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
            # write the trailing breakpoint
            if len(chimera_names) > 0:
                flush(prev_breakpoint_name, prev_seq, chimera_names)
def get_chimera_groups(input_file, gene_file):
    """Group the chimeras of a file by their pair of gene clusters.

    input_file -- path of a chimera file readable by ``Chimera.parse``
    gene_file  -- path handed (as an open file) to
                  ``build_transcript_cluster_map``, whose result is
                  indexed by transcript name

    Yields ``((cluster5p, cluster3p), chimeras)`` tuples.  The whole
    input is read before the first group is yielded.
    """
    # build a lookup table to get gene clusters from transcript name
    with open(gene_file) as genefh:
        transcript_cluster_map = build_transcript_cluster_map(genefh)
    # build a lookup table to get genome coordinates from transcript 
    # coordinates
    # TODO: can either group by exact breakpoint, or just by
    # gene cluster
    # transcript_genome_map = build_transcript_genome_map(open(gene_file))
    # group chimeras by the cluster pair of their two transcripts
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # get genomic positions of breakpoints
            #coord5p = transcript_to_genome_pos(c.partner5p.tx_name, c.partner5p.end-1, transcript_genome_map)
            #coord3p = transcript_to_genome_pos(c.partner3p.tx_name, c.partner3p.start, transcript_genome_map)
            # add to dictionary
            cluster_chimera_dict[(cluster5p,cluster3p)].append(c)
            # TODO: use this grouping instead?
            #cluster_chimera_dict[(cluster5p,cluster3p,coord5p,coord3p)].append(c)
    for key,chimeras in cluster_chimera_dict.iteritems():
        yield key,chimeras
def nominate_spanning_reads2(discordant_reads_fh, chimeras_fh, fastq_fh):
    """Print candidate spanning reads for the given chimeras.

    discordant_reads_fh -- input for ``parse_discordant_reads``
    chimeras_fh         -- input for ``Chimera.parse``
    fastq_fh            -- file object the nominated reads are printed to

    The 5' ``end`` and 3' ``start`` positions of every chimera are
    first indexed by transcript name.  Fragments whose
    ``discordant_type.is_genome`` is true are skipped.  For each run of
    fragments sharing a ``qname``, the first non-``None`` read 1 and
    read 2 returned by ``check_fragment`` are printed once the run
    ends.
    """
    logging.info("Indexing chimera candidates")
    ends5p = collections.defaultdict(list)
    starts3p = collections.defaultdict(list)
    for candidate in Chimera.parse(chimeras_fh):
        ends5p[candidate.mate5p.tx_name].append(candidate.mate5p.end)
        starts3p[candidate.mate3p.tx_name].append(candidate.mate3p.start)

    def emit(*reads):
        # print whichever of the pending reads were actually found
        for read in reads:
            if read is not None:
                print >> fastq_fh, read

    logging.info("Nominating spanning reads")
    first = None
    second = None
    last_qname = None
    for fragment in parse_discordant_reads(discordant_reads_fh):
        if fragment.discordant_type.is_genome:
            continue
        qname = fragment.qname
        if last_qname is not None and (qname != last_qname):
            # read name changed: flush what the previous name produced
            emit(first, second)
            first = None
            second = None
        if first is None or second is None:
            # still missing a mate for this name; inspect the fragment
            found1, found2 = check_fragment(fragment, ends5p, starts3p)
            if first is None:
                first = found1
            if second is None:
                second = found2
            last_qname = qname
    emit(first, second)
Пример #39
0
def make_chimera(cluster_pair, cluster_shelve, transcript_dict,
                 genome_tx_trees, annotation_source):
    """Create a ``Chimera`` record from a pair of read clusters.

    cluster_pair      -- object providing ``id5p``, ``id3p``, ``pair_id``,
                         ``qnames`` and ``spanning_qnames``
    cluster_shelve    -- mapping from ``str(cluster id)`` to cluster
    transcript_dict   -- passed to ``lookup_transcripts`` and
                         ``get_chimera_type``
    genome_tx_trees   -- passed to ``lookup_transcripts`` and
                         ``get_chimera_type``
    annotation_source -- passed to ``get_transcript_info``

    Returns the populated ``Chimera``.  ``num_frags`` counts the
    distinct names across the discordant and spanning name lists.
    """
    # fetch both clusters and the transcripts they overlap
    five = cluster_shelve[str(cluster_pair.id5p)]
    three = cluster_shelve[str(cluster_pair.id3p)]
    txs5p = lookup_transcripts(five, transcript_dict, genome_tx_trees)
    txs3p = lookup_transcripts(three, transcript_dict, genome_tx_trees)
    # classify the chimera
    kind, dist = get_chimera_type(five, three, txs5p, txs3p,
                                  transcript_dict, genome_tx_trees)
    # annotation summaries for each side
    names5p, genes5p, types5p = get_transcript_info(txs5p, annotation_source)
    names3p, genes3p, types3p = get_transcript_info(txs3p, annotation_source)
    # fill in the record
    chimera = Chimera()
    chimera.rname5p = five.rname
    chimera.start5p = five.start
    chimera.end5p = five.end
    chimera.rname3p = three.rname
    chimera.start3p = three.start
    chimera.end3p = three.end
    chimera.chimera_id = "CHIMERA%d" % (cluster_pair.pair_id)
    # a fragment seen as both discordant and spanning counts once
    chimera.num_frags = len(
        set(cluster_pair.qnames).union(cluster_pair.spanning_qnames))
    chimera.strand5p = five.strand
    chimera.strand3p = three.strand
    chimera.chimera_type = kind
    chimera.distance = dist
    chimera.num_discordant_frags = len(cluster_pair.qnames)
    chimera.num_spanning_frags = len(cluster_pair.spanning_qnames)
    chimera.num_discordant_frags_5p = len(five.qnames)
    chimera.num_discordant_frags_3p = len(three.qnames)
    chimera.num_concordant_frags_5p = five.concordant_frags
    chimera.num_concordant_frags_3p = three.concordant_frags
    chimera.biotypes_5p = sorted(types5p)
    chimera.biotypes_3p = sorted(types3p)
    chimera.genes_5p = sorted(genes5p)
    chimera.genes_3p = sorted(genes3p)
    chimera.transcripts_5p = sorted(names5p)
    chimera.transcripts_3p = sorted(names3p)
    return chimera
Пример #40
0
def filter_chimeras(input_file, output_file,
                    index_dir, bam_file,
                    weighted_unique_frags,
                    median_isize,
                    max_isize,
                    isoform_fraction,
                    false_pos_file):
    """Filter chimeras in two stages and write the survivors.

    Stage one writes to a temporary file every chimera that passes, in
    order: ``filter_weighted_frags``, ``filter_inner_dist``, absence
    from the false positive set, and
    ``filter_chimeric_isoform_fraction``.  Stage two keeps, from that
    temporary file, only the chimeras named by
    ``get_highest_coverage_isoforms`` and writes them to
    ``output_file``.

    input_file            -- path of the chimera file to filter
    output_file           -- path that receives the final chimeras; the
                             temporary file is created in its directory
    index_dir             -- directory containing
                             ``config.GENE_FEATURE_FILE``
    bam_file              -- BAM file handed to the isoform filter
    weighted_unique_frags -- threshold for ``filter_weighted_frags``
    median_isize          -- passed to the isoform filter
    max_isize             -- threshold for ``filter_inner_dist``
    isoform_fraction      -- threshold for the isoform filter
    false_pos_file        -- path of a false positive list, or
                             ``None``/``""`` to skip that filter

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; a truthiness test covers both
    # None and the empty string without relying on string identity
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    logging.debug("Checking chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            with open(tmp_file, "w") as tmpfh:
                with open(input_file) as infh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name, c.partner5p.end,
                                         c.partner3p.tx_name, c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        print >>tmpfh, '\t'.join(map(str, c.to_list()))
                        num_filtered_chimeras += 1
        finally:
            # release the BAM handle even when a filter raises
            bamfh.close()
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras before stage two
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(tmp_file) as tmpfh:
                for c in Chimera.parse(tmpfh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        print >>outfh, '\t'.join(map(str, c.to_list()))
        logging.debug("\tAfter choosing best isoform: %d" %
                      num_filtered_chimeras)
    finally:
        # the intermediate file is removed on failure as well
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
Пример #41
0
def filter_homologous_genes(input_file, 
                            output_file, 
                            index_dir,
                            homolog_segment_length,
                            min_isize,
                            max_isize,
                            maxhits,
                            num_processors,
                            tmp_dir):
    """Remove chimeras whose 5' sequence also aligns to their own 3' region.

    Windows of ``homolog_segment_length`` bases are cut from the 5'
    interval of every chimera, written to a FASTA file in ``tmp_dir``
    and aligned with ``config.BOWTIE2_BIN``.  A chimera is dropped
    when one of its windows aligns inside the 3' interval recorded for
    that same chimera.

    input_file             -- path of the chimera file to filter
    output_file            -- path that receives the kept chimeras
    index_dir              -- directory holding the
                              ``config.TRANSCRIPTOME_INDEX`` index and
                              its ``.fa`` file
    homolog_segment_length -- window length in bases
    min_isize, max_isize   -- passed to ``get_mapped_read_intervals``
    maxhits                -- value of the aligner's ``-k`` option
    num_processors         -- value of the aligner's ``-p`` option
    tmp_dir                -- directory for the temporary FASTA and SAM

    Returns ``config.JOB_ERROR`` when the aligner exits non-zero (the
    temporary files are then left in place), otherwise
    ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    interval_trees_3p = collections.defaultdict(IntervalTree)
    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    try:
        with open(fasta5p, "w") as fastafh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    start5p, end5p, start3p, end3p = get_mapped_read_intervals(c, min_isize, max_isize, homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[c.tx_name_3p].insert_interval(Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(c.tx_name_5p, start5p, end5p)
                    # NOTE(review): this range stops one short of the
                    # final full-length window -- confirm it is intended
                    for i in xrange(0, len(seq5p) - homolog_segment_length):
                        print >>fastafh, ">%s,%s:%d-%d\n%s" % (c.name,c.tx_name_5p,
                                                               start5p+i,
                                                               start5p+i+homolog_segment_length,
                                                               seq5p[i:i+homolog_segment_length])
    finally:
        # the reference is only needed while extracting sequences
        ref_fa.close()
    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    bowtie2_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [config.BOWTIE2_BIN, 
            '-p', num_processors, '--phred33',
            '--end-to-end', '--very-sensitive', '--reorder',
            '-f', '-k', maxhits,
            '-x', bowtie2_index,
            '-U', fasta5p,
            "-S", sam5p]
    retcode = subprocess.call([str(arg) for arg in args])
    if retcode != 0:
        return config.JOB_ERROR
    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    # a single handle supplies both the header and the alignments and
    # is closed afterwards
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras
            rname = tid_rname_map[r.tid]
            if rname not in interval_trees_3p:
                continue
            # get chimera name from 'qname'
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos,r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s between %s and %s" % 
                                  (c.name, c.gene_name_5p, c.gene_name_3p))
                    continue
                print >>outfh, '\t'.join(map(str, c.to_list()))
    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS
Пример #42
0
def chimeras_to_breakpoints(input_file, breakpoint_map_file, breakpoint_fasta_file):
    # Write one FASTA record and one tab-delimited map line for every
    # (sequence, breakpoint) entry of `breakpoints`.
    #
    # NOTE(review): this function is incomplete as it stands.  The
    # `for c in Chimera.parse(...)` loop below has no body, so the code
    # does not parse, and neither `breakpoints` nor `chimerafh` is
    # assigned anywhere in this function.  The missing loop body
    # presumably filled `breakpoints` from the parsed chimeras --
    # restore it before using this function.
    #
    # now extract the unique junction sequences
    # and write them to a fasta file
    breakpointfh = open(breakpoint_map_file, "w")
    fasta_output_fh = open(breakpoint_fasta_file, "w")

    for c in Chimera.parse(open(input_file)):
        

    for seq,b in breakpoints.iteritems():
        # write to fasta file
        print >>fasta_output_fh, ">%s\n%s" % (b.name, seq)
        # write to breakpoint map file
        fields = b.to_list()
        print >>breakpointfh, '\t'.join(map(str, fields))
    # close files
    fasta_output_fh.close()
    breakpointfh.close()
    # NOTE(review): `chimerafh` is never opened in this function
    chimerafh.close()
    

def main():
    """Command-line entry point for ``chimeras_to_breakpoints``.

    Expects three positional arguments: the input chimera file, the
    breakpoint map file to write and the breakpoint FASTA file to
    write.  With fewer arguments the usage message is printed and the
    process exits with status 2.
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.bedpe> "
                          "<breakpoints.txt> <breakpoints.fa>")
    options, args = parser.parse_args()
    # report a usage error instead of failing with a bare IndexError
    if len(args) < 3:
        parser.error("expected 3 arguments but got %d" % len(args))
    input_file = args[0]
    breakpoint_map_file = args[1]
    breakpoint_fasta_file = args[2]
    chimeras_to_breakpoints(input_file, breakpoint_map_file, breakpoint_fasta_file)

def main():
    """Command-line entry point for determine_chimera_breakpoints.

    Positional arguments, in order: index directory, read length (integer),
    input chimera file, output chimera file, breakpoint map file and
    breakpoint FASTA file. Any additional positional arguments are ignored.
    The --homology-mismatches option is forwarded as the
    homology_mismatches keyword argument.

    Exits through OptionParser.error (usage message on stderr, exit
    status 2) when fewer than six positional arguments are supplied or when
    the read length is not an integer.
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <index> <read_length> "
                          "<chimeras.txt> <chimeras.out.txt> " 
                          "<breakpoints.txt> <breakpoints.fa>")
    parser.add_option("--homology-mismatches", type="int", 
                      dest="homology_mismatches", 
                      default=config.BREAKPOINT_HOMOLOGY_MISMATCHES,
                      help="Number of mismatches to tolerate when computing "
                      "homology between gene and its chimeric partner "
                      "[default=%default]")
    options, args = parser.parse_args()
    # report a usage error instead of failing with a bare IndexError
    if len(args) < 6:
        parser.error("expected 6 arguments, got %d" % len(args))
    index_dir = args[0]
    try:
        read_length = int(args[1])
    except ValueError:
        parser.error("<read_length> must be an integer, got %r" % (args[1],))
    input_chimera_file = args[2]
    output_chimera_file = args[3]
    breakpoint_map_file = args[4]
    breakpoint_fasta_file = args[5]
    determine_chimera_breakpoints(index_dir, 
                                  read_length, 
                                  input_chimera_file, 
                                  output_chimera_file, 
                                  breakpoint_map_file, 
                                  breakpoint_fasta_file,
                                  homology_mismatches=options.homology_mismatches)

# invoke the command-line entry point only when executed as a script
if __name__ == "__main__":
    main()
Пример #43
0
def determine_chimera_breakpoints(index_dir, read_length, 
                                  input_chimera_file, output_chimera_file, 
                                  breakpoint_map_file, breakpoint_fasta_file,
                                  homology_mismatches=DEFAULT_HOMOLOGY_MISMATCHES):
    """Assign a breakpoint to every chimera and write the breakpoint files.

    For each chimera parsed from input_chimera_file, the sequence flanking
    the junction on the 5' and 3' partner transcripts is fetched from the
    reference FASTA (<index_dir>/<config.ALIGN_INDEX>.fa). Chimeras whose
    concatenated junction sequence is identical share one Breakpoint.

    Arguments:
      index_dir -- directory containing the reference FASTA file
      read_length -- read length; each side of the junction contributes
          read_length - 1 bases ('N'-padded when fewer were retrieved)
      input_chimera_file -- path of the chimera file to read
      output_chimera_file -- path that receives every chimera, updated with
          breakpoint_name, breakpoint_homology_5p and breakpoint_homology_3p
      breakpoint_map_file -- path that receives one tab-delimited line of
          Breakpoint.to_list() fields per unique breakpoint
      breakpoint_fasta_file -- path that receives one FASTA record
          (breakpoint name, junction sequence) per unique breakpoint
      homology_mismatches -- mismatch tolerance handed to calc_homology

    Every file handle, including the reference FASTA, is closed even when
    an error interrupts processing.
    """
    # each side of the junction contributes at most this many bases
    flank_length = read_length - 1
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # junction sequence -> Breakpoint shared by all chimeras with that sequence
    breakpoints = {}
    breaknum = 0
    inputfh = chimerafh = breakpointfh = fasta_output_fh = None
    try:
        # output files
        chimerafh = open(output_chimera_file, "w")
        breakpointfh = open(breakpoint_map_file, "w")
        fasta_output_fh = open(breakpoint_fasta_file, "w")
        inputfh = open(input_chimera_file)
        for c in Chimera.parse(inputfh):
            # retrieve transcript coordinates of 5' and 3' partners
            ref5p = config.GENE_REF_PREFIX + c.partner5p.tx_name
            ref3p = config.GENE_REF_PREFIX + c.partner3p.tx_name
            start5p, end5p = c.partner5p.start, c.partner5p.end
            start3p, end3p = c.partner3p.start, c.partner3p.end
            # get intervals for breakpoint sequence, clamped to the
            # partner coordinates
            breakpoint_start5p = max(start5p, end5p - flank_length)
            breakpoint_end3p = min(end3p, start3p + flank_length)
            # fetch sequence
            seq5p = ref_fa.fetch(ref5p, breakpoint_start5p, end5p)
            seq3p = ref_fa.fetch(ref3p, start3p, breakpoint_end3p)
            if len(seq5p) < flank_length:
                logging.warning("Could not extract sequence of length >%d from "
                                "5' partner of chimera %s, only retrieved "
                                "sequence of %d",
                                flank_length, c.name, len(seq5p))
                # pad on the left so the junction stays at a fixed offset
                seq5p = ("N" * (flank_length - len(seq5p))) + seq5p
            if len(seq3p) < flank_length:
                logging.warning("Could not extract sequence of length >%d from "
                                "3' partner of chimera %s, only retrieved "
                                "sequence of %d",
                                flank_length, c.name, len(seq3p))
                # pad on the right so both halves have equal length
                seq3p = seq3p + ("N" * (flank_length - len(seq3p)))
            # fetch continuation sequence of non-fusion gene
            homolog_end5p = end5p + flank_length
            homolog_start3p = max(0, start3p - flank_length)
            homolog5p = ref_fa.fetch(ref3p, homolog_start3p, start3p)
            homolog3p = ref_fa.fetch(ref5p, end5p, homolog_end5p)
            # find homology between 5' gene and 3' gene; the 5' side is
            # reversed so the comparison starts at the junction
            homology_length_5p = calc_homology(seq5p[::-1], homolog5p[::-1],
                                               homology_mismatches)
            homology_length_3p = calc_homology(seq3p, homolog3p,
                                               homology_mismatches)
            # group fusion candidates together if they have the same
            # junction sequence
            seq = seq5p + seq3p
            b = breakpoints.get(seq)
            if b is None:
                b = Breakpoint()
                b.name = "B%07d" % (breaknum)
                breaknum += 1
                b.seq5p = seq5p
                b.seq3p = seq3p
                breakpoints[seq] = b
            b.chimera_names.append(c.name)
            # update Chimera object with breakpoint information
            c.breakpoint_name = b.name
            c.breakpoint_homology_5p = homology_length_5p
            c.breakpoint_homology_3p = homology_length_3p
            # write Chimera
            chimerafh.write('\t'.join(map(str, c.to_list())) + "\n")
        # now extract the unique junction sequences
        # and write them to a fasta file
        for seq, b in breakpoints.items():
            # write to fasta file
            fasta_output_fh.write(">%s\n%s\n" % (b.name, seq))
            # write to breakpoint map file
            breakpointfh.write('\t'.join(map(str, b.to_list())) + "\n")
    finally:
        # close files
        for fh in (inputfh, fasta_output_fh, breakpointfh, chimerafh):
            if fh is not None:
                fh.close()
        ref_fa.close()
Пример #44
0
def make_chimera(cluster_pair, 
                 cluster_shelve,
                 transcript_dict,
                 genome_tx_trees,
                 annotation_source):
    """Build a Chimera object describing one pair of 5'/3' clusters.

    The two clusters are looked up in cluster_shelve by the string form of
    cluster_pair.id5p / cluster_pair.id3p. Their transcripts, the chimera
    type and distance, and the transcript/gene/biotype names are obtained
    from lookup_transcripts, get_chimera_type and get_transcript_info.
    Returns the populated Chimera.
    """
    # resolve both clusters and the transcripts associated with each
    five = cluster_shelve[str(cluster_pair.id5p)]
    three = cluster_shelve[str(cluster_pair.id3p)]
    five_txs = lookup_transcripts(five, transcript_dict, genome_tx_trees)
    three_txs = lookup_transcripts(three, transcript_dict, genome_tx_trees)
    # classify the chimera and measure the distance between partners
    kind, span = get_chimera_type(five, three, five_txs, three_txs,
                                  transcript_dict, genome_tx_trees)
    # names used to annotate each side
    five_tx_names, five_genes, five_biotypes = get_transcript_info(
        five_txs, annotation_source)
    three_tx_names, three_genes, three_biotypes = get_transcript_info(
        three_txs, annotation_source)
    # fragments supporting the pair: discordant and spanning reads, deduplicated
    supporting = set(cluster_pair.qnames).union(cluster_pair.spanning_qnames)
    # populate the record
    chimera = Chimera()
    chimera.rname5p = five.rname
    chimera.start5p = five.start
    chimera.end5p = five.end
    chimera.rname3p = three.rname
    chimera.start3p = three.start
    chimera.end3p = three.end
    chimera.chimera_id = "CHIMERA%d" % cluster_pair.pair_id
    chimera.num_frags = len(supporting)
    chimera.strand5p = five.strand
    chimera.strand3p = three.strand
    chimera.chimera_type = kind
    chimera.distance = span
    chimera.num_discordant_frags = len(cluster_pair.qnames)
    chimera.num_spanning_frags = len(cluster_pair.spanning_qnames)
    chimera.num_discordant_frags_5p = len(five.qnames)
    chimera.num_discordant_frags_3p = len(three.qnames)
    chimera.num_concordant_frags_5p = five.concordant_frags
    chimera.num_concordant_frags_3p = three.concordant_frags
    chimera.biotypes_5p = sorted(five_biotypes)
    chimera.biotypes_3p = sorted(three_biotypes)
    chimera.genes_5p = sorted(five_genes)
    chimera.genes_3p = sorted(three_genes)
    chimera.transcripts_5p = sorted(five_tx_names)
    chimera.transcripts_3p = sorted(three_tx_names)
    return chimera
def filter_homologous_genes(input_file, output_file, index_dir,
                            homolog_segment_length,
                            min_isize,
                            max_isize,
                            bowtie_bin,
                            num_processors,
                            tmp_dir):
    """Remove chimeras whose 5' sequence also aligns to their own 3' partner.

    Every window of homolog_segment_length bases from each chimera's 5'
    interval is written to a FASTA file and aligned to the reference with
    bowtie. A chimera is dropped when one of its windows aligns inside the
    3' interval registered for that same chimera.

    Arguments:
      input_file -- path of the chimera file to read
      output_file -- path that receives the chimeras that were kept
      index_dir -- directory holding the bowtie index and the reference
          FASTA (<index_dir>/<config.ALIGN_INDEX>[.fa])
      homolog_segment_length -- window length used to tile the 5' sequence
      min_isize, max_isize -- fragment size bounds handed to
          get_mapped_read_intervals
      bowtie_bin -- bowtie executable
      num_processors -- value given to bowtie's -p option
      tmp_dir -- directory for the temporary FASTA and SAM files

    Returns config.JOB_ERROR when bowtie exits with a non-zero status (the
    temporary files are left in place in that case), otherwise
    config.JOB_SUCCESS.
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))

    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    bowtie_index = os.path.join(index_dir, config.ALIGN_INDEX)
    # 3' transcript name -> tree of intervals tagged with the chimera name
    interval_trees_3p = collections.defaultdict(lambda: IntervalTree())

    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    try:
        with open(fasta5p, "w") as f:
            with open(input_file) as inputfh:
                for c in Chimera.parse(inputfh):
                    tx_name_5p = config.GENE_REF_PREFIX + c.tx_name_5p
                    tx_name_3p = config.GENE_REF_PREFIX + c.tx_name_3p
                    start5p, end5p, start3p, end3p = get_mapped_read_intervals(
                        c, min_isize, max_isize, homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[tx_name_3p].insert_interval(
                        Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(tx_name_5p, start5p, end5p)
                    # "+ 1" so the final full-length window is included;
                    # without it a sequence exactly one segment long
                    # would produce no windows at all
                    num_windows = len(seq5p) - homolog_segment_length + 1
                    for i in range(num_windows):
                        f.write(">%s,%s:%d-%d\n%s\n" %
                                (c.name, c.tx_name_5p,
                                 start5p + i,
                                 start5p + i + homolog_segment_length,
                                 seq5p[i:i + homolog_segment_length]))
    finally:
        ref_fa.close()

    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [bowtie_bin, "-p", num_processors, "-f", "-a", "-m", 100, 
            "-y", "-v", 3, "-S",
            bowtie_index, fasta5p, sam5p]
    retcode = subprocess.call([str(arg) for arg in args])
    if retcode != 0:
        return config.JOB_ERROR

    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras; .get() does
            # not create an entry in the defaultdict
            tree = interval_trees_3p.get(tid_rname_map[r.rname])
            if tree is None:
                continue
            # get chimera name from 'qname'
            chimera_name = r.qname.split(",")[0]
            if chimera_name in homologous_chimeras:
                # already flagged by an earlier alignment
                continue
            for hit in tree.find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
                    break
    finally:
        samfh.close()

    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as f:
        with open(input_file) as inputfh:
            for c in Chimera.parse(inputfh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s between %s and %s" % 
                                  (c.name, c.gene_name_5p, c.gene_name_3p))
                    continue
                f.write('\t'.join(map(str, c.to_list())) + "\n")

    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS