def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """Filter chimera candidates, then keep the best isoform per cluster.

    Pass 1 streams `input_file` through four filters, in order:
    `filter_weighted_frags`, `filter_inner_dist`, the false-positive
    lookup and `filter_chimeric_isoform_fraction`. Survivors are written
    to a temporary file created next to `output_file`.
    Pass 2 asks `get_highest_coverage_isoforms` which chimera names to
    keep and copies only those from the temporary file to `output_file`.

    `false_pos_file` may be None or "" to disable the false-positive
    lookup. Returns `config.JOB_SUCCESS`.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list
    # (the original tested `is not ""`, an identity check that only works
    # when the interpreter happens to intern the empty string)
    if (false_pos_file is not None) and (false_pos_file != ""):
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = None
    try:
        # open BAM file for checking wild-type isoform
        bamfh = pysam.Samfile(bam_file, "rb")
        try:
            logging.debug("Checking chimeras")
            tmp_file = make_temp(os.path.dirname(output_file),
                                 suffix=".txt")
            with open(tmp_file, "w") as tmpfh:
                with open(input_file) as infh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if not filter_weighted_frags(c, weighted_unique_frags):
                            continue
                        if not filter_inner_dist(c, max_isize):
                            continue
                        false_pos_key = (c.partner5p.tx_name,
                                         c.partner5p.end,
                                         c.partner3p.tx_name,
                                         c.partner3p.start)
                        if false_pos_key in false_pos_pairs:
                            continue
                        if not filter_chimeric_isoform_fraction(
                                c, isoform_fraction, median_isize, bamfh):
                            continue
                        tmpfh.write('\t'.join(map(str, c.to_list())) + '\n')
                        num_filtered_chimeras += 1
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(tmp_file) as infh:
                for c in Chimera.parse(infh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d" %
                      num_filtered_chimeras)
    finally:
        # remove the intermediate file even when a pass fails
        if (tmp_file is not None) and os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
def resolve_chimeric_reads(input_file, output_file, isize_dist):
    """Assign each encompassing read pair to its best-scoring chimera(s).

    The input is read twice. The first pass records, per chimera, the
    tuple (total unique reads, unique spanning reads, merged multimap
    histogram) and, per read name, the chimeras it supports together with
    its insert-size probability and mismatch count. Each read is then kept
    only for the chimera(s) whose sort key
    (unique reads, spanning reads, *multimap hist, isize prob, -mismatches)
    is the largest. The second pass rewrites every chimera with its
    `encomp_read_pairs` restricted to the reads assigned to it.
    """
    # gather statistics on read alignments and chimeras
    # that will be used to associate reads with chimeras
    read_chimera_dict = collections.defaultdict(list)
    chimera_stats_dict = {}
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # combine multimap hists
            mmap_hist = merge_mmap_hists(c.partner5p.multimap_hist,
                                         c.partner3p.multimap_hist)
            chimera_stats_dict[c.name] = (c.get_total_unique_reads(),
                                          c.get_unique_spanning_reads(),
                                          mmap_hist)
            # get statistics for reads
            for dpair in c.encomp_read_pairs:
                qname = dpair[0].qname
                isize_prob = calc_isize_prob(dpair, c, isize_dist)
                mismatches = dpair[0].mismatches + dpair[1].mismatches
                read_chimera_dict[qname].append((c.name, isize_prob,
                                                 mismatches))
    # now process one read at a time, looking at all its alignments and
    # choose the most likely set of alignments.
    # The per-chimera container must be a set: the original used a list
    # default and then called .add(), which raises AttributeError.
    encomp_qnames_dict = collections.defaultdict(set)
    for qname, read_stats_list in read_chimera_dict.items():
        stats_chimera_dict = collections.defaultdict(list)
        for chimera_name, isize_prob, mismatches in read_stats_list:
            unique_reads, spanning_reads, mmap_hist = \
                chimera_stats_dict[chimera_name]
            key = ((unique_reads, spanning_reads) + tuple(mmap_hist) +
                   (isize_prob, -mismatches))
            stats_chimera_dict[key].append(chimera_name)
        # use only the best (largest) key
        best_key = max(stats_chimera_dict)
        for chimera_name in stats_chimera_dict[best_key]:
            encomp_qnames_dict[chimera_name].add(qname)
    # now edit the chimeras using the modified qnames
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                qnames = encomp_qnames_dict[c.name]
                # update encompassing reads
                c.encomp_read_pairs = [pair for pair in c.encomp_read_pairs
                                       if pair[0].qname in qnames]
                # The original opened output_file but never wrote to it,
                # so the edited chimeras were lost. Write them using the
                # same tab-separated to_list() form as the other writers
                # in this file.
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
def resolve_chimeric_reads(input_file, output_file, isize_dist):
    """Assign each encompassing read pair to its best-scoring chimera(s).

    The input is read twice. The first pass records, per chimera, the
    tuple (total unique reads, unique spanning reads, merged multimap
    histogram) and, per read name, the chimeras it supports together with
    its insert-size probability and mismatch count. Each read is then kept
    only for the chimera(s) whose sort key
    (unique reads, spanning reads, *multimap hist, isize prob, -mismatches)
    is the largest. The second pass rewrites every chimera with its
    `encomp_read_pairs` restricted to the reads assigned to it.
    """
    # gather statistics on read alignments and chimeras
    # that will be used to associate reads with chimeras
    read_chimera_dict = collections.defaultdict(list)
    chimera_stats_dict = {}
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # combine multimap hists
            mmap_hist = merge_mmap_hists(c.partner5p.multimap_hist,
                                         c.partner3p.multimap_hist)
            chimera_stats_dict[c.name] = (c.get_total_unique_reads(),
                                          c.get_unique_spanning_reads(),
                                          mmap_hist)
            # get statistics for reads
            for dpair in c.encomp_read_pairs:
                qname = dpair[0].qname
                isize_prob = calc_isize_prob(dpair, c, isize_dist)
                mismatches = dpair[0].mismatches + dpair[1].mismatches
                read_chimera_dict[qname].append((c.name, isize_prob,
                                                 mismatches))
    # now process one read at a time, looking at all its alignments and
    # choose the most likely set of alignments.
    # The per-chimera container must be a set: the original used a list
    # default and then called .add(), which raises AttributeError.
    encomp_qnames_dict = collections.defaultdict(set)
    for qname, read_stats_list in read_chimera_dict.items():
        stats_chimera_dict = collections.defaultdict(list)
        for chimera_name, isize_prob, mismatches in read_stats_list:
            unique_reads, spanning_reads, mmap_hist = \
                chimera_stats_dict[chimera_name]
            key = ((unique_reads, spanning_reads) + tuple(mmap_hist) +
                   (isize_prob, -mismatches))
            stats_chimera_dict[key].append(chimera_name)
        # use only the best (largest) key
        best_key = max(stats_chimera_dict)
        for chimera_name in stats_chimera_dict[best_key]:
            encomp_qnames_dict[chimera_name].add(qname)
    # now edit the chimeras using the modified qnames
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                qnames = encomp_qnames_dict[c.name]
                # update encompassing reads
                c.encomp_read_pairs = [pair for pair in c.encomp_read_pairs
                                       if pair[0].qname in qnames]
                # The original opened output_file but never wrote to it,
                # so the edited chimeras were lost. Write them using the
                # same tab-separated to_list() form as the other writers
                # in this file.
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the top-scoring chimeras per breakpoint group.

    Chimeras are grouped by (5' cluster, 3' cluster, 5' genomic
    breakpoint, 3' genomic breakpoint). Within a group the score is the
    tuple (unique spanning positions, weighted coverage, fragment count);
    every chimera sharing the maximum score is kept.

    Returns a set of chimera names.
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    # Both builders receive their own handle, closed right after use.
    # Assumes each builder consumes the handle before returning; their
    # results are indexed like mappings below.
    with open(gene_file) as fh:
        tx_cluster_map = build_tx_cluster_map(fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as fh:
        tx_genome_map = build_gene_to_genome_map(fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as fh:
        for c in Chimera.parse(fh):
            score = (c.get_num_unique_spanning_positions(),
                     c.get_weighted_cov(),
                     c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            # (the 5' position is passed as end - 1)
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            group_key = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group_key].append((score, c.name))
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for entries in cluster_chimera_dict.values():
        names_by_score = collections.defaultdict(set)
        for score, name in entries:
            names_by_score[score].add(name)
        # ties on the best score are all retained
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the top-scoring chimeras per breakpoint group.

    Chimeras are grouped by (5' cluster, 3' cluster, 5' genomic
    breakpoint, 3' genomic breakpoint). Within a group the score is the
    tuple (unique spanning positions, weighted coverage, fragment count);
    every chimera sharing the maximum score is kept.

    Returns a set of chimera names.
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    # Both builders receive their own handle, closed right after use.
    # Assumes each builder consumes the handle before returning; their
    # results are indexed like mappings below.
    with open(gene_file) as fh:
        tx_cluster_map = build_tx_cluster_map(fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as fh:
        tx_genome_map = build_gene_to_genome_map(fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as fh:
        for c in Chimera.parse(fh):
            score = (c.get_num_unique_spanning_positions(),
                     c.get_weighted_cov(),
                     c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            # (the 5' position is passed as end - 1)
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            group_key = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group_key].append((score, c.name))
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for entries in cluster_chimera_dict.values():
        names_by_score = collections.defaultdict(set)
        for score, name in entries:
            names_by_score[score].add(name)
        # ties on the best score are all retained
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def main():
    """Build a false-positive chimera list from one or more chimera files.

    Every chimera in every input file is counted under the key
    (5' transcript, 5' end, 3' transcript, 3' start). Keys seen at least
    `-n` times are written, tab-separated, to `-o` (stdout by default).
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.txt> [<chimeras2.txt> <chimeras3.txt> ...]")
    parser.add_option("-o", dest="output_file", default=None,
                      help="output file [default=stdout]")
    parser.add_option("-n", dest="num_files", type="int", default=1,
                      help="chimera must be recurrent in N samples "
                      "to make considered a false positive "
                      "[default=%default]")
    options, args = parser.parse_args()
    input_files = args
    # NOTE(review): the count is per chimera record, not per file, so a
    # key repeated inside one file also raises its recurrence -- confirm
    # this matches the "recurrent in N samples" intent.
    false_pos_chimeras = collections.defaultdict(int)
    for input_file in input_files:
        logging.info("Processing file %s" % (input_file))
        num_chimeras = 0
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                key = (c.partner5p.tx_name, c.partner5p.end,
                       c.partner3p.tx_name, c.partner3p.start)
                false_pos_chimeras[key] += 1
                num_chimeras += 1
        logging.info("\tchimeras in file: %d" % (num_chimeras))
        logging.info("\tcurrent false positive candidates: %d" %
                     (len(false_pos_chimeras)))
    if options.output_file is None:
        fileh = sys.stdout
    else:
        fileh = open(options.output_file, "w")
    try:
        for key, recurrence in false_pos_chimeras.items():
            if recurrence >= options.num_files:
                fileh.write('\t'.join(map(str, key)) + '\n')
    finally:
        # never close stdout; only close a file we opened ourselves
        if options.output_file is not None:
            fileh.close()
def parse_sync_by_breakpoint(chimera_file, bam_file):
    """Yield (chimeras, reads) pairs matched on breakpoint name.

    Generator that walks two grouped streams in lock-step: chimeras from
    `chimera_file` grouped by their `breakpoint_name`, and BAM reads from
    `bam_file` grouped by `rname` (translated to a reference name through
    `bamfh.references`). For each chimera group it yields the read group
    with the same name, or an empty list when there is none.

    The read stream is only ever advanced forward using string `<` / `>`
    comparisons, so this presumably requires both inputs to be ordered by
    breakpoint name under the same ordering -- TODO confirm with callers.
    """
    # group reads by reference name (matches breakpoint name)
    bamfh = pysam.Samfile(bam_file, "rb")
    tid_rname_map = list(bamfh.references)
    # initialize iterator through reads
    read_iter = parse_group_by_attr(bamfh, "rname")
    read_iter_valid = True
    try:
        rname, reads = read_iter.next()
        read_breakpoint_name = tid_rname_map[rname]
    except StopIteration:
        # BAM has no reads at all: use a sentinel name so the comparison
        # below can still run. NOTE: bamfh is closed here and closed again
        # after the main loop.
        bamfh.close()
        read_iter_valid = False
        reads = []
        read_breakpoint_name = "ZZZZZZZZZZZZZZ"
    # group chimeras by breakpoint name
    # NOTE: the chimera file handle opened here is never closed explicitly
    for chimera_breakpoint_name, chimeras in \
        parse_group_by_attr(Chimera.parse(open(chimera_file)),
                            "breakpoint_name"):
        # advance the read stream until it catches up with this chimera
        while (read_iter_valid) and (chimera_breakpoint_name > read_breakpoint_name):
            try:
                rname, reads = read_iter.next()
                read_breakpoint_name = tid_rname_map[rname]
            except StopIteration:
                # reads exhausted: read_breakpoint_name keeps its last
                # value, but `reads` is emptied so later chimeras get []
                read_iter_valid = False
                reads = []
        if chimera_breakpoint_name < read_breakpoint_name:
            # read stream is already past this breakpoint: no reads for it
            yield chimeras, []
        else:
            yield chimeras, reads
    bamfh.close()
def nominate_encomp_spanning_reads(chimera_file, output_fastq_file):
    """
    Find all encompassing reads that should be remapped to see if they
    span the breakpoint junction.

    A read is nominated when the chimera breakpoint (5' partner `end` for
    the 5' read, 3' partner `start` for the 3' read) lies strictly between
    its `clipstart` and `clipend`. Each (qname, readnum) is written at
    most once, as a FASTQ record with a constant "I" quality string.

    Returns `config.JOB_SUCCESS`.
    """
    remap_qnames = set()
    with open(output_fastq_file, "w") as fqfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                # find breakpoint coords of chimera
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                for r5p, r3p in c.encomp_read_pairs:
                    # same test for both mates: 5' read against the 5'
                    # breakpoint, then 3' read against the 3' breakpoint
                    for read, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if not (read.clipstart < breakpoint < read.clipend):
                            continue
                        key = (read.qname, read.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        record = to_fastq(read.qname, read.readnum,
                                          read.seq, "I" * len(read.seq))
                        fqfh.write(str(record) + '\n')
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """Write every read that must be remapped against breakpoint junctions.

    Two sources feed the output FASTQ:

    1. Encompassing reads whose clipped interval strictly contains the
       chimera breakpoint (written once per (qname, readnum), with a
       constant "I" quality string).
    2. Both mates of every pair yielded by `parse_pe_reads` over
       `unmapped_bam_file`, written with their own qualities.

    Returns `config.JOB_SUCCESS`.
    """
    remap_qnames = set()
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    # The original never closed the FASTQ handle; the with-block
    # guarantees the output is flushed and closed before returning.
    with open(output_fastq_file, "w") as fqfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints
                breaks5p[c.partner5p.tx_name].append(end5p)
                # 3' breakpoints belong to the 3' transcript (the original
                # filed them under partner5p.tx_name by mistake)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # a read overlapping its breakpoint should be remapped
                    for read, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if not (read.clipstart < breakpoint < read.clipend):
                            continue
                        key = (read.qname, read.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        record = to_fastq(read.qname, read.readnum,
                                          read.seq, "I" * len(read.seq))
                        fqfh.write(str(record) + '\n')
        # sort breakpoint positions within each gene
        # NOTE: these indexes are not consulted yet; see the TODO below.
        for positions in breaks5p.values():
            positions.sort()
        for positions in breaks3p.values():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # NOTE(review): readnum here is the 0/1 mate index, while
                # the encompassing reads above pass read.readnum -- confirm
                # both use the same numbering.
                for readnum in (0, 1):
                    read = pe_reads[readnum][0]
                    record = to_fastq(read.qname, readnum, read.seq,
                                      read.qual)
                    fqfh.write(str(record) + '\n')
            # TODO: only remap reads that overlap a known breakpoint
            # (e.g. bisect over the sorted positions in breaks5p/breaks3p)
            # instead of remapping every pair from the unmapped BAM.
        finally:
            bamfh.close()
    return config.JOB_SUCCESS
def get_highest_coverage_isoforms(input_file, transcripts):
    """Return the names of the highest-fragment chimeras per breakpoint group.

    Chimeras are grouped by (5' cluster id, 3' cluster id, 5' genomic
    breakpoint, 3' genomic breakpoint); within a group every chimera whose
    `get_num_frags()` equals the group maximum is kept.

    `transcripts` is iterated here and also handed to
    `build_transcript_genome_map`, so it must be re-iterable (e.g. a
    list). Returns a set of chimera names.
    """
    # build lookup from transcript name to cluster id
    transcript_cluster_map = dict((str(t.tx_id), t.cluster_id)
                                  for t in transcripts)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # TODO: adjust this to score chimeras differently!
            score = c.get_num_frags()
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # get genomic positions of breakpoints
            # (the 5' position is passed as tx_end_5p - 1)
            coord5p = transcript_to_genome_pos(c.tx_name_5p,
                                               c.tx_end_5p - 1,
                                               transcript_genome_map)
            coord3p = transcript_to_genome_pos(c.tx_name_3p,
                                               c.tx_start_3p,
                                               transcript_genome_map)
            group_key = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group_key].append((score, c.name))
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for entries in cluster_chimera_dict.values():
        names_by_score = collections.defaultdict(set)
        for score, name in entries:
            names_by_score[score].add(name)
        # ties on the best score are all retained
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def make_discordant_read_stats_file(chimera_file, output_file, isize_dist):
    """Write one `ChimeraStats` row per encompassing fragment of each chimera.

    Each row combines per-chimera counts (unique positions, fragments
    with `maxnumhits=1`, spanning fragments) with per-fragment values:
    alignment ids/positions of both mates, the negated mismatch total and
    the insert-size probability from `calc_isize_prob`. Rows are written
    tab-separated to `output_file`.
    """
    with open(output_file, "w") as outfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                # get number of unique alignment positions
                num_uniquely_aligning_frags = c.get_num_unique_positions()
                # get number of unambiguous reads
                num_unambiguous_frags = c.get_num_frags(maxnumhits=1)
                # number of spanning frags
                num_spanning_frags = c.get_num_spanning_frags()
                for dpair in c.encomp_frags:
                    frag5p, frag3p = dpair[0], dpair[1]
                    # putative insert size: distance from the 5' mate to
                    # the 5' breakpoint plus distance from the 3'
                    # breakpoint to the 3' mate
                    isize5p = c.tx_end_5p - frag5p.pos
                    isize3p = frag3p.pos - c.tx_start_3p
                    isize = isize5p + isize3p
                    # make ChimeraStats object
                    s = ChimeraStats()
                    s.qname = frag5p.qname
                    s.tid5p = frag5p.tid
                    s.pos5p = frag5p.pos
                    s.tid3p = frag3p.tid
                    s.pos3p = frag3p.pos
                    s.chimera_name = c.name
                    s.num_spanning_frags = num_spanning_frags
                    s.num_unambiguous_frags = num_unambiguous_frags
                    s.num_uniquely_aligning_frags = \
                        num_uniquely_aligning_frags
                    # negated so that "larger" means fewer mismatches
                    s.neg_mismatches = -(frag5p.mismatches +
                                         frag3p.mismatches)
                    s.isize_prob = calc_isize_prob(isize, isize_dist)
                    # output to file
                    outfh.write('\t'.join(map(str, s.to_list())) + '\n')
def nominate_spanning_reads2(discordant_reads_fh, chimeras_fh, fastq_fh):
    """Emit FASTQ records for discordant fragments near chimera breakpoints.

    Chimera 5' ends and 3' starts are indexed per transcript name, then
    the discordant fragments are walked in order. For each run of
    consecutive fragments sharing a qname, the first non-None record
    returned by `check_fragment` for each mate is printed to `fastq_fh`.
    Fragments whose `discordant_type.is_genome` is true are ignored.
    """
    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    ends_5p = collections.defaultdict(list)
    starts_3p = collections.defaultdict(list)
    for candidate in Chimera.parse(chimeras_fh):
        ends_5p[candidate.mate5p.tx_name].append(candidate.mate5p.end)
        starts_3p[candidate.mate3p.tx_name].append(candidate.mate3p.start)

    # slot 0 / slot 1 hold the pending records for read 1 / read 2
    pending = [None, None]

    def flush_pending():
        # write whichever mates were found for the current qname
        for record in pending:
            if record is not None:
                print >> fastq_fh, record

    # parse discordant reads
    logging.info("Nominating spanning reads")
    last_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        current_qname = frag.qname
        if (last_qname is not None) and (current_qname != last_qname):
            # a new fragment name starts: emit and reset
            flush_pending()
            pending[0] = None
            pending[1] = None
        # skip if reads already found
        if (pending[0] is not None) and (pending[1] is not None):
            continue
        # update read fastq, never overwriting a record already found
        found1, found2 = check_fragment(frag, ends_5p, starts_3p)
        if pending[0] is None:
            pending[0] = found1
        if pending[1] is None:
            pending[1] = found2
        last_qname = current_qname
    # emit whatever is left for the final qname
    flush_pending()
def make_discordant_read_stats_file(chimera_file, output_file, isize_dist):
    """Write one `ChimeraStats` row per encompassing fragment of each chimera.

    Each row combines per-chimera counts (unique positions, fragments
    with `maxnumhits=1`, spanning fragments) with per-fragment values:
    alignment ids/positions of both mates, the negated mismatch total and
    the insert-size probability from `calc_isize_prob`. Rows are written
    tab-separated to `output_file`.
    """
    with open(output_file, "w") as outfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                # get number of unique alignment positions
                num_uniquely_aligning_frags = c.get_num_unique_positions()
                # get number of unambiguous reads
                num_unambiguous_frags = c.get_num_frags(maxnumhits=1)
                # number of spanning frags
                num_spanning_frags = c.get_num_spanning_frags()
                for dpair in c.encomp_frags:
                    frag5p, frag3p = dpair[0], dpair[1]
                    # putative insert size: distance from the 5' mate to
                    # the 5' breakpoint plus distance from the 3'
                    # breakpoint to the 3' mate
                    isize5p = c.tx_end_5p - frag5p.pos
                    isize3p = frag3p.pos - c.tx_start_3p
                    isize = isize5p + isize3p
                    # make ChimeraStats object
                    s = ChimeraStats()
                    s.qname = frag5p.qname
                    s.tid5p = frag5p.tid
                    s.pos5p = frag5p.pos
                    s.tid3p = frag3p.tid
                    s.pos3p = frag3p.pos
                    s.chimera_name = c.name
                    s.num_spanning_frags = num_spanning_frags
                    s.num_unambiguous_frags = num_unambiguous_frags
                    s.num_uniquely_aligning_frags = \
                        num_uniquely_aligning_frags
                    # negated so that "larger" means fewer mismatches
                    s.neg_mismatches = -(frag5p.mismatches +
                                         frag3p.mismatches)
                    s.isize_prob = calc_isize_prob(isize, isize_dist)
                    # output to file
                    outfh.write('\t'.join(map(str, s.to_list())) + '\n')
def get_highest_coverage_isoforms(input_file, transcripts):
    """Return the names of the highest-fragment chimeras per breakpoint group.

    Chimeras are grouped by (5' cluster id, 3' cluster id, 5' genomic
    breakpoint, 3' genomic breakpoint); within a group every chimera whose
    `get_num_frags()` equals the group maximum is kept.

    `transcripts` is iterated here and also handed to
    `build_transcript_genome_map`, so it must be re-iterable (e.g. a
    list). Returns a set of chimera names.
    """
    # build lookup from transcript name to cluster id
    transcript_cluster_map = dict((str(t.tx_id), t.cluster_id)
                                  for t in transcripts)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # TODO: adjust this to score chimeras differently!
            score = c.get_num_frags()
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # get genomic positions of breakpoints
            # (the 5' position is passed as tx_end_5p - 1)
            coord5p = transcript_to_genome_pos(c.tx_name_5p,
                                               c.tx_end_5p - 1,
                                               transcript_genome_map)
            coord3p = transcript_to_genome_pos(c.tx_name_3p,
                                               c.tx_start_3p,
                                               transcript_genome_map)
            group_key = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group_key].append((score, c.name))
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for entries in cluster_chimera_dict.values():
        names_by_score = collections.defaultdict(set)
        for score, name in entries:
            names_by_score[score].add(name)
        # ties on the best score are all retained
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def nominate_encomp_spanning_reads(chimera_file, output_fastq_file):
    """
    Find all encompassing reads that should be remapped to see if they
    span the breakpoint junction.

    A read is nominated when the chimera breakpoint (5' partner `end` for
    the 5' read, 3' partner `start` for the 3' read) lies strictly between
    its `clipstart` and `clipend`. Each (qname, readnum) is written at
    most once, as a FASTQ record with a constant "I" quality string.

    Returns `config.JOB_SUCCESS`.
    """
    remap_qnames = set()
    with open(output_fastq_file, "w") as fqfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                # find breakpoint coords of chimera
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                for r5p, r3p in c.encomp_read_pairs:
                    # same test for both mates: 5' read against the 5'
                    # breakpoint, then 3' read against the 3' breakpoint
                    for read, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if not (read.clipstart < breakpoint < read.clipend):
                            continue
                        key = (read.qname, read.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        record = to_fastq(read.qname, read.readnum,
                                          read.seq, "I" * len(read.seq))
                        fqfh.write(str(record) + '\n')
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """Write every read that must be remapped against breakpoint junctions.

    Two sources feed the output FASTQ:

    1. Encompassing reads whose clipped interval strictly contains the
       chimera breakpoint (written once per (qname, readnum), with a
       constant "I" quality string).
    2. Both mates of every pair yielded by `parse_pe_reads` over
       `unmapped_bam_file`, written with their own qualities.

    Returns `config.JOB_SUCCESS`.
    """
    remap_qnames = set()
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    # The original never closed the FASTQ handle; the with-block
    # guarantees the output is flushed and closed before returning.
    with open(output_fastq_file, "w") as fqfh:
        with open(chimera_file) as infh:
            for c in Chimera.parse(infh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints
                breaks5p[c.partner5p.tx_name].append(end5p)
                # 3' breakpoints belong to the 3' transcript (the original
                # filed them under partner5p.tx_name by mistake)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # a read overlapping its breakpoint should be remapped
                    for read, breakpoint in ((r5p, end5p), (r3p, start3p)):
                        if not (read.clipstart < breakpoint < read.clipend):
                            continue
                        key = (read.qname, read.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        record = to_fastq(read.qname, read.readnum,
                                          read.seq, "I" * len(read.seq))
                        fqfh.write(str(record) + '\n')
        # sort breakpoint positions within each gene
        # NOTE: these indexes are not consulted yet; see the TODO below.
        for positions in breaks5p.values():
            positions.sort()
        for positions in breaks3p.values():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                # NOTE(review): readnum here is the 0/1 mate index, while
                # the encompassing reads above pass read.readnum -- confirm
                # both use the same numbering.
                for readnum in (0, 1):
                    read = pe_reads[readnum][0]
                    record = to_fastq(read.qname, readnum, read.seq,
                                      read.qual)
                    fqfh.write(str(record) + '\n')
            # TODO: only remap reads that overlap a known breakpoint
            # (e.g. bisect over the sorted positions in breaks5p/breaks3p)
            # instead of remapping every pair from the unmapped BAM.
        finally:
            bamfh.close()
    return config.JOB_SUCCESS
def get_chimera_groups(input_file, tx_id_map):
    """Yield ((cluster5p, cluster3p), chimeras) groups from `input_file`.

    Every chimera is read up front and bucketed by the `cluster_id` of
    its 5' and 3' transcripts (looked up in `tx_id_map` by transcript
    name). The grouping key is the cluster pair only; breakpoint
    coordinates are not part of it.
    """
    cluster_chimera_dict = collections.defaultdict(list)
    # the whole file is consumed before the first group is yielded, so
    # the handle can be closed immediately afterwards
    with open(input_file) as infh:
        for c in Chimera.parse(infh):
            # get cluster of overlapping genes
            cluster5p = tx_id_map[c.tx_name_5p].cluster_id
            cluster3p = tx_id_map[c.tx_name_3p].cluster_id
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p)].append(c)
    for key, chimeras in cluster_chimera_dict.items():
        yield key, chimeras
def parse_chimeras_by_gene(chimera_file, orientation):
    """Yield (tx_name, chimeras) for each run of consecutive chimeras.

    The transcript name is `tx_name_5p` when `orientation` equals
    `OrientationTags.FIVEPRIME`, otherwise `tx_name_3p`. Only adjacent
    records are merged, so a name that reappears later in the file starts
    a new group.
    """
    group = []
    group_name = None
    for chimera in Chimera.parse(open(chimera_file)):
        if orientation == OrientationTags.FIVEPRIME:
            name = chimera.tx_name_5p
        else:
            name = chimera.tx_name_3p
        if group_name != name:
            # transcript changed: hand over the finished group, if any
            if len(group) > 0:
                yield group_name, group
                group = []
            group_name = name
        group.append(chimera)
    # trailing group
    if len(group) > 0:
        yield group_name, group
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Copy to `output_file` only the chimeras chosen as best isoforms.

    The set of names to keep comes from `get_highest_coverage_isoforms`,
    called with `input_file` and the gene feature file found under
    `index_dir`. Returns `config.JOB_SUCCESS`.
    """
    # find highest coverage chimeras among isoforms
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    kept_chimeras = get_highest_coverage_isoforms(input_file, gene_file)
    num_filtered_chimeras = 0
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def filter_encompassing_chimeras(input_file, output_file, min_frags):
    """Copy chimeras with at least `min_frags` fragments to `output_file`.

    A chimera is dropped when `get_num_frags()` is below `min_frags`.
    Total and kept counts are logged at debug level. Returns
    `config.JOB_SUCCESS`.
    """
    num_chimeras = 0
    num_filtered_chimeras = 0
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                if c.get_num_frags() < min_frags:
                    continue
                num_filtered_chimeras += 1
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    logging.debug("\tchimeras: %d" % (num_chimeras))
    logging.debug("\tfiltered chimeras: %d" % (num_filtered_chimeras))
    return config.JOB_SUCCESS
def parse_chimeras_by_gene(chimera_file, orientation):
    """Yield (tx_name, chimeras) for each run of consecutive chimeras.

    The transcript name is `tx_name_5p` when `orientation` equals
    `OrientationTags.FIVEPRIME`, otherwise `tx_name_3p`. Only adjacent
    records are merged, so a name that reappears later in the file starts
    a new group.
    """
    group = []
    group_name = None
    for chimera in Chimera.parse(open(chimera_file)):
        if orientation == OrientationTags.FIVEPRIME:
            name = chimera.tx_name_5p
        else:
            name = chimera.tx_name_3p
        if group_name != name:
            # transcript changed: hand over the finished group, if any
            if len(group) > 0:
                yield group_name, group
                group = []
            group_name = name
        group.append(chimera)
    # trailing group
    if len(group) > 0:
        yield group_name, group
def filter_encompassing_chimeras(input_file, output_file, min_frags):
    """Copy chimeras with at least `min_frags` fragments to `output_file`.

    A chimera is dropped when `get_num_frags()` is below `min_frags`.
    Total and kept counts are logged at debug level. Returns
    `config.JOB_SUCCESS`.
    """
    num_chimeras = 0
    num_filtered_chimeras = 0
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                if c.get_num_frags() < min_frags:
                    continue
                num_filtered_chimeras += 1
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    logging.debug("\tchimeras: %d" % (num_chimeras))
    logging.debug("\tfiltered chimeras: %d" % (num_filtered_chimeras))
    return config.JOB_SUCCESS
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Copy to `output_file` only the chimeras chosen as best isoforms.

    Transcripts are loaded from the transcript feature file under
    `index_dir` and passed, with `input_file`, to
    `get_highest_coverage_isoforms`, which returns the names to keep.
    Returns `config.JOB_SUCCESS`.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    # materialise the list inside the with-block so the handle can be
    # closed as soon as parsing is done
    with open(transcript_file) as fh:
        transcripts = list(TranscriptFeature.parse(fh))
    # find highest coverage chimeras among isoforms
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    """Copy to `output_file` only the chimeras chosen as best isoforms.

    Transcripts are loaded from the transcript feature file under
    `index_dir` and passed, with `input_file`, to
    `get_highest_coverage_isoforms`, which returns the names to keep.
    Returns `config.JOB_SUCCESS`.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    # materialise the list inside the with-block so the handle can be
    # closed as soon as parsing is done
    with open(transcript_file) as fh:
        transcripts = list(TranscriptFeature.parse(fh))
    # find highest coverage chimeras among isoforms
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in kept_chimeras:
                    num_filtered_chimeras += 1
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    logging.debug("\tAfter choosing best isoform: %d" %
                  num_filtered_chimeras)
    return config.JOB_SUCCESS
def read_pairs_to_chimera(chimera_name, tid5p, tid3p, readpairs,
                          tid_tx_map, genome_tx_trees, trim_bp):
    """Assemble a `Chimera` from the discordant read pairs supporting it.

    The 5' and 3' transcripts are looked up in `tid_tx_map`, the chimera
    type and distance come from `get_chimera_type`, and each partner is
    built by `ChimeraPartner.from_discordant_reads` from the first
    (5') or second (3') element of every pair in `readpairs`. The pairs
    themselves are stored on the result as `encomp_read_pairs`.
    """
    # get gene information
    transcript_5p = tid_tx_map[tid5p]
    transcript_3p = tid_tx_map[tid3p]
    # categorize chimera type
    kind, dist = get_chimera_type(transcript_5p, transcript_3p,
                                  genome_tx_trees)
    # lazy views over the 5' and 3' mates of each pair
    mates_5p = (pair[0] for pair in readpairs)
    mates_3p = (pair[1] for pair in readpairs)
    # create chimera object
    chimera = Chimera()
    chimera.partner5p = ChimeraPartner.from_discordant_reads(
        mates_5p, transcript_5p, trim_bp)
    chimera.partner3p = ChimeraPartner.from_discordant_reads(
        mates_3p, transcript_3p, trim_bp)
    chimera.name = chimera_name
    chimera.chimera_type = kind
    chimera.distance = dist
    # raw reads
    chimera.encomp_read_pairs = readpairs
    return chimera
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    unique_frags, isoform_fraction, false_pos_file):
    """Write to `output_file` the chimeras that pass every filter.

    Filters are applied in order, stopping at the first failure:
    `filter_unique_frags`, the false-positive lookup on
    (tx_name_5p, tx_end_5p, tx_name_3p, tx_start_3p), and
    `filter_chimeric_isoform_fraction` (which is given the open BAM).

    `false_pos_file` may be None or "" to disable the false-positive
    lookup. `index_dir` is accepted but not used here. Returns
    `config.JOB_SUCCESS`.
    """
    logging.debug("Parameters")
    logging.debug("\tunique fragments: %f" % (unique_frags))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list
    # (the original tested `is not ""`, an identity check that only works
    # when the interpreter happens to intern the empty string)
    if (false_pos_file is not None) and (false_pos_file != ""):
        logging.debug("Loading false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    try:
        # filter chimeras
        logging.debug("Filtering chimeras")
        num_chimeras = 0
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    num_chimeras += 1
                    if not filter_unique_frags(c, unique_frags):
                        continue
                    false_pos_key = (c.tx_name_5p, c.tx_end_5p,
                                     c.tx_name_3p, c.tx_start_3p)
                    if false_pos_key in false_pos_pairs:
                        continue
                    if not filter_chimeric_isoform_fraction(
                            c, isoform_fraction, bamfh):
                        continue
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
                    num_filtered_chimeras += 1
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras
        del false_pos_pairs
    finally:
        bamfh.close()
    return config.JOB_SUCCESS
def calc_chimera_pvalues(input_file, bam_file, num_mapped_reads,
                         num_discordant_reads_within_isize_range):
    """Print expected discordant read counts for each chimera partner.

    For every chimera, counts the distinct read names fetched from the
    BAM over the 5' and 3' transcript intervals, multiplies each count by
    the overall discordant fraction and prints, space-separated:
    gene_name_5p, gene_name_3p, reads 5', reads 3', expected discordant
    5', expected discordant 3'.

    Despite its name the function only prints these values; it neither
    computes nor returns a p-value.
    """
    # fraction of mapped reads that are discordant (a plain ratio, not a
    # percentage and not per-million)
    percent_discordant = (num_discordant_reads_within_isize_range /
                          float(num_mapped_reads))
    # open BAM file for checking wild-type isoforms
    bamfh = pysam.Samfile(bam_file, "rb")
    try:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                # count 5' and 3' reads
                rname5p = config.GENE_REF_PREFIX + c.tx_name_5p
                rname3p = config.GENE_REF_PREFIX + c.tx_name_3p
                num_reads_5p = len(set(
                    r.qname for r in bamfh.fetch(rname5p, c.tx_start_5p,
                                                 c.tx_end_5p)))
                num_reads_3p = len(set(
                    r.qname for r in bamfh.fetch(rname3p, c.tx_start_3p,
                                                 c.tx_end_3p)))
                # expected number of discordant reads
                exp_discordant_5p = num_reads_5p * percent_discordant
                exp_discordant_3p = num_reads_3p * percent_discordant
                fields = (c.gene_name_5p, c.gene_name_3p,
                          num_reads_5p, num_reads_3p,
                          exp_discordant_5p, exp_discordant_3p)
                # one pre-joined string prints identically as a print
                # statement or a print() call
                print(" ".join(map(str, fields)))
    finally:
        bamfh.close()
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Sort chimeras by breakpoint and emit one record per breakpoint.

    `input_file` is sorted on its breakpoint-name column into
    `breakpoint_sorted_chimera_file` with `batch_sort`. The sorted file is
    then scanned and, for each run of chimeras sharing a breakpoint name:

    * a FASTA record (name + `split_seq` of the 5'+3' breakpoint sequence
      of the first chimera in the run) goes to `breakpoint_fasta_file`;
    * a line "name<TAB>sequence<TAB>comma-joined sorted chimera names"
      goes to `breakpoint_map_file`.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]
    tempdirs = [tmp_dir]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=tempdirs)
    # parse and build breakpoint -> chimera map
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:

            def write_breakpoint(name, seq, names):
                # Single writer for every group. The original sorted the
                # names for all groups except the last, leaving that
                # line's order dependent on set iteration.
                fastafh.write(">%s\n%s\n" % (name, split_seq(seq)))
                mapfh.write("%s\t%s\t%s\n" %
                            (name, seq, ",".join(sorted(names))))

            prev_breakpoint_name = None
            prev_seq = None
            chimera_names = set()
            with open(breakpoint_sorted_chimera_file) as infh:
                for c in Chimera.parse(infh):
                    seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                    if c.breakpoint_name != prev_breakpoint_name:
                        if len(chimera_names) > 0:
                            write_breakpoint(prev_breakpoint_name,
                                             prev_seq, chimera_names)
                            chimera_names = set()
                        prev_seq = seq
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
            # flush the final breakpoint group
            if len(chimera_names) > 0:
                write_breakpoint(prev_breakpoint_name, prev_seq,
                                 chimera_names)
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    unique_frags, isoform_fraction, false_pos_file):
    """Write the chimeras from ``input_file`` that pass all filters.

    A chimera is kept when it passes ``filter_unique_frags``, is absent from
    the false-positive list, and passes ``filter_chimeric_isoform_fraction``
    (in that order; later checks are skipped once one fails).

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: path receiving one tab-separated line per kept chimera.
        index_dir: not referenced by this function; kept so the signature
            stays unchanged for callers.
        bam_file: BAM file handed to ``filter_chimeric_isoform_fraction``.
        unique_frags: threshold passed to ``filter_unique_frags``.
        isoform_fraction: threshold passed to
            ``filter_chimeric_isoform_fraction``.
        false_pos_file: path of a false-positive list, or ``None``/``""``
            to disable that filter.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.debug("Parameters")
    logging.debug("\tunique fragments: %f" % (unique_frags))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; truthiness covers both None and ""
    # (the old `is not ""` compared object identity, not string value)
    if false_pos_file:
        logging.debug("Loading false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # filter chimeras
    logging.debug("Filtering chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    try:
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    num_chimeras += 1
                    if not filter_unique_frags(c, unique_frags):
                        continue
                    false_pos_key = (c.tx_name_5p, c.tx_end_5p,
                                     c.tx_name_3p, c.tx_start_3p)
                    if false_pos_key in false_pos_pairs:
                        continue
                    if not filter_chimeric_isoform_fraction(
                            c, isoform_fraction, bamfh):
                        continue
                    outfh.write('\t'.join(map(str, c.to_list())) + '\n')
                    num_filtered_chimeras += 1
    finally:
        # release the BAM handle even when a filter raises
        bamfh.close()
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
    return config.JOB_SUCCESS
def filter_chimeras(input_file, output_file, filter_num_frags,
                    filter_allele_fraction, mask_biotypes, mask_rnames):
    """Filter chimeras by fragment count, allele fraction and masks.

    The output starts with a ``#``-prefixed header built from
    ``Chimera._fields``, followed by ``str(c)`` for every kept chimera.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: path of the filtered chimera file to write.
        filter_num_frags: chimeras with ``num_frags`` below this are dropped.
        filter_allele_fraction: chimeras whose smaller (5' or 3') allele
            fraction is below this are dropped.
        mask_biotypes: set-like object (``intersection`` is called on it);
            chimeras with any 5' or 3' biotype in it are dropped.
        mask_rnames: container of reference names; chimeras whose 5' or 3'
            reference is in it are dropped.

    Returns:
        ``config.JOB_SUCCESS``.

    Raises:
        ZeroDivisionError: if a chimera that reaches the allele-fraction
            check has zero discordant plus concordant fragments on one side.
    """
    logging.debug("\tfragments: %f" % (filter_num_frags))
    logging.debug("\tallele fraction: %f" % (filter_allele_fraction))
    logging.debug("\tmask biotypes: %s" % (','.join(sorted(mask_biotypes))))
    logging.debug("\tmask references: %s" % (','.join(sorted(mask_rnames))))
    # filter chimeras
    num_chimeras = 0
    num_kept_chimeras = 0
    # context managers close both files even if a record fails to parse
    with open(output_file, "w") as outfh:
        outfh.write('#' + '\t'.join(Chimera._fields) + '\n')
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                # number of fragments
                if c.num_frags < filter_num_frags:
                    continue
                # allele fraction: chimeric fragments relative to all
                # fragments counted on each side; the smaller side decides
                allele_fraction_5p = float(c.num_frags) / (
                    c.num_discordant_frags_5p + c.num_concordant_frags_5p)
                allele_fraction_3p = float(c.num_frags) / (
                    c.num_discordant_frags_3p + c.num_concordant_frags_3p)
                allele_fraction = min(allele_fraction_5p, allele_fraction_3p)
                if allele_fraction < filter_allele_fraction:
                    continue
                # masked biotypes and references
                if len(mask_biotypes.intersection(c.biotypes_5p)) > 0:
                    continue
                if len(mask_biotypes.intersection(c.biotypes_3p)) > 0:
                    continue
                if c.rname5p in mask_rnames:
                    continue
                if c.rname3p in mask_rnames:
                    continue
                outfh.write(str(c) + '\n')
                num_kept_chimeras += 1
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Kept chimeras: %d" % num_kept_chimeras)
    return config.JOB_SUCCESS
def filter_chimeras(input_file, output_file, filter_num_frags,
                    filter_allele_fraction, mask_biotypes, mask_rnames):
    """Filter chimeras by fragment count, allele fraction and masks.

    The output starts with a ``#``-prefixed header built from
    ``Chimera._fields``, followed by ``str(c)`` for every kept chimera.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: path of the filtered chimera file to write.
        filter_num_frags: minimum ``num_frags`` a chimera needs to be kept.
        filter_allele_fraction: minimum value required of the smaller of the
            5' and 3' allele fractions.
        mask_biotypes: set-like object (``intersection`` is called on it) of
            biotypes that disqualify a chimera on either side.
        mask_rnames: container of reference names that disqualify a chimera
            on either side.

    Returns:
        ``config.JOB_SUCCESS``.

    Raises:
        ZeroDivisionError: if a chimera that reaches the allele-fraction
            check has zero discordant plus concordant fragments on one side.
    """
    logging.debug("\tfragments: %f" % (filter_num_frags))
    logging.debug("\tallele fraction: %f" % (filter_allele_fraction))
    logging.debug("\tmask biotypes: %s" % (','.join(sorted(mask_biotypes))))
    logging.debug("\tmask references: %s" % (','.join(sorted(mask_rnames))))

    def is_masked(c):
        # same check order as before: 5' biotypes, 3' biotypes, 5' ref, 3' ref
        if len(mask_biotypes.intersection(c.biotypes_5p)) > 0:
            return True
        if len(mask_biotypes.intersection(c.biotypes_3p)) > 0:
            return True
        return (c.rname5p in mask_rnames) or (c.rname3p in mask_rnames)

    # filter chimeras
    num_chimeras = 0
    num_kept_chimeras = 0
    # both files are closed on exit from the with-blocks, including on error
    with open(output_file, "w") as outfh:
        outfh.write('#' + '\t'.join(Chimera._fields) + '\n')
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                num_chimeras += 1
                # number of fragments
                if c.num_frags < filter_num_frags:
                    continue
                # allele fraction (the limiting side is the smaller one)
                allele_fraction_5p = float(c.num_frags) / (
                    c.num_discordant_frags_5p + c.num_concordant_frags_5p)
                allele_fraction_3p = float(c.num_frags) / (
                    c.num_discordant_frags_3p + c.num_concordant_frags_3p)
                if min(allele_fraction_5p,
                       allele_fraction_3p) < filter_allele_fraction:
                    continue
                # masked biotypes and references
                if is_masked(c):
                    continue
                outfh.write(str(c) + '\n')
                num_kept_chimeras += 1
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Kept chimeras: %d" % num_kept_chimeras)
    return config.JOB_SUCCESS
def parse_sync_chimeras_read_stats(chimera_file, read_stats_file):
    """Yield ``(chimera, stats)`` pairs by walking two files in lockstep.

    Read statistics are grouped by their ``chimera_name`` attribute and
    advanced until the group name is no longer smaller than the current
    chimera name.  A chimera without a matching group is paired with an
    empty list.

    The lockstep comparison only pairs records correctly if both files are
    in ascending chimera-name order -- presumably guaranteed by the caller;
    confirm.

    Args:
        chimera_file: path to a file readable by ``Chimera.parse``.
        read_stats_file: path to a file readable by ``ChimeraStats.parse``.

    Yields:
        Tuples ``(chimera, stats)`` where ``stats`` is the matching group
        produced by ``group_by_attr`` or an empty list.
    """
    # group reads by chimera name
    read_stats_iter = group_by_attr(ChimeraStats.parse(open(read_stats_file)),
                                    'chimera_name')
    # Bind the name up front: with an empty stats file the first fetch
    # raises StopIteration and the name used to stay unbound, which made
    # the comparison below fail with UnboundLocalError.
    read_chimera_name = None
    stats = []
    iter_valid = True
    try:
        read_chimera_name, stats = next(read_stats_iter)
    except StopIteration:
        iter_valid = False
    # group chimeras by name
    for c in Chimera.parse(open(chimera_file)):
        # skip stats groups that sort before the current chimera
        while iter_valid and (c.name > read_chimera_name):
            try:
                read_chimera_name, stats = next(read_stats_iter)
            except StopIteration:
                iter_valid = False
                stats = []
        if iter_valid and (c.name < read_chimera_name):
            # the current stats group belongs to a later chimera
            yield c, []
        else:
            # either an exact name match, or the stats are exhausted and
            # `stats` is already the empty list
            yield c, stats
def chimeras_to_breakpoints(input_file, breakpoint_sorted_chimera_file,
                            breakpoint_map_file, breakpoint_fasta_file,
                            tmp_dir):
    """Group chimeras by breakpoint and write breakpoint FASTA and map files.

    ``input_file`` is sorted by breakpoint name into
    ``breakpoint_sorted_chimera_file``; runs of chimeras with the same
    breakpoint name then become a single breakpoint record.

    Args:
        input_file: chimera file to sort and group.
        breakpoint_sorted_chimera_file: output path for the sorted chimeras.
        breakpoint_map_file: output path; each line holds the breakpoint
            name, its sequence, and the comma-joined sorted chimera names,
            separated by tabs.
        breakpoint_fasta_file: output path; one FASTA record per breakpoint,
            with the sequence passed through ``split_seq``.
        tmp_dir: directory handed to ``batch_sort`` for temporary files.
    """
    # sort chimera file by breakpoint name
    def sortfunc(line):
        fields = line.strip().split('\t')
        return fields[Chimera.BREAKPOINT_NAME_FIELD]

    def emit(fastafh, mapfh, name, seq, names):
        # write to fasta
        fastafh.write(">%s\n%s\n" % (name, split_seq(seq)))
        # write to map file; sorting here also covers the final flush,
        # which previously wrote the names in arbitrary set order
        mapfh.write("%s\t%s\t%s\n" % (name, seq, ",".join(sorted(names))))

    tempdirs = [tmp_dir]
    batch_sort(input=input_file,
               output=breakpoint_sorted_chimera_file,
               key=sortfunc,
               buffer_size=32000,
               tempdirs=tempdirs)
    # parse and build breakpoint -> chimera map
    with open(breakpoint_fasta_file, "w") as fastafh:
        with open(breakpoint_map_file, "w") as mapfh:
            with open(breakpoint_sorted_chimera_file) as sortedfh:
                prev_breakpoint_name = None
                prev_seq = None
                chimera_names = set()
                for c in Chimera.parse(sortedfh):
                    seq = c.breakpoint_seq_5p + c.breakpoint_seq_3p
                    if c.breakpoint_name != prev_breakpoint_name:
                        if len(chimera_names) > 0:
                            emit(fastafh, mapfh, prev_breakpoint_name,
                                 prev_seq, chimera_names)
                            chimera_names = set()
                        prev_seq = seq
                        prev_breakpoint_name = c.breakpoint_name
                    chimera_names.add(c.name)
                # the last group has no following record to trigger a flush
                if len(chimera_names) > 0:
                    emit(fastafh, mapfh, prev_breakpoint_name, prev_seq,
                         chimera_names)
def get_chimera_groups(input_file, gene_file):
    """Yield chimeras grouped by their (5' cluster, 3' cluster) pair.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        gene_file: gene annotation file given to
            ``build_transcript_cluster_map``.

    Yields:
        ``(key, chimeras)`` where ``key`` is the ``(cluster5p, cluster3p)``
        tuple and ``chimeras`` is the list of chimeras sharing it.

    Raises:
        KeyError: if a chimera's transcript name is missing from the
            transcript -> cluster map.
    """
    # lookup table: transcript name -> cluster of overlapping genes
    tx_to_cluster = build_transcript_cluster_map(open(gene_file))
    # TODO: grouping is by gene cluster pair only; it could instead use the
    # exact genomic breakpoint, which would need a transcript -> genome
    # coordinate map built from the gene file as well
    grouped = collections.defaultdict(list)
    for chimera in Chimera.parse(open(input_file)):
        pair_key = (tx_to_cluster[chimera.tx_name_5p],
                    tx_to_cluster[chimera.tx_name_3p])
        grouped[pair_key].append(chimera)
    for group in grouped.iteritems():
        yield group
def make_chimera(cluster_pair, cluster_shelve, transcript_dict,
                 genome_tx_trees, annotation_source):
    """Build a ``Chimera`` from a pair of 5'/3' clusters.

    Args:
        cluster_pair: object providing ``id5p``, ``id3p``, ``pair_id``,
            ``qnames`` and ``spanning_qnames``.
        cluster_shelve: mapping from the string form of a cluster id to the
            cluster object.
        transcript_dict: passed through to ``lookup_transcripts`` and
            ``get_chimera_type``.
        genome_tx_trees: passed through to ``lookup_transcripts`` and
            ``get_chimera_type``.
        annotation_source: passed through to ``get_transcript_info``.

    Returns:
        The populated ``Chimera`` instance.
    """
    # resolve both clusters and the transcripts they touch
    five = cluster_shelve[str(cluster_pair.id5p)]
    three = cluster_shelve[str(cluster_pair.id3p)]
    txs5p = lookup_transcripts(five, transcript_dict, genome_tx_trees)
    txs3p = lookup_transcripts(three, transcript_dict, genome_tx_trees)
    # classify the chimera and measure the distance between partners
    kind, dist = get_chimera_type(five, three, txs5p, txs3p,
                                  transcript_dict, genome_tx_trees)
    # transcript names, gene names and biotypes for each side
    tx_names_5p, gene_names_5p, biotypes_5p = get_transcript_info(
        txs5p, annotation_source)
    tx_names_3p, gene_names_3p, biotypes_3p = get_transcript_info(
        txs3p, annotation_source)
    # a fragment counts once even if it is both discordant and spanning
    supporting = set(cluster_pair.qnames).union(cluster_pair.spanning_qnames)
    chimera = Chimera()
    # attributes are applied in the listed order
    attributes = [
        ("rname5p", five.rname),
        ("start5p", five.start),
        ("end5p", five.end),
        ("rname3p", three.rname),
        ("start3p", three.start),
        ("end3p", three.end),
        ("chimera_id", "CHIMERA%d" % (cluster_pair.pair_id)),
        ("num_frags", len(supporting)),
        ("strand5p", five.strand),
        ("strand3p", three.strand),
        ("chimera_type", kind),
        ("distance", dist),
        ("num_discordant_frags", len(cluster_pair.qnames)),
        ("num_spanning_frags", len(cluster_pair.spanning_qnames)),
        ("num_discordant_frags_5p", len(five.qnames)),
        ("num_discordant_frags_3p", len(three.qnames)),
        ("num_concordant_frags_5p", five.concordant_frags),
        ("num_concordant_frags_3p", three.concordant_frags),
        ("biotypes_5p", sorted(biotypes_5p)),
        ("biotypes_3p", sorted(biotypes_3p)),
        ("genes_5p", sorted(gene_names_5p)),
        ("genes_3p", sorted(gene_names_3p)),
        ("transcripts_5p", sorted(tx_names_5p)),
        ("transcripts_3p", sorted(tx_names_3p)),
    ]
    for attr_name, attr_value in attributes:
        setattr(chimera, attr_name, attr_value)
    return chimera
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    """Filter chimeras, then keep only the best isoform of each.

    Pass 1 writes to a temporary file every chimera that passes
    ``filter_weighted_frags``, ``filter_inner_dist``, the false-positive
    list and ``filter_chimeric_isoform_fraction`` (checked in that order).
    Pass 2 copies to ``output_file`` the chimeras whose name is returned by
    ``get_highest_coverage_isoforms``.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: destination path; the temporary file is created in the
            same directory.
        index_dir: directory containing ``config.GENE_FEATURE_FILE``.
        bam_file: BAM file handed to ``filter_chimeric_isoform_fraction``.
        weighted_unique_frags: threshold for ``filter_weighted_frags``.
        median_isize: passed to ``filter_chimeric_isoform_fraction``.
        max_isize: threshold for ``filter_inner_dist``.
        isoform_fraction: threshold for ``filter_chimeric_isoform_fraction``.
        false_pos_file: path of a false-positive list, or ``None``/``""``
            to disable that filter.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list; truthiness covers both None and ""
    # (the old `is not ""` compared object identity, not string value)
    if false_pos_file:
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()

    def passes_filters(c, bamfh):
        # cheap checks first; the BAM-backed check runs last
        if not filter_weighted_frags(c, weighted_unique_frags):
            return False
        if not filter_inner_dist(c, max_isize):
            return False
        false_pos_key = (c.partner5p.tx_name, c.partner5p.end,
                         c.partner3p.tx_name, c.partner3p.start)
        if false_pos_key in false_pos_pairs:
            return False
        return bool(filter_chimeric_isoform_fraction(
            c, isoform_fraction, median_isize, bamfh))

    tmp_file = None
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    try:
        try:
            # filter chimeras
            logging.debug("Checking chimeras")
            num_chimeras = 0
            num_filtered_chimeras = 0
            tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
            with open(tmp_file, "w") as tmpfh:
                with open(input_file) as infh:
                    for c in Chimera.parse(infh):
                        num_chimeras += 1
                        if passes_filters(c, bamfh):
                            tmpfh.write(
                                '\t'.join(map(str, c.to_list())) + '\n')
                            num_filtered_chimeras += 1
        finally:
            bamfh.close()
        logging.debug("Total chimeras: %d" % num_chimeras)
        logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
        # cleanup memory for false positive chimeras before the second pass
        del false_pos_pairs
        # find highest coverage chimeras among isoforms
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
        num_filtered_chimeras = 0
        with open(output_file, "w") as outfh:
            with open(tmp_file) as tmpfh:
                for c in Chimera.parse(tmpfh):
                    if c.name in kept_chimeras:
                        num_filtered_chimeras += 1
                        outfh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("\tAfter choosing best isoform: %d" %
                      num_filtered_chimeras)
    finally:
        # never leave the intermediate file behind, even on failure
        if (tmp_file is not None) and os.path.exists(tmp_file):
            os.remove(tmp_file)
    return config.JOB_SUCCESS
def filter_homologous_genes(input_file, output_file, index_dir,
                            homolog_segment_length, min_isize, max_isize,
                            maxhits, num_processors, tmp_dir):
    """Remove chimeras whose 5' sequence also aligns to their 3' partner.

    Fixed-length windows of each chimera's 5' region are written to a FASTA
    file and aligned with bowtie2 against the transcriptome index.  A
    chimera is dropped when one of its windows aligns inside the 3' interval
    recorded for that same chimera.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: path receiving the chimeras that were not removed.
        index_dir: directory holding ``config.TRANSCRIPTOME_INDEX`` (the
            bowtie2 index and the matching ``.fa`` file).
        homolog_segment_length: window length of the test sequences.
        min_isize: passed to ``get_mapped_read_intervals``.
        max_isize: passed to ``get_mapped_read_intervals``.
        maxhits: value of bowtie2's ``-k`` option.
        num_processors: value of bowtie2's ``-p`` option.
        tmp_dir: directory for the intermediate FASTA and SAM files.

    Returns:
        ``config.JOB_SUCCESS``, or ``config.JOB_ERROR`` when bowtie2 exits
        with a non-zero status (the intermediate files are then left in
        ``tmp_dir``).
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))
    interval_trees_3p = collections.defaultdict(IntervalTree)
    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir,
                                  config.TRANSCRIPTOME_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    try:
        with open(fasta5p, "w") as fastafh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    start5p, end5p, start3p, end3p = \
                        get_mapped_read_intervals(c, min_isize, max_isize,
                                                  homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[c.tx_name_3p].insert_interval(
                        Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(c.tx_name_5p, start5p, end5p)
                    # NOTE(review): this range stops one short of the final
                    # full-length window -- confirm whether that is intended
                    for i in xrange(0, len(seq5p) - homolog_segment_length):
                        fastafh.write(">%s,%s:%d-%d\n%s\n" % (
                            c.name, c.tx_name_5p, start5p + i,
                            start5p + i + homolog_segment_length,
                            seq5p[i:i + homolog_segment_length]))
    finally:
        ref_fa.close()
    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    bowtie2_index = os.path.join(index_dir, config.TRANSCRIPTOME_INDEX)
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [config.BOWTIE2_BIN, '-p', num_processors, '--phred33',
            '--end-to-end', '--very-sensitive', '--reorder',
            '-f', '-k', maxhits, '-x', bowtie2_index, '-U', fasta5p,
            "-S", sam5p]
    retcode = subprocess.call([str(arg) for arg in args])
    if retcode != 0:
        return config.JOB_ERROR
    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    # one handle serves both the header lookup and the record iteration
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras
            rname = tid_rname_map[r.tid]
            if rname not in interval_trees_3p:
                continue
            # get chimera name from 'qname'
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s between %s and %s" %
                                  (c.name, c.gene_name_5p, c.gene_name_3p))
                    continue
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS
def chimeras_to_breakpoints(input_file, breakpoint_map_file, breakpoint_fasta_file):
    """Write breakpoint FASTA and map files for the chimeras in input_file.

    NOTE(review): this function looks like an unfinished extraction from
    determine_chimera_breakpoints.  It reads ``breakpoints`` and closes
    ``chimerafh``, neither of which is defined in this function, so unless
    they exist as module-level names it raises NameError.  The nesting below
    was reconstructed from statement order -- confirm against history.
    """
    # now extract the unique junction sequences
    # and write them to a fasta file
    breakpointfh = open(breakpoint_map_file, "w")
    fasta_output_fh = open(breakpoint_fasta_file, "w")
    for c in Chimera.parse(open(input_file)):
        # NOTE(review): `c` is not used inside the loop, and `breakpoints`
        # is never built here
        for seq,b in breakpoints.iteritems():
            # write to fasta file
            print >>fasta_output_fh, ">%s\n%s" % (b.name, seq)
            # write to breakpoint map file
            fields = b.to_list()
            print >>breakpointfh, '\t'.join(map(str, fields))
    # close files
    fasta_output_fh.close()
    breakpointfh.close()
    # NOTE(review): `chimerafh` is never opened in this function
    chimerafh.close()


def main():
    """Command-line entry point for chimeras_to_breakpoints.

    NOTE(review): this definition is replaced by the second ``main`` defined
    right after it, so it is never the one invoked by the script guard.
    The usage string also lacks the closing ``>`` after ``breakpoints.fa``.
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <chimeras.bedpe> "
                          "<breakpoints.txt> <breakpoints.fa")
    options, args = parser.parse_args()
    # positional arguments, in the order given in the usage string
    input_file = args[0]
    breakpoint_map_file = args[1]
    breakpoint_fasta_file = args[2]
    chimeras_to_breakpoints(input_file, breakpoint_map_file, breakpoint_fasta_file)


def main():
    """Command-line entry point for determine_chimera_breakpoints.

    Expects six positional arguments (index directory, read length, input
    chimera file, output chimera file, breakpoint map file, breakpoint FASTA
    file) and the optional ``--homology-mismatches`` integer option.
    """
    from optparse import OptionParser
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <index> <read_length> "
                          "<chimeras.txt> <chimeras.out.txt> "
                          "<breakpoints.txt> <breakpoints.fa>")
    parser.add_option("--homology-mismatches", type="int",
                      dest="homology_mismatches",
                      default=config.BREAKPOINT_HOMOLOGY_MISMATCHES,
                      help="Number of mismatches to tolerate when computing "
                      "homology between gene and its chimeric partner "
                      "[default=%default]")
    options, args = parser.parse_args()
    # positional arguments, in the order given in the usage string
    index_dir = args[0]
    read_length = int(args[1])
    input_chimera_file = args[2]
    output_chimera_file = args[3]
    breakpoint_map_file = args[4]
    breakpoint_fasta_file = args[5]
    determine_chimera_breakpoints(index_dir,
                                  read_length,
                                  input_chimera_file,
                                  output_chimera_file,
                                  breakpoint_map_file,
                                  breakpoint_fasta_file,
                                  homology_mismatches=options.homology_mismatches)


# run the (second) main() only when executed as a script
if __name__ == '__main__':
    main()
def determine_chimera_breakpoints(index_dir, read_length,
                                  input_chimera_file, output_chimera_file,
                                  breakpoint_map_file, breakpoint_fasta_file,
                                  homology_mismatches=DEFAULT_HOMOLOGY_MISMATCHES):
    """Annotate chimeras with breakpoint names and homology lengths.

    For each chimera the junction sequence is fetched from the reference
    (up to ``read_length - 1`` bases on each side, padded with ``N`` when
    shorter).  Chimeras with an identical junction sequence share one
    ``Breakpoint``.  The annotated chimeras, a breakpoint map and a
    breakpoint FASTA file are written.

    Args:
        index_dir: directory containing ``config.ALIGN_INDEX + ".fa"``.
        read_length: read length used to size the junction sequence.
        input_chimera_file: chimera file readable by ``Chimera.parse``.
        output_chimera_file: path for the chimeras with breakpoint fields
            filled in (tab-separated ``to_list()`` output).
        breakpoint_map_file: path for one tab-separated
            ``Breakpoint.to_list()`` line per unique junction sequence.
        breakpoint_fasta_file: path for one FASTA record per unique
            junction sequence.
        homology_mismatches: mismatch tolerance passed to ``calc_homology``.
    """
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    opened = [ref_fa]
    try:
        # output files
        chimerafh = open(output_chimera_file, "w")
        opened.append(chimerafh)
        breakpointfh = open(breakpoint_map_file, "w")
        opened.append(breakpointfh)
        fasta_output_fh = open(breakpoint_fasta_file, "w")
        opened.append(fasta_output_fh)
        # junction sequence -> Breakpoint (a plain dict: membership is
        # always tested before lookup, so no default factory is needed)
        breakpoints = {}
        breaknum = 0
        with open(input_chimera_file) as infh:
            for c in Chimera.parse(infh):
                # retrieve transcript coordinates of 5' and 3' partners
                ref5p = config.GENE_REF_PREFIX + c.partner5p.tx_name
                ref3p = config.GENE_REF_PREFIX + c.partner3p.tx_name
                start5p, end5p = c.partner5p.start, c.partner5p.end
                start3p, end3p = c.partner3p.start, c.partner3p.end
                # get intervals for breakpoint sequence
                breakpoint_start5p = max(start5p, end5p - read_length + 1)
                breakpoint_end3p = min(end3p, start3p + read_length - 1)
                # fetch sequence
                seq5p = ref_fa.fetch(ref5p, breakpoint_start5p, end5p)
                seq3p = ref_fa.fetch(ref3p, start3p, breakpoint_end3p)
                if len(seq5p) < read_length - 1:
                    logging.warning("Could not extract sequence of length >%d from "
                                    "5' partner of chimera %s, only retrieved "
                                    "sequence of %d" %
                                    (read_length-1, c.name, len(seq5p)))
                    # pad on the left so the junction stays at the same offset
                    padding = (read_length - 1) - len(seq5p)
                    seq5p = ("N" * padding) + seq5p
                if len(seq3p) < read_length - 1:
                    logging.warning("Could not extract sequence of length >%d from "
                                    "3' partner of chimera %s, only retrieved "
                                    "sequence of %d" %
                                    (read_length-1, c.name, len(seq3p)))
                    # pad on the right so the junction stays at the same offset
                    padding = (read_length - 1) - len(seq3p)
                    seq3p = seq3p + ("N" * padding)
                # fetch continuation sequence of non-fusion gene
                homolog_end5p = end5p + read_length - 1
                homolog_start3p = max(0, start3p - read_length + 1)
                homolog5p = ref_fa.fetch(ref3p, homolog_start3p, start3p)
                homolog3p = ref_fa.fetch(ref5p, end5p, homolog_end5p)
                # find homology between 5' gene and 3' gene; the 5' side is
                # reversed so both comparisons start at the junction
                homology_length_5p = calc_homology(seq5p[::-1],
                                                   homolog5p[::-1],
                                                   homology_mismatches)
                homology_length_3p = calc_homology(seq3p, homolog3p,
                                                   homology_mismatches)
                # create a Breakpoint and add to dictionary
                seq = seq5p + seq3p
                if seq in breakpoints:
                    b = breakpoints[seq]
                else:
                    b = Breakpoint()
                    b.name = "B%07d" % (breaknum)
                    breaknum += 1
                    b.seq5p = seq5p
                    b.seq3p = seq3p
                    breakpoints[seq] = b
                # group fusion candidates together if they have the same
                # junction sequence
                b.chimera_names.append(c.name)
                # update Chimera object with breakpoint information
                c.breakpoint_name = b.name
                c.breakpoint_homology_5p = homology_length_5p
                c.breakpoint_homology_3p = homology_length_3p
                # write Chimera (serialised once; the old code called
                # to_list() twice and discarded the first result)
                fields = c.to_list()
                chimerafh.write('\t'.join(map(str, fields)) + '\n')
        # now extract the unique junction sequences
        # and write them to a fasta file
        for seq, b in breakpoints.iteritems():
            # write to fasta file
            fasta_output_fh.write(">%s\n%s\n" % (b.name, seq))
            # write to breakpoint map file
            breakpointfh.write('\t'.join(map(str, b.to_list())) + '\n')
    finally:
        # close files in reverse order of opening, also on failure
        for handle in reversed(opened):
            handle.close()
def make_chimera(cluster_pair, cluster_shelve, transcript_dict,
                 genome_tx_trees, annotation_source):
    """Build a ``Chimera`` from a pair of 5'/3' clusters.

    Args:
        cluster_pair: object providing ``id5p``, ``id3p``, ``pair_id``,
            ``qnames`` and ``spanning_qnames``.
        cluster_shelve: mapping from the string form of a cluster id to the
            cluster object.
        transcript_dict: passed through to ``lookup_transcripts`` and
            ``get_chimera_type``.
        genome_tx_trees: passed through to ``lookup_transcripts`` and
            ``get_chimera_type``.
        annotation_source: passed through to ``get_transcript_info``.

    Returns:
        The populated ``Chimera`` instance.
    """
    # resolve both clusters and the transcripts they touch
    five = cluster_shelve[str(cluster_pair.id5p)]
    three = cluster_shelve[str(cluster_pair.id3p)]
    txs5p = lookup_transcripts(five, transcript_dict, genome_tx_trees)
    txs3p = lookup_transcripts(three, transcript_dict, genome_tx_trees)
    # classify the chimera and measure the distance between partners
    kind, dist = get_chimera_type(five, three, txs5p, txs3p,
                                  transcript_dict, genome_tx_trees)
    # transcript names, gene names and biotypes for each side
    tx_names_5p, gene_names_5p, biotypes_5p = get_transcript_info(
        txs5p, annotation_source)
    tx_names_3p, gene_names_3p, biotypes_3p = get_transcript_info(
        txs3p, annotation_source)
    # a fragment counts once even if it is both discordant and spanning
    supporting = set(cluster_pair.qnames).union(cluster_pair.spanning_qnames)
    chimera = Chimera()
    # attributes are applied in the listed order
    attributes = [
        ("rname5p", five.rname),
        ("start5p", five.start),
        ("end5p", five.end),
        ("rname3p", three.rname),
        ("start3p", three.start),
        ("end3p", three.end),
        ("chimera_id", "CHIMERA%d" % (cluster_pair.pair_id)),
        ("num_frags", len(supporting)),
        ("strand5p", five.strand),
        ("strand3p", three.strand),
        ("chimera_type", kind),
        ("distance", dist),
        ("num_discordant_frags", len(cluster_pair.qnames)),
        ("num_spanning_frags", len(cluster_pair.spanning_qnames)),
        ("num_discordant_frags_5p", len(five.qnames)),
        ("num_discordant_frags_3p", len(three.qnames)),
        ("num_concordant_frags_5p", five.concordant_frags),
        ("num_concordant_frags_3p", three.concordant_frags),
        ("biotypes_5p", sorted(biotypes_5p)),
        ("biotypes_3p", sorted(biotypes_3p)),
        ("genes_5p", sorted(gene_names_5p)),
        ("genes_3p", sorted(gene_names_3p)),
        ("transcripts_5p", sorted(tx_names_5p)),
        ("transcripts_3p", sorted(tx_names_3p)),
    ]
    for attr_name, attr_value in attributes:
        setattr(chimera, attr_name, attr_value)
    return chimera
def filter_homologous_genes(input_file, output_file, index_dir,
                            homolog_segment_length, min_isize, max_isize,
                            bowtie_bin, num_processors, tmp_dir):
    """Remove chimeras whose 5' sequence also aligns to their 3' partner.

    Fixed-length windows of each chimera's 5' region are written to a FASTA
    file and aligned with bowtie against the alignment index.  A chimera is
    dropped when one of its windows aligns inside the 3' interval recorded
    for that same chimera.

    Args:
        input_file: chimera file readable by ``Chimera.parse``.
        output_file: path receiving the chimeras that were not removed.
        index_dir: directory holding ``config.ALIGN_INDEX`` (the bowtie
            index and the matching ``.fa`` file).
        homolog_segment_length: window length of the test sequences.
        min_isize: passed to ``get_mapped_read_intervals``.
        max_isize: passed to ``get_mapped_read_intervals``.
        bowtie_bin: path of the bowtie executable.
        num_processors: value of bowtie's ``-p`` option.
        tmp_dir: directory for the intermediate FASTA and SAM files.

    Returns:
        ``config.JOB_SUCCESS``, or ``config.JOB_ERROR`` when bowtie exits
        with a non-zero status (the intermediate files are then left in
        ``tmp_dir``).
    """
    logging.debug("Parameters")
    logging.debug("\thomolog segment length: %d" % (homolog_segment_length))
    logging.debug("\tmin fragment size: %d" % (min_isize))
    logging.debug("\tmax fragment size: %d" % (max_isize))
    bowtie_index = os.path.join(index_dir, config.ALIGN_INDEX)
    interval_trees_3p = collections.defaultdict(IntervalTree)
    # generate FASTA file of sequences to use in mapping
    logging.debug("Generating homologous sequences to test")
    fasta5p = os.path.join(tmp_dir, "homologous_5p.fa")
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    try:
        with open(fasta5p, "w") as fastafh:
            with open(input_file) as infh:
                for c in Chimera.parse(infh):
                    # reference names carry the gene prefix
                    tx_name_5p = config.GENE_REF_PREFIX + c.tx_name_5p
                    tx_name_3p = config.GENE_REF_PREFIX + c.tx_name_3p
                    start5p, end5p, start3p, end3p = \
                        get_mapped_read_intervals(c, min_isize, max_isize,
                                                  homolog_segment_length)
                    # add 3' gene to interval trees
                    interval_trees_3p[tx_name_3p].insert_interval(
                        Interval(start3p, end3p, value=c.name))
                    # extract sequence of 5' gene
                    seq5p = ref_fa.fetch(tx_name_5p, start5p, end5p)
                    # NOTE(review): this range stops one short of the final
                    # full-length window -- confirm whether that is intended
                    for i in xrange(0, len(seq5p) - homolog_segment_length):
                        fastafh.write(">%s,%s:%d-%d\n%s\n" % (
                            c.name, c.tx_name_5p, start5p + i,
                            start5p + i + homolog_segment_length,
                            seq5p[i:i + homolog_segment_length]))
    finally:
        ref_fa.close()
    # map 5' sequences to reference using bowtie
    logging.debug("Mapping homologous sequences")
    sam5p = os.path.join(tmp_dir, "homologous_5p.sam")
    args = [bowtie_bin, "-p", num_processors, "-f", "-a", "-m", 100,
            "-y", "-v", 3, "-S", bowtie_index, fasta5p, sam5p]
    retcode = subprocess.call([str(arg) for arg in args])
    if retcode != 0:
        return config.JOB_ERROR
    # analyze results for homologous genes
    logging.debug("Analyzing mapping results")
    homologous_chimeras = set()
    # one handle serves both the header lookup and the record iteration
    samfh = pysam.Samfile(sam5p, "r")
    try:
        tid_rname_map = dict(enumerate(samfh.references))
        for r in samfh:
            if r.is_unmapped:
                continue
            # reference name must be in list of 3' chimeras
            rname = tid_rname_map[r.rname]
            if rname not in interval_trees_3p:
                continue
            # get chimera name from 'qname'
            chimera_name = r.qname.split(",")[0]
            for hit in interval_trees_3p[rname].find(r.pos, r.aend):
                if hit.value == chimera_name:
                    homologous_chimeras.add(chimera_name)
    finally:
        samfh.close()
    # write output
    logging.debug("Writing output")
    with open(output_file, "w") as outfh:
        with open(input_file) as infh:
            for c in Chimera.parse(infh):
                if c.name in homologous_chimeras:
                    logging.debug("Removing homologous chimera %s between %s and %s" %
                                  (c.name, c.gene_name_5p, c.gene_name_3p))
                    continue
                outfh.write('\t'.join(map(str, c.to_list())) + '\n')
    # cleanup
    if os.path.exists(fasta5p):
        os.remove(fasta5p)
    if os.path.exists(sam5p):
        os.remove(sam5p)
    return config.JOB_SUCCESS