def _transcript_span_to_genome(tx_name, tx_start, tx_end,
                               transcript_genome_map):
    """Convert a transcript interval to genomic coordinates.

    The start is looked up at ``tx_start`` and the end at ``tx_end - 1``
    (presumably because ``tx_end`` is exclusive -- TODO confirm).  When the
    reported strand is 1 the two genomic positions are swapped.

    Returns a ``(chrom, strand, start, end)`` tuple; ``chrom`` and
    ``strand`` are those reported for the end position.
    """
    chrom, strand, start = transcript_to_genome_pos(tx_name, tx_start,
                                                    transcript_genome_map)
    chrom, strand, end = transcript_to_genome_pos(tx_name, tx_end - 1,
                                                  transcript_genome_map)
    if strand == 1:
        start, end = end, start
    return chrom, strand, start, end


def _unique_spanning_fasta(chimera):
    """Return FASTA-style header/sequence strings for spanning reads.

    Reads whose sequence was already emitted are skipped, so each distinct
    sequence contributes exactly one header string and one sequence string.
    """
    seen_seqs = set()
    fasta_lines = []
    for dr in chimera.get_spanning_reads():
        if dr.seq in seen_seqs:
            continue
        seen_seqs.add(dr.seq)
        strand_char = "-" if dr.is_reverse else "+"
        fasta_lines.append(">%s/%d;pos=%d;strand=%s" %
                           (dr.qname, dr.readnum + 1, dr.pos, strand_char))
        fasta_lines.append(dr.seq)
    return fasta_lines


def write_output(input_file, bam_file, output_file, index_dir):
    """Cluster chimera isoforms and write them as a tab-delimited report.

    Chimeras from ``input_file`` are grouped with ``get_chimera_groups``.
    For every group the transcript intervals, gene names and chimera names
    of all members are collected, while coordinates, type, distance,
    fragment counts and isoform fractions come from the member chosen by
    ``get_best_coverage_chimera``.  Rows are sorted in descending order of
    unique alignment positions, then spanning fragments, then total
    fragments, and written to ``output_file`` below a ``#``-prefixed header.

    Parameters:
        input_file: chimera file handed to ``get_chimera_groups``.
        bam_file: BAM file used to count wild-type fragments.
        output_file: path of the report to create (overwritten).
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.

    Returns:
        ``config.JOB_SUCCESS``.

    Every file handle opened here is closed even when an error is raised.
    """
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # build lookup tables to get genome coordinates from transcript
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    tx_id_map = build_transcript_map(transcripts)
    genome_tx_trees = build_genome_transcript_trees(transcripts)
    lines = []
    chimera_clusters = 0
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    try:
        # group chimera isoforms together
        for _key, chimeras in get_chimera_groups(input_file, tx_id_map):
            txs5p = set()
            txs3p = set()
            genes5p = set()
            genes3p = set()
            names = set()
            for member in chimeras:
                txs5p.add("%s:%d-%d" % (member.tx_name_5p,
                                        member.tx_start_5p,
                                        member.tx_end_5p - 1))
                txs3p.add("%s:%d-%d" % (member.tx_name_3p,
                                        member.tx_start_3p,
                                        member.tx_end_3p - 1))
                genes5p.add(member.gene_name_5p)
                genes3p.add(member.gene_name_3p)
                names.add(member.name)
            # the best-coverage member represents the whole cluster
            c = get_best_coverage_chimera(chimeras)
            # get chimera type and distance between genes
            chimera_type, distance = get_chimera_type(
                tx_id_map[c.tx_name_5p],
                tx_id_map[c.tx_name_3p],
                genome_tx_trees)
            # get genomic positions of chimera
            chrom5p, strand5p, start5p, end5p = _transcript_span_to_genome(
                c.tx_name_5p, c.tx_start_5p, c.tx_end_5p,
                transcript_genome_map)
            chrom3p, strand3p, start3p, end3p = _transcript_span_to_genome(
                c.tx_name_3p, c.tx_start_3p, c.tx_end_3p,
                transcript_genome_map)
            # get breakpoint spanning sequences
            spanning_fasta_lines = _unique_spanning_fasta(c)
            # get isoform fraction
            num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)
            num_chimeric_frags = c.get_num_frags()
            frac5p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_5p))
            frac3p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_3p))
            # setup fields of BEDPE file; the column order must match the
            # header written below and the sort indexes (18, 17, 16)
            fields = [chrom5p, start5p, end5p,
                      chrom3p, start3p, end3p,
                      "CLUSTER%d" % (chimera_clusters),
                      num_chimeric_frags,
                      "+" if (strand5p == 0) else "-",
                      "+" if (strand3p == 0) else "-",
                      ','.join(txs5p),
                      ','.join(txs3p),
                      ','.join(genes5p),
                      ','.join(genes3p),
                      chimera_type, distance,
                      num_chimeric_frags,
                      c.get_num_spanning_frags(),
                      c.get_num_unique_positions(),
                      frac5p, frac3p,
                      ','.join(spanning_fasta_lines),
                      ','.join(names)]
            lines.append(fields)
            chimera_clusters += 1
    finally:
        # release the BAM handle even if a cluster failed to process
        bamfh.close()
    logging.debug("Clustered chimeras: %d" % (chimera_clusters))
    # sort by unique positions, spanning frags, total frags (descending)
    lines.sort(key=operator.itemgetter(18, 17, 16), reverse=True)
    header = ['#chrom5p', 'start5p', 'end5p',
              'chrom3p', 'start3p', 'end3p',
              'chimera_cluster_id', 'score',
              'strand5p', 'strand3p',
              'transcript_ids_5p', 'transcript_ids_3p',
              'genes5p', 'genes3p',
              'type', 'distance',
              'total_frags',
              'spanning_frags',
              'unique_alignment_positions',
              'isoform_fraction_5p',
              'isoform_fraction_3p',
              'breakpoint_spanning_reads',
              'chimera_ids']
    with open(output_file, "w") as f:
        f.write('\t'.join(header) + '\n')
        for fields in lines:
            f.write('\t'.join(map(str, fields)) + '\n')
    return config.JOB_SUCCESS
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    """Turn discordant read pairs into chimera candidates, one per line.

    Discordant fragments from ``input_file`` are consumed per transcript
    pair.  Each fragment is assigned to the breakpoints returned by
    ``choose_best_breakpoints``; for every resulting breakpoint the
    junction sequence is extracted from the transcriptome FASTA and one
    tab-separated, BEDPE-style record is written to ``output_file``.
    Identical breakpoint sequences share a ``B%07d`` identifier, while each
    record gets its own ``C%07d`` chimera identifier.

    Parameters:
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE`` and
            ``config.TRANSCRIPTOME_FASTA_FILE``.
        isize_dist_file: insert size distribution file.
        input_file: discordant read pairs in BEDPE format.
        output_file: path of the candidate file to create (overwritten).
        trim_bp: forwarded to ``choose_best_breakpoints``.
        max_read_length: forwarded to ``extract_breakpoint_sequence``.
        homology_mismatches: forwarded to ``extract_breakpoint_sequence``.

    Returns:
        ``config.JOB_SUCCESS``.

    Every file handle opened here is closed even when an error is raised.
    """
    # read insert size distribution
    # NOTE(review): assumes from_file() consumes the handle before
    # returning -- confirm it does not keep the file for lazy reads
    with open(isize_dist_file) as isize_fh:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_feature_file) as feature_fh:
        transcripts = list(TranscriptFeature.parse(feature_fh))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    try:
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for tx_id_5p, tx_id_3p, frags in \
                        parse_discordant_bedpe_by_transcript_pair(infh):
                    # get gene information
                    tx5p = tx_id_map[tx_id_5p]
                    tx3p = tx_id_map[tx_id_3p]
                    # these depend only on the transcript pair, so compute
                    # them once instead of once per breakpoint
                    tx3p_length = sum((end - start)
                                      for start, end in tx3p.exons)
                    gene_names_5p = ",".join(sorted(set(
                        "_".join(x.split()) for x in tx5p.gene_names)))
                    gene_names_3p = ",".join(sorted(set(
                        "_".join(x.split()) for x in tx3p.gene_names)))
                    # bin fragments into putative breakpoints
                    breakpoint_dict = collections.defaultdict(list)
                    for dr5p, dr3p in frags:
                        # given the insert size find the highest probability
                        # exon junction breakpoint between the two
                        # transcripts (the probability itself is not used)
                        _isize_prob, breakpoints = choose_best_breakpoints(
                            dr5p, dr3p, tx5p, tx3p, trim_bp, isize_dist)
                        for bp in breakpoints:
                            breakpoint_dict[bp].append((dr5p, dr3p))
                    # iterate through breakpoints and build chimera
                    # candidates
                    for bp, bp_frags in breakpoint_dict.items():
                        exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = bp
                        (breakpoint_seq_5p, breakpoint_seq_3p,
                         homology_left, homology_right) = \
                            extract_breakpoint_sequence(
                                tx_id_5p, tx_end_5p,
                                tx_id_3p, tx_start_3p,
                                ref_fa, max_read_length,
                                homology_mismatches)
                        # get unique breakpoint id based on sequence
                        breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
                        breakpoint_name = \
                            breakpoint_seq_name_map.get(breakpoint_seq)
                        if breakpoint_name is None:
                            breakpoint_name = "B%07d" % (breakpoint_num)
                            breakpoint_seq_name_map[breakpoint_seq] = \
                                breakpoint_name
                            breakpoint_num += 1
                        # write gene, breakpoint, and raw reads to a file
                        # and follow the BEDPE format
                        fields = [
                            # chrom1, start1, end1
                            tx5p.tx_id, 0, tx_end_5p,
                            # chrom2, start2, end2
                            tx3p.tx_id, tx_start_3p, tx3p_length,
                            # name
                            "C%07d" % (chimera_num),
                            # pvalue
                            1.0,
                            # strand1, strand2
                            tx5p.strand, tx3p.strand,
                            # gene names
                            gene_names_5p, gene_names_3p,
                            # exon interval information
                            '%d-%d' % (0, exon_num_5p),
                            '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                            # breakpoint information
                            breakpoint_name,
                            breakpoint_seq_5p, breakpoint_seq_3p,
                            homology_left, homology_right,
                            # fragments
                            frags_to_encomp_string(bp_frags),
                            # spanning reads (none yet; written as "None")
                            None,
                        ]
                        outfh.write('\t'.join(map(str, fields)) + '\n')
                        chimera_num += 1
    finally:
        # release the FASTA handle even if processing failed part-way
        ref_fa.close()
    return config.JOB_SUCCESS
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    """Turn discordant read pairs into chimera candidates, one per line.

    Discordant fragments from ``input_file`` are consumed per transcript
    pair.  Each fragment is assigned to the breakpoints returned by
    ``choose_best_breakpoints``; for every resulting breakpoint the
    junction sequence is extracted from the transcriptome FASTA and one
    tab-separated, BEDPE-style record is written to ``output_file``.
    Identical breakpoint sequences share a ``B%07d`` identifier, while each
    record gets its own ``C%07d`` chimera identifier.

    Parameters:
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE`` and
            ``config.TRANSCRIPTOME_FASTA_FILE``.
        isize_dist_file: insert size distribution file.
        input_file: discordant read pairs in BEDPE format.
        output_file: path of the candidate file to create (overwritten).
        trim_bp: forwarded to ``choose_best_breakpoints``.
        max_read_length: forwarded to ``extract_breakpoint_sequence``.
        homology_mismatches: forwarded to ``extract_breakpoint_sequence``.

    Returns:
        ``config.JOB_SUCCESS``.

    Every file handle opened here is closed even when an error is raised.
    """
    # read insert size distribution
    # NOTE(review): assumes from_file() consumes the handle before
    # returning -- confirm it does not keep the file for lazy reads
    with open(isize_dist_file) as isize_fh:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_feature_file) as feature_fh:
        transcripts = list(TranscriptFeature.parse(feature_fh))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    try:
        with open(output_file, "w") as outfh:
            with open(input_file) as infh:
                for tx_id_5p, tx_id_3p, frags in \
                        parse_discordant_bedpe_by_transcript_pair(infh):
                    # get gene information
                    tx5p = tx_id_map[tx_id_5p]
                    tx3p = tx_id_map[tx_id_3p]
                    # these depend only on the transcript pair, so compute
                    # them once instead of once per breakpoint
                    tx3p_length = sum((end - start)
                                      for start, end in tx3p.exons)
                    gene_names_5p = ",".join(sorted(set(
                        "_".join(x.split()) for x in tx5p.gene_names)))
                    gene_names_3p = ",".join(sorted(set(
                        "_".join(x.split()) for x in tx3p.gene_names)))
                    # bin fragments into putative breakpoints
                    breakpoint_dict = collections.defaultdict(list)
                    for dr5p, dr3p in frags:
                        # given the insert size find the highest probability
                        # exon junction breakpoint between the two
                        # transcripts (the probability itself is not used)
                        _isize_prob, breakpoints = choose_best_breakpoints(
                            dr5p, dr3p, tx5p, tx3p, trim_bp, isize_dist)
                        for bp in breakpoints:
                            breakpoint_dict[bp].append((dr5p, dr3p))
                    # iterate through breakpoints and build chimera
                    # candidates
                    for bp, bp_frags in breakpoint_dict.items():
                        exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = bp
                        (breakpoint_seq_5p, breakpoint_seq_3p,
                         homology_left, homology_right) = \
                            extract_breakpoint_sequence(
                                tx_id_5p, tx_end_5p,
                                tx_id_3p, tx_start_3p,
                                ref_fa, max_read_length,
                                homology_mismatches)
                        # get unique breakpoint id based on sequence
                        breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
                        breakpoint_name = \
                            breakpoint_seq_name_map.get(breakpoint_seq)
                        if breakpoint_name is None:
                            breakpoint_name = "B%07d" % (breakpoint_num)
                            breakpoint_seq_name_map[breakpoint_seq] = \
                                breakpoint_name
                            breakpoint_num += 1
                        # write gene, breakpoint, and raw reads to a file
                        # and follow the BEDPE format
                        fields = [
                            # chrom1, start1, end1
                            tx5p.tx_id, 0, tx_end_5p,
                            # chrom2, start2, end2
                            tx3p.tx_id, tx_start_3p, tx3p_length,
                            # name
                            "C%07d" % (chimera_num),
                            # pvalue
                            1.0,
                            # strand1, strand2
                            tx5p.strand, tx3p.strand,
                            # gene names
                            gene_names_5p, gene_names_3p,
                            # exon interval information
                            '%d-%d' % (0, exon_num_5p),
                            '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                            # breakpoint information
                            breakpoint_name,
                            breakpoint_seq_5p, breakpoint_seq_3p,
                            homology_left, homology_right,
                            # fragments
                            frags_to_encomp_string(bp_frags),
                            # spanning reads (none yet; written as "None")
                            None,
                        ]
                        outfh.write('\t'.join(map(str, fields)) + '\n')
                        chimera_num += 1
    finally:
        # release the FASTA handle even if processing failed part-way
        ref_fa.close()
    return config.JOB_SUCCESS