def filter_spanning_reads(chimeras, reads,
                          anchor_min, anchor_length, anchor_mismatches,
                          library_type):
    """
    Generator yielding (chimera, DiscordantRead) tuples for mapped reads
    that pass the breakpoint alignment check.

    For every read in 'reads' that is not unmapped:
      * the read's tags are extended in place with HI=0, IH=1, NH=1, the
        discordant-gene tag and the orientation reported by
        get_orientation(read, library_type)
      * a DiscordantRead is built from the tagged read and flagged as
        spanning
      * the read is tested against every chimera with
        check_breakpoint_alignment(); one tuple is yielded per chimera
        that passes (the same DiscordantRead object is reused for each)

    chimeras: iterable of chimera objects; it is iterated once per mapped
        read, so it must be re-iterable (a list, not a one-shot iterator)
    reads: iterable of alignment objects exposing 'is_unmapped' and 'tags'
    anchor_min, anchor_length, anchor_mismatches: forwarded unchanged to
        check_breakpoint_alignment()
    library_type: forwarded unchanged to get_orientation()

    Being a generator, no read is touched until the result is iterated.
    """
    # iterate the reads directly: the positional index was never used
    for r in reads:
        if r.is_unmapped:
            continue
        # make a discordant read object
        # TODO: need to annotate reads elsewhere since they have already
        # been sorted here
        r.tags = r.tags + [("HI", 0),
                           ("IH", 1),
                           ("NH", 1),
                           (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
                           (ORIENTATION_TAG_NAME, get_orientation(r, library_type))]
        dr = DiscordantRead.from_read(r)
        dr.is_spanning = True
        # check read alignment against chimeras
        for c in chimeras:
            if check_breakpoint_alignment(c, r,
                                          anchor_min,
                                          anchor_length,
                                          anchor_mismatches):
                # valid spanning read
                yield c, dr
def extract_single_mapped_reads(chimera_file,
                                unmapped_bam_file,
                                single_mapped_bam_file,
                                unmapped_fastq_file,
                                library_type,
                                tmp_dir):
    """
    Split the paired reads of 'unmapped_bam_file' into reads that must be
    remapped and mapped reads annotated with their unmapped mate.

    Pairs are read with parse_pe_reads(). For each pair:
      * if both mates contain an unmapped record, the first record of each
        mate is written to 'unmapped_fastq_file' via to_fastq()
      * otherwise every alignment of the mapped mate is written to a
        temporary BAM in 'tmp_dir' with three extra tags: 'R2' / 'Q2'
        holding the sequence / quality of the other mate's first record,
        and the orientation from get_orientation(read, library_type)

    The temporary BAM is then sorted with pysam.sort() into the prefix
    obtained by stripping the extension of 'single_mapped_bam_file',
    indexed with pysam.index(), and the temporary file is removed.

    chimera_file: not used by this function; kept for interface
        compatibility with callers
    Returns config.JOB_SUCCESS.

    NOTE(review): pysam.index() is called on 'single_mapped_bam_file'
    itself, so this presumably relies on the sort writing '<prefix>.bam'
    and on the file name ending in '.bam' -- confirm against callers.
    """
    unsorted_single_mapped_bam_file = os.path.join(tmp_dir, "unsorted_single_mapped_reads.bam")
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    bamfh = None
    singlemap_bamfh = None
    try:
        # annotate mapped reads with sequence/quality of unmapped mate
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb", template=bamfh)
        for pe_reads in parse_pe_reads(bamfh):
            # find which of the original reads was unmapped
            r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
            r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
            # if both reads unmapped, then remap to breakpoints
            if r1_unmapped and r2_unmapped:
                for readnum in (0, 1):
                    record = to_fastq(pe_reads[readnum][0].qname, readnum,
                                      pe_reads[readnum][0].seq,
                                      pe_reads[readnum][0].qual)
                    # same output as the former "print >>fqfh" statement:
                    # the record followed by a newline
                    fqfh.write(str(record) + "\n")
            else:
                # annotate the mapped reads with the seq/qual of the
                # unmapped reads
                mapped_readnum = 0 if r2_unmapped else 1
                unmapped_readnum = 1 if r2_unmapped else 0
                unmapped_seq = pe_reads[unmapped_readnum][0].seq
                unmapped_qual = pe_reads[unmapped_readnum][0].qual
                for r in pe_reads[mapped_readnum]:
                    orientation = get_orientation(r, library_type)
                    # TODO: may need to REVERSE read here to get original
                    r.tags = r.tags + [("R2", unmapped_seq),
                                       ("Q2", unmapped_qual),
                                       (ORIENTATION_TAG_NAME, orientation)]
                    singlemap_bamfh.write(r)
    finally:
        # release every handle even if parsing/annotation fails; the output
        # BAM must be closed before it is sorted below
        if singlemap_bamfh is not None:
            singlemap_bamfh.close()
        if bamfh is not None:
            bamfh.close()
        fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file, single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def extract_single_mapped_reads(chimera_file,
                                unmapped_bam_file,
                                single_mapped_bam_file,
                                unmapped_fastq_file,
                                library_type,
                                tmp_dir):
    """
    Separate fully unmapped read pairs from pairs with one mapped mate.

    Read pairs come from parse_pe_reads() over 'unmapped_bam_file':
      * when both mates contain an unmapped record, the first record of
        each mate is formatted with to_fastq() and appended to
        'unmapped_fastq_file'
      * in every other case, each alignment of the mapped mate gets the
        tags 'R2' (mate sequence), 'Q2' (mate quality) and the orientation
        tag from get_orientation(read, library_type), and is written to
        an unsorted temporary BAM inside 'tmp_dir'

    Afterwards the temporary BAM is sorted (pysam.sort, "-m" 1e9) to the
    extension-less prefix of 'single_mapped_bam_file', the result is
    indexed with pysam.index(), and the temporary BAM is deleted.

    chimera_file: accepted but never read by this function
    Returns config.JOB_SUCCESS.

    NOTE(review): indexing 'single_mapped_bam_file' presumably assumes the
    sort step produces '<prefix>.bam' -- confirm with the pysam version
    in use.
    """
    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    bamfh = None
    singlemap_bamfh = None
    try:
        # annotate mapped reads with sequence/quality of unmapped mate
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file,
                                        "wb", template=bamfh)
        for pe_reads in parse_pe_reads(bamfh):
            # find which of the original reads was unmapped
            r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
            r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
            # if both reads unmapped, then remap to breakpoints
            if r1_unmapped and r2_unmapped:
                for readnum in (0, 1):
                    first = pe_reads[readnum][0]
                    record = to_fastq(first.qname, readnum,
                                      first.seq, first.qual)
                    # equivalent to the old "print >> fqfh": record + newline
                    fqfh.write(str(record) + "\n")
            else:
                # annotate the mapped reads with the seq/qual of the
                # unmapped reads
                mapped_readnum = 0 if r2_unmapped else 1
                unmapped_readnum = 1 if r2_unmapped else 0
                unmapped_seq = pe_reads[unmapped_readnum][0].seq
                unmapped_qual = pe_reads[unmapped_readnum][0].qual
                for r in pe_reads[mapped_readnum]:
                    orientation = get_orientation(r, library_type)
                    # TODO: may need to REVERSE read here to get original
                    r.tags = r.tags + [
                        ("R2", unmapped_seq),
                        ("Q2", unmapped_qual),
                        (ORIENTATION_TAG_NAME, orientation),
                    ]
                    singlemap_bamfh.write(r)
    finally:
        # always release the handles; the input BAM used to be left open,
        # and the output BAM has to be closed before sorting
        if singlemap_bamfh is not None:
            singlemap_bamfh.close()
        if bamfh is not None:
            bamfh.close()
        fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file,
               single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def classify_unpaired_reads(reads, library_type):
    """
    Partition 'reads' by orientation and tag each one as discordant.

    Each read's orientation is obtained from
    get_orientation(read, library_type). Reads whose orientation equals
    ORIENTATION_5P go to the first list, all others to the second. Every
    read additionally has the discordant-gene tag and its orientation tag
    appended to 'tags'.

    Returns the tuple (5' hits, 3' hits), each in input order.
    """
    hits_5p, hits_3p = [], []
    for read in reads:
        # the alignment is to a transcript (gene), so its orientation
        # decides whether it counts as a 5' or a 3' hit
        side = get_orientation(read, library_type)
        bucket = hits_5p if side == ORIENTATION_5P else hits_3p
        bucket.append(read)
        # annotate the alignment with its orientation and mark it as
        # discordant
        read.tags = read.tags + [
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG, side),
        ]
    return hits_5p, hits_3p
def classify_unpaired_reads(reads, library_type):
    """
    Sort gene-aligned reads into 5' and 3' groups, tagging them on the way.

    A read lands in the 5' group when get_orientation(read, library_type)
    equals ORIENTATION_5P and in the 3' group otherwise. Each read's
    'tags' is extended with the discordant-gene tag and the orientation
    tag.

    Returns (5' reads, 3' reads) as a pair of lists.
    """
    five_prime = []
    three_prime = []
    for aln in reads:
        # alignments are to transcripts (genes); find out which end of
        # the gene this one represents
        aln_orientation = get_orientation(aln, library_type)
        if aln_orientation != ORIENTATION_5P:
            three_prime.append(aln)
        else:
            five_prime.append(aln)
        # store the orientation and the discordant marker on the
        # alignment itself
        discordant_tag = (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE)
        orientation_tag = (ORIENTATION_TAG, aln_orientation)
        aln.tags = aln.tags + [discordant_tag, orientation_tag]
    return five_prime, three_prime
def filter_spanning_reads(chimeras, reads, anchor_min, anchor_length,
                          anchor_mismatches, library_type):
    """
    Yield (chimera, DiscordantRead) for each mapped read / chimera
    combination accepted by check_breakpoint_alignment().

    Unmapped reads are skipped. Each remaining read has HI=0, IH=1, NH=1,
    the discordant-gene tag and its orientation tag (from
    get_orientation(read, library_type)) appended to 'tags', is converted
    with DiscordantRead.from_read() and marked 'is_spanning'. The read is
    then checked against every chimera; the same DiscordantRead instance
    is yielded for every chimera that passes.

    chimeras: looped over once per mapped read, so pass a re-iterable
        collection rather than a one-shot iterator
    reads: iterable of alignments with 'is_unmapped' and 'tags'
    anchor_min, anchor_length, anchor_mismatches: handed unchanged to
        check_breakpoint_alignment()
    library_type: handed unchanged to get_orientation()

    This is a generator: tagging happens lazily while it is consumed.
    """
    # plain iteration; the enumerate() index was never used
    for r in reads:
        if r.is_unmapped:
            continue
        # make a discordant read object
        # TODO: need to annotate reads elsewhere since they have already
        # been sorted here
        r.tags = r.tags + [
            ("HI", 0),
            ("IH", 1),
            ("NH", 1),
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG_NAME, get_orientation(r, library_type)),
        ]
        dr = DiscordantRead.from_read(r)
        dr.is_spanning = True
        # check read alignment against chimeras
        for c in chimeras:
            if check_breakpoint_alignment(c, r, anchor_min, anchor_length,
                                          anchor_mismatches):
                # valid spanning read
                yield c, dr
def write_unpaired_reads(pe_reads, mate_num_hits, library_type, bamfh):
    """
    Write the alignments of the mapped mate of a half-mapped pair.

    Mate 0 is treated as the unmapped one when mate_num_hits[0] is 0,
    otherwise mate 1 is. Only the mapped mate's alignments are written to
    'bamfh'; the unmapped mate is not written. Its information is kept by
    adding the 'R2' (sequence) and 'Q2' (quality) SAM tags, taken from
    the unmapped mate's first record, to every written alignment together
    with the orientation tag from get_orientation(read, library_type).
    """
    unmapped_idx = 0 if mate_num_hits[0] == 0 else 1
    mate = pe_reads[unmapped_idx][0]
    alignments = pe_reads[1 - unmapped_idx]
    for aln in alignments:
        # determine whether the alignment is in 5' or 3' orientation
        aln_orientation = get_orientation(aln, library_type)
        # carry the mate's sequence and qualities along as tags
        mate_tags = [('R2', mate.seq),
                     ('Q2', mate.qual),
                     (ORIENTATION_TAG, aln_orientation)]
        aln.tags = aln.tags + mate_tags
        bamfh.write(aln)