def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Annotate multihits in a paired-end BAM file and unmap mates that have
    more than ``max_multihits`` hits.

    Each fragment yielded by ``parse_pe_reads`` is annotated mate-by-mate
    with ``annotate_multihits``.  A mate whose hit count exceeds
    ``max_multihits`` is replaced by a single copy of its first alignment
    that is flagged as unmapped; when the other mate exceeds the limit as
    well, the mate fields of that copy are cleared too.  Mates within the
    limit are written unchanged.

    transcript_file: path to the transcript feature file
    input_bam_file: path to the BAM file to read
    output_bam_file: path to the BAM file to write
    max_multihits: largest hit count a mate may have and stay mapped

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Reading transcript features")
    # close the annotation file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    try:
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                             for reads in pe_reads]
            new_pe_reads = [[], []]
            for mate in (0, 1):
                if mate_num_hits[mate] > max_multihits:
                    # collapse all alignments of this mate into one
                    # unmapped record
                    r = copy_read(pe_reads[mate][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if mate_num_hits[1 - mate] > max_multihits:
                        # the other mate is being unmapped as well
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[mate] = [r]
                else:
                    new_pe_reads[mate] = pe_reads[mate]
            # write the filtered reads; writing pe_reads here would
            # silently discard the filtering done above
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments", num_frags)
    finally:
        # release both BAM handles even if processing fails part-way
        inbamfh.close()
        outbamfh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Annotate multihits in a paired-end BAM file and unmap mates that have
    more than ``max_multihits`` hits.

    Each fragment yielded by ``parse_pe_reads`` is annotated mate-by-mate
    with ``annotate_multihits``.  A mate whose hit count exceeds
    ``max_multihits`` is replaced by a single copy of its first alignment
    that is flagged as unmapped; when the other mate exceeds the limit as
    well, the mate fields of that copy are cleared too.  Mates within the
    limit are written unchanged.

    transcript_file: path to the transcript feature file
    input_bam_file: path to the BAM file to read
    output_bam_file: path to the BAM file to write
    max_multihits: largest hit count a mate may have and stay mapped

    Returns ``config.JOB_SUCCESS``.
    """
    logging.debug("Reading transcript features")
    # close the annotation file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    try:
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                             for reads in pe_reads]
            new_pe_reads = [[], []]
            for mate in (0, 1):
                if mate_num_hits[mate] > max_multihits:
                    # collapse all alignments of this mate into one
                    # unmapped record
                    r = copy_read(pe_reads[mate][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if mate_num_hits[1 - mate] > max_multihits:
                        # the other mate is being unmapped as well
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[mate] = [r]
                else:
                    new_pe_reads[mate] = pe_reads[mate]
            # write the filtered reads; writing pe_reads here would
            # silently discard the filtering done above
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments", num_frags)
    finally:
        # release both BAM handles even if processing fails part-way
        inbamfh.close()
        outbamfh.close()
    return config.JOB_SUCCESS
def find_discordant_pairs(pe_reads, library_type):
    """
    Build candidate discordant pairs from the alignments of both mates.

    classify_unpaired_reads splits each mate's alignments into 5' and 3'
    gene hits.  Every 5' hit of one mate is combined with every 3' hit of
    the other mate; both alignments are copied and the copies are passed
    to pair_reads.

    Returns a list of (read1, read2) tuples.
    """
    mate1_5p, mate1_3p = classify_unpaired_reads(pe_reads[0], library_type)
    mate2_5p, mate2_3p = classify_unpaired_reads(pe_reads[1], library_type)
    paired = []
    # a candidate joins a 5' hit from one mate with a 3' hit from the other
    for mate1_hits, mate2_hits in ((mate1_5p, mate2_3p),
                                   (mate1_3p, mate2_5p)):
        for aln1 in mate1_hits:
            for aln2 in mate2_hits:
                copy1, copy2 = copy_read(aln1), copy_read(aln2)
                pair_reads(copy1, copy2)
                paired.append((copy1, copy2))
    return paired
def convert_read(r, transcript_tid_map, library_type):
    """
    Return a new read in which a transcript alignment is re-expressed on
    the reference given by transcript_tid_map.

    Unmapped reads are returned as a plain copy.  For mapped reads the
    position and cigar are converted with convert_pos / convert_cigar;
    for negative-strand transcripts the sequence is reverse complemented,
    the qualities are reversed, the is_reverse flag is flipped and any MD
    tag is rewritten.  Existing XS and NH tags are dropped and a new XS
    tag from get_read_strand is appended.
    """
    if r.is_unmapped:
        # nothing to convert for an unmapped read
        return copy_read(r)
    # private copy of the tags; XS is re-added below, NH is dropped
    tags = collections.OrderedDict(r.tags)
    for stale_tag in ('XS', 'NH'):
        tags.pop(stale_tag, None)
    # look up the target reference, strand and exons of the transcript
    genome_tid, negstrand, exons = transcript_tid_map[r.tid]
    genome_pos, eindex, testart, toffset = convert_pos(r.pos, negstrand,
                                                       exons)
    genome_cigar, alen, _spliced = convert_cigar(r.cigar, negstrand, exons,
                                                 eindex, testart, toffset)
    if not negstrand:
        # same orientation: sequence data carries over untouched
        reverse_flag = r.is_reverse
        bases = r.seq
        quals = r.qual
    else:
        # position must refer to the left end of the alignment
        genome_pos = genome_pos - alen + 1
        reverse_flag = (not r.is_reverse)
        bases = DNA_reverse_complement(r.seq)
        quals = r.qual
        if quals is not None:
            quals = quals[::-1]
        if 'MD' in tags:
            tags['MD'] = reverse_complement_MD_tag(tags['MD'])
    tags['XS'] = get_read_strand(r.is_read2, reverse_flag, negstrand,
                                 library_type)
    # assemble the converted read (assignment order kept as in the
    # original implementation)
    a = pysam.AlignedRead()
    a.qname = r.qname
    a.flag = r.flag
    a.seq = bases
    a.qual = quals
    a.is_reverse = reverse_flag
    a.tid = genome_tid
    a.pos = genome_pos
    a.cigar = genome_cigar
    a.mapq = r.mapq
    a.rnext = r.rnext
    a.pnext = r.pnext
    a.tlen = r.tlen
    a.tags = tuple(tags.iteritems())
    return a
def find_discordant_pairs(pe_reads, library_type):
    """
    Enumerate read1/read2 combinations that could form discordant pairs.

    The alignments of each mate are classified by classify_unpaired_reads
    into 5' and 3' gene hits; a 5' hit of one mate is matched with each
    3' hit of the other.  The matched alignments are copied and handed to
    pair_reads before being collected.

    Returns a list of (read1, read2) tuples.
    """
    first_5p, first_3p = classify_unpaired_reads(pe_reads[0], library_type)
    second_5p, second_3p = classify_unpaired_reads(pe_reads[1], library_type)
    results = []
    for first_hits, second_hits in [(first_5p, second_3p),
                                    (first_3p, second_5p)]:
        # lazily walk the cross product of the two hit lists
        candidates = ((a, b) for a in first_hits for b in second_hits)
        for hit1, hit2 in candidates:
            dup1 = copy_read(hit1)
            dup2 = copy_read(hit2)
            pair_reads(dup1, dup2)
            results.append((dup1, dup2))
    return results
def find_discordant_pairs(pe_reads, tid_genome_map, library_type):
    """
    Enumerate read1/read2 combinations that could form discordant pairs.

    classify_unpaired_reads splits the alignments of each mate into 5'
    gene hits, 3' gene hits and genome hits.  Gene pairs combine a 5' hit
    of one mate with a 3' hit of the other; genome pairs combine genome
    hits of both mates.  Only when neither kind of pair exists are gene
    hits combined with genome hits, and then only if cmp_orientation
    accepts the orientation tags of the two alignments.

    Returns a 3-tuple (gene_pairs, genome_pairs, mixed_pairs); either the
    first two or the last one is populated, never both.
    """
    def _paired_copies(aln1, aln2):
        # copy both alignments and link the copies as mates
        dup1 = copy_read(aln1)
        dup2 = copy_read(aln2)
        pair_reads(dup1, dup2)
        return (dup1, dup2)

    m1_5p, m1_3p, m1_genome = classify_unpaired_reads(
        pe_reads[0], tid_genome_map, library_type)
    m2_5p, m2_3p, m2_genome = classify_unpaired_reads(
        pe_reads[1], tid_genome_map, library_type)
    # 5'/3' gene combinations
    gene_pairs = []
    for hits1, hits2 in ((m1_5p, m2_3p), (m1_3p, m2_5p)):
        for aln1 in hits1:
            for aln2 in hits2:
                gene_pairs.append(_paired_copies(aln1, aln2))
    # genome/genome combinations
    genome_pairs = [_paired_copies(aln1, aln2)
                    for aln1 in m1_genome
                    for aln2 in m2_genome]
    if gene_pairs or genome_pairs:
        return gene_pairs, genome_pairs, []
    # fall back to mixing gene hits with genome hits
    mixed_pairs = []
    fallback_combos = ((m1_5p, m2_genome),
                       (m1_3p, m2_genome),
                       (m1_genome, m2_5p),
                       (m1_genome, m2_3p))
    for hits1, hits2 in fallback_combos:
        for aln1 in hits1:
            for aln2 in hits2:
                # skip combinations with incompatible orientation
                if not cmp_orientation(aln1.opt(ORIENTATION_TAG_NAME),
                                       aln2.opt(ORIENTATION_TAG_NAME)):
                    continue
                mixed_pairs.append(_paired_copies(aln1, aln2))
    return [], [], mixed_pairs
def classify_read_pairs(pe_reads, max_isize, library_type,
                        tid_genome_map, tid_tx_cluster_map):
    """
    Look at every alignment of one fragment and try to pair read1 with
    read2.  Each pair that is built here is tagged (DISCORDANT_TAG_NAME)
    with a value from the DiscordantTags class.

    Pairing is attempted in this order, stopping at the first success:
      1) both mates on the same reference (same transcript, or same
         genome reference with insert size <= max_isize)
      2) both mates in the same transcript cluster
      3) combinations produced by find_discordant_pairs after keeping
         only the best mismatch strata of each mate
    Within steps 1 and 2 concordant (strand-compatible) pairs take
    precedence over strand-discordant ones.

    Returns a tuple of three lists:
      1) (r1,r2) pairs aligning to genes
      2) (r1,r2) pairs aligning to the genome
      3) the unpaired reads, when nothing could be paired
    """
    def _pair_up(aln1, aln2, tag_value, bucket):
        # copy the alignments, file the copies and tag them as a pair
        dup1 = copy_read(aln1)
        dup2 = copy_read(aln2)
        bucket.append((dup1, dup2))
        pair_reads(dup1, dup2, [(DISCORDANT_TAG_NAME, tag_value)])

    concordant_tx_pairs, discordant_tx_pairs = [], []
    concordant_gene_pairs, discordant_gene_pairs = [], []
    concordant_genome_pairs, discordant_genome_pairs = [], []
    # library type dictates whether mates share a strand or oppose
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads,
                                                   tid_tx_cluster_map)
    #
    # step 1: mates hitting the same reference
    #
    found_pair = False
    for tid, tid_pe_reads in refdict.iteritems():
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            # only one mate aligns to this reference
            continue
        # references missing from tid_genome_map are treated as genomic
        is_genome_ref = (tid not in tid_genome_map)
        for aln1 in tid_pe_reads[0]:
            for aln2 in tid_pe_reads[1]:
                strand_match = (same_strand ==
                                (aln1.is_reverse == aln2.is_reverse))
                if is_genome_ref:
                    # genomic hits must also respect the insert size
                    isize = ((aln1.aend - aln2.pos)
                             if aln1.pos > aln2.pos
                             else (aln2.aend - aln1.pos))
                    if isize <= max_isize:
                        found_pair = True
                        if strand_match:
                            _pair_up(aln1, aln2,
                                     DiscordantTags.CONCORDANT_GENOME,
                                     concordant_genome_pairs)
                        else:
                            _pair_up(aln1, aln2,
                                     DiscordantTags.DISCORDANT_STRAND_GENOME,
                                     discordant_genome_pairs)
                else:
                    # same transcript: always pairable, strand decides
                    # whether the pair is concordant
                    found_pair = True
                    if strand_match:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.CONCORDANT_TX,
                                 concordant_tx_pairs)
                    else:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.DISCORDANT_STRAND_TX,
                                 discordant_tx_pairs)
    #
    # step 2: nothing found yet, so widen the search to transcript
    # clusters
    #
    if not found_pair:
        for cluster_id, cluster_pe_reads in clusterdict.iteritems():
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                # only one mate aligns within this cluster
                continue
            for aln1 in cluster_pe_reads[0]:
                for aln2 in cluster_pe_reads[1]:
                    strand_match = (same_strand ==
                                    (aln1.is_reverse == aln2.is_reverse))
                    found_pair = True
                    if strand_match:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.CONCORDANT_GENE,
                                 concordant_gene_pairs)
                    else:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.DISCORDANT_STRAND_GENE,
                                 discordant_gene_pairs)
    # concordant pairs win outright; transcript-level pairs are preferred
    # over cluster-level pairs
    gene_pairs = concordant_tx_pairs or concordant_gene_pairs
    if gene_pairs or concordant_genome_pairs:
        return gene_pairs, concordant_genome_pairs, []
    # otherwise accept pairs that break the strand rule but are still
    # colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_gene_pairs
    if gene_pairs or discordant_genome_pairs:
        return gene_pairs, discordant_genome_pairs, []
    #
    # step 3: the fragment is assumed to be discordant.
    #
    # TODO: now that we know that the reads are discordant, no reason
    # to keep all the mappings hanging around if there is a small subset
    # with a small number of mismatches.  is this the right thing to do
    # here?
    #
    pe_reads = (select_best_mismatch_strata(pe_reads[0]),
                select_best_mismatch_strata(pe_reads[1]))
    gene_pairs, genome_pairs, combo_pairs = find_discordant_pairs(
        pe_reads, tid_genome_map, library_type)
    if len(gene_pairs) > 0 or len(genome_pairs) > 0:
        return gene_pairs, genome_pairs, []
    elif len(combo_pairs) > 0:
        return combo_pairs, [], []
    # last resort: the mappings cannot be explained, so hand the reads
    # back unpaired for further investigation
    return [], [], pe_reads
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    Look at every alignment of one fragment and try to pair read1 with
    read2.  Pairs built from the same transcript or transcript cluster
    are tagged (DISCORDANT_TAG_NAME) with a DiscordantTags value.

    Pairing is attempted in this order, stopping at the first success:
      1) both mates on the same transcript
      2) both mates in the same transcript cluster
      3) combinations from find_discordant_pairs, reduced by
         select_best_scoring_pairs
    Within steps 1 and 2 strand-compatible pairs take precedence over
    strand-incompatible ones; both are returned in the first list.

    max_isize is accepted but not used by this implementation.

    Returns a tuple of three lists:
      1) (r1,r2) pairs found in step 1 or 2
      2) (r1,r2) pairs found in step 3
      3) the unpaired reads, when nothing could be paired
    """
    def _pair_up(aln1, aln2, tag_value, bucket):
        # copy the alignments, file the copies and tag them as a pair
        dup1 = copy_read(aln1)
        dup2 = copy_read(aln2)
        bucket.append((dup1, dup2))
        pair_reads(dup1, dup2, [(DISCORDANT_TAG_NAME, tag_value)])

    concordant_tx_pairs, discordant_tx_pairs = [], []
    concordant_cluster_pairs, discordant_cluster_pairs = [], []
    # library type dictates whether mates share a strand or oppose
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    #
    # step 1: mates hitting the same transcript
    #
    found_pair = False
    for tid, tid_pe_reads in refdict.iteritems():
        if len(tid_pe_reads[0]) == 0 or len(tid_pe_reads[1]) == 0:
            # only one mate aligns to this reference
            continue
        for aln1 in tid_pe_reads[0]:
            for aln2 in tid_pe_reads[1]:
                strand_match = (same_strand ==
                                (aln1.is_reverse == aln2.is_reverse))
                found_pair = True
                if strand_match:
                    _pair_up(aln1, aln2, DiscordantTags.CONCORDANT_TX,
                             concordant_tx_pairs)
                else:
                    # same transcript but wrong strand
                    _pair_up(aln1, aln2,
                             DiscordantTags.DISCORDANT_STRAND_TX,
                             discordant_tx_pairs)
    #
    # step 2: nothing found yet, so widen the search to transcript
    # clusters
    #
    if not found_pair:
        for cluster_id, cluster_pe_reads in clusterdict.iteritems():
            if (len(cluster_pe_reads[0]) == 0 or
                    len(cluster_pe_reads[1]) == 0):
                # only one mate aligns within this cluster
                continue
            for aln1 in cluster_pe_reads[0]:
                for aln2 in cluster_pe_reads[1]:
                    strand_match = (same_strand ==
                                    (aln1.is_reverse == aln2.is_reverse))
                    found_pair = True
                    if strand_match:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.CONCORDANT_GENE,
                                 concordant_cluster_pairs)
                    else:
                        _pair_up(aln1, aln2,
                                 DiscordantTags.DISCORDANT_STRAND_GENE,
                                 discordant_cluster_pairs)
    # strand-compatible pairs win outright; transcript-level pairs are
    # preferred over cluster-level pairs
    gene_pairs = concordant_tx_pairs or concordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    # otherwise accept pairs that break the strand rule but are still
    # colocalized on the same gene/chromosome
    gene_pairs = discordant_tx_pairs or discordant_cluster_pairs
    if gene_pairs:
        return gene_pairs, [], []
    #
    # step 3: the fragment is assumed to be discordant; enumerate the
    # valid read1/read2 combinations
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if len(pairs) > 0:
        # keep only the best scoring candidates
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    # no valid pair: the alignments are either artifacts or unsupported
    # by the current transcript annotations
    return [], [], pe_reads
def classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map):
    """
    Examine all alignments of a single fragment and pair read1 with
    read2 where possible.  Pairs that share a transcript or a transcript
    cluster are annotated (DISCORDANT_TAG_NAME) with a value from the
    DiscordantTags class.

    Search order, stopping at the first stage that yields pairs:
      1) mates aligned to the same transcript
      2) mates aligned within the same transcript cluster
      3) candidates from find_discordant_pairs, filtered through
         select_best_scoring_pairs
    In stages 1 and 2, pairs whose strands agree with the library type
    are preferred over pairs whose strands do not; either kind is
    returned in the first list.

    max_isize is accepted but not used by this implementation.

    Returns a tuple of three lists:
      1) (r1,r2) pairs from stage 1 or 2
      2) (r1,r2) pairs from stage 3
      3) the unpaired reads, when no pairing was possible
    """
    same_strand = LibraryTypes.same_strand(library_type)
    refdict, clusterdict = map_reads_to_references(pe_reads, tid_tx_map)
    tx_concordant = []
    tx_discordant = []
    cluster_concordant = []
    cluster_discordant = []
    #
    # stage 1: same transcript
    #
    found_pair = False
    for tid, ref_reads in refdict.iteritems():
        if len(ref_reads[0]) == 0 or len(ref_reads[1]) == 0:
            # one of the mates has no alignment on this reference
            continue
        for mate1 in ref_reads[0]:
            for mate2 in ref_reads[1]:
                # strands must agree with the library type
                orientations_equal = (mate1.is_reverse == mate2.is_reverse)
                found_pair = True
                dup1 = copy_read(mate1)
                dup2 = copy_read(mate2)
                if same_strand == orientations_equal:
                    tag_value = DiscordantTags.CONCORDANT_TX
                    tx_concordant.append((dup1, dup2))
                else:
                    # same transcript, unexpected strand
                    tag_value = DiscordantTags.DISCORDANT_STRAND_TX
                    tx_discordant.append((dup1, dup2))
                pair_reads(dup1, dup2, [(DISCORDANT_TAG_NAME, tag_value)])
    #
    # stage 2: same transcript cluster, only when stage 1 found nothing
    #
    if not found_pair:
        for cluster_id, cluster_reads in clusterdict.iteritems():
            if len(cluster_reads[0]) == 0 or len(cluster_reads[1]) == 0:
                # one of the mates has no alignment in this cluster
                continue
            for mate1 in cluster_reads[0]:
                for mate2 in cluster_reads[1]:
                    orientations_equal = (mate1.is_reverse ==
                                          mate2.is_reverse)
                    found_pair = True
                    dup1 = copy_read(mate1)
                    dup2 = copy_read(mate2)
                    if same_strand == orientations_equal:
                        tag_value = DiscordantTags.CONCORDANT_GENE
                        cluster_concordant.append((dup1, dup2))
                    else:
                        tag_value = DiscordantTags.DISCORDANT_STRAND_GENE
                        cluster_discordant.append((dup1, dup2))
                    pair_reads(dup1, dup2,
                               [(DISCORDANT_TAG_NAME, tag_value)])
    # strand-compatible pairs are returned without considering the
    # others; transcript-level pairs outrank cluster-level pairs
    gene_pairs = tx_concordant or cluster_concordant
    if gene_pairs:
        return gene_pairs, [], []
    # next best: pairs that violate the strand requirement but remain
    # colocalized on the same gene/chromosome
    gene_pairs = tx_discordant or cluster_discordant
    if gene_pairs:
        return gene_pairs, [], []
    #
    # stage 3: treat the fragment as discordant and build all valid
    # read1/read2 combinations
    #
    pairs = find_discordant_pairs(pe_reads, library_type)
    if len(pairs) > 0:
        # retain the best scoring pairs only
        pairs = select_best_scoring_pairs(pairs)
        return [], pairs, []
    # nothing could be paired: the alignments are either artifacts or
    # not supported by the current transcript annotations
    return [], [], pe_reads