def confirm_novel(self, region, novel_seq, breakpoint, min_olap=10, max_unmapped=1000):
    """Count read pairs whose unmapped mate supports a novel sequence.

    Reads fetched from ``region`` that are paired and have an unmapped mate
    are collected, keyed by read name, until ``max_unmapped`` (default 1000)
    such reads have been seen.  For every collected read that points
    towards ``breakpoint``, the sequence of its unmapped mate is compared
    with ``novel_seq``.  A pair is counted when the mate sequence, or its
    reverse complement, is either entirely contained in ``novel_seq``
    (case-insensitive, literal match) or overlaps it by at least
    ``min_olap`` bases as reported by ``seq_overlap``.

    region       -- indexable; region[0], region[1], region[2] are handed
                    to ``self.bam.fetch`` and region[1]/region[2] are
                    compared with ``breakpoint``
    novel_seq    -- novel sequence the unmapped mates should support
    breakpoint   -- coordinate equal to region[1] or region[2]; decides
                    which read orientation is accepted
    min_olap     -- minimum overlap for mates not fully contained
    max_unmapped -- maximum number of reads with unmapped mates examined

    Returns the number of supporting pairs (0 if none were found).
    """
    chrom, start, end = region[0], region[1], region[2]

    # capture reads with unmapped mates
    unmapped = {}
    # number of qualifying reads seen so far, for not going over the maximum
    count = 0
    for read in self.bam.fetch(chrom, start, end):
        if read.is_paired and read.mate_is_unmapped:
            count += 1
            unmapped[read.qname] = read
            if count == max_unmapped:
                break

    if not unmapped:
        return 0

    # this assumes unmapped mates are stored under the same location as
    # their mapped mates
    mate_seq = self.get_unmapped_mate_seq(chrom, list(unmapped.values()))

    # sequences are matched literally: read sequences may contain characters
    # that are regular-expression metacharacters (e.g. '*' or '.'), so a
    # plain case-folded substring test is used instead of re.search()
    novel_upper = novel_seq.upper()

    found_mates = 0
    for read in unmapped.values():
        # make sure the anchored read is pointing towards the breakpoint
        if (breakpoint == start and not read.is_reverse) or \
           (breakpoint == end and read.is_reverse):
            continue

        if read.qname not in mate_seq:
            continue
        mate = mate_seq[read.qname]

        # mate entirely embedded in the novel sequence, as is ...
        if mate.upper() in novel_upper:
            found_mates += 1
            continue

        # ... or as its reverse complement
        mate_rc = reverse_complement(mate)
        if mate_rc.upper() in novel_upper:
            found_mates += 1
            continue

        # not entirely embedded (with or without reverse complement):
        # check if it overlaps with the novel sequence at the edges and
        # keep it if the overlap is at least min_olap bases
        # (assumes seq_overlap returns the overlap length - TODO confirm)
        if seq_overlap(mate, novel_seq) >= min_olap or \
           seq_overlap(mate_rc, novel_seq) >= min_olap:
            found_mates += 1

    return found_mates
def confirm_perfect(self, target, pos, allele, strand, seq=False, min_from_end=0, expansion=0):
    """Count (or collect) reads in which every base of the target is the allele.

    Reads overlapping ``pos`` on ``target`` are fetched from ``self.bam``.
    A read supports the allele when it spans the whole interval, the
    interval lies at least ``min_from_end`` bases from either end of the
    read, and the read bases starting at ``pos[0]`` equal ``allele``
    (case-insensitive; the read bases are reverse-complemented first when
    ``strand`` is '-').

    When no fetched read is at least as long as the allele, or when no
    supporting read was found and the allele is longer than half the most
    common read length, the allele is checked column by column with
    ``self.bam.pileup`` instead: if every position of the interval is
    covered by at least one matching base, all reads contributing a
    matching base are reported.

    target       -- reference name passed to ``fetch``/``pileup``
    pos          -- mutable pair [start, end], 1-based; it is coerced to
                    integers and clamped to start >= 1 *in place*
    allele       -- expected sequence, given on the + strand
    strand       -- '-' triggers reverse-complementing before comparison
    seq          -- if true, return the supporting reads instead of a count
    min_from_end -- minimum distance of the interval from either read end
    expansion    -- if > 1, the allele is this many copies of ``allele``

    Returns a list of reads when ``seq`` is true, otherwise the number of
    supporting reads.

    This is an old method, probably needs to be re-written
    """
    def most_common_length(tally):
        # most frequent read length; ties go to the longer length so the
        # result does not depend on dictionary ordering
        return max(tally.items(), key=lambda item: (item[1], item[0]))[0]

    # make sure coordinates are integers and start from 1; the caller's
    # ``pos`` is updated in place, as the method has always done
    pos[0] = max(int(pos[0]), 1)
    pos[1] = int(pos[1])
    start, end = pos[0], pos[1]

    # re-construct allele if this is a repeat expansion; the repeat unit
    # lives in its own local so that the ``seq`` flag is left untouched
    if expansion > 1:
        unit = allele
        copies = 1
        while copies < expansion:
            allele += unit
            copies += 1
    allele_len = len(allele)
    allele_lower = allele.lower()

    num_reads = 0
    reads = []
    # read length -> number of reads spanning the interval
    read_lens = {}
    # becomes True once any fetched read is at least as long as the allele
    shorter_than_read = False

    for read in self.bam.fetch(target, start, end + 1):
        if allele_len <= read.rlen:
            shorter_than_read = True

        # only reads spanning the whole interval can confirm it directly
        if read.pos + 1 > start or read.pos + read.rlen < end:
            continue
        read_lens[read.rlen] = read_lens.get(read.rlen, 0) + 1

        offset = start - read.pos - 1
        from_end = min(offset, read.pos + read.rlen - end)
        if from_end < min_from_end:
            continue

        bases = read.seq[offset:offset + allele_len]
        # needs to reverse-complement bases if contig is aligned to - strand,
        # as allele is given as + strand of reference
        if strand == '-':
            bases = reverse_complement(bases)

        if bases.lower() == allele_lower:
            if seq:
                reads.append(read)
            num_reads += 1

    read_len = None
    if read_lens:
        read_len = most_common_length(read_lens)

    # none of the reads is longer than the allele (insertion), or no read
    # within pos supports it and the allele is longer than half a read
    # (2 * len > read_len is the integer form of len > read_len / 2)
    # not sure condition: len(allele) > read_len / 2?
    if not shorter_than_read or \
       (num_reads == 0 and read_len is not None and 2 * allele_len > read_len):
        allele_seq = allele
        if strand == '-':
            allele_seq = reverse_complement(allele)
        # read bases are lower-cased below, so the allele must be as well
        allele_seq = allele_seq.lower()

        # for alleles longer than read length: per-position matching coverage
        coverage = dict.fromkeys(range(start, end + 1), 0)
        reads_dict = {}

        # check if allele is covered by reads
        for pileupcolumn in self.bam.pileup(target, start - 1, end):
            # pileup columns are 0-based, the interval is 1-based
            if pileupcolumn.pos < start - 1 or pileupcolumn.pos > end - 1:
                continue

            cov = pileupcolumn.n
            for pileupread in pileupcolumn.pileups:
                alignment = pileupread.alignment
                if read_len is None:
                    read_lens[alignment.rlen] = read_lens.get(alignment.rlen, 0) + 1

                # pysam bug?
                if pileupread.qpos >= len(alignment.seq):
                    continue

                if alignment.seq[pileupread.qpos].lower() != allele_seq[pileupcolumn.pos - start + 1]:
                    cov -= 1
                else:
                    reads_dict[alignment.qname] = alignment

            coverage[pileupcolumn.pos + 1] = cov

        # determine if allele is entirely covered
        gaps = any(coverage[i] == 0 for i in range(start, end + 1))

        if read_len is None and read_lens:
            read_len = most_common_length(read_lens)

        if not gaps and read_len is not None and 2 * allele_len > read_len:
            if seq:
                reads = list(reads_dict.values())
            else:
                num_reads = len(reads_dict)

    if seq:
        return reads
    return num_reads