예제 #1
0
파일: bam.py 프로젝트: jmeppley/transabyss
    def confirm_novel(self, region, novel_seq, breakpoint, min_olap=10, max_unmapped=1000):
        """Count read pairs whose unmapped mate supports a novel sequence.

        Reads in ``region`` that are paired and whose mate is unmapped are
        collected (at most ``max_unmapped`` of them).  For every such anchored
        read that points towards ``breakpoint``, the unmapped mate's sequence
        is compared with ``novel_seq``; the pair is counted when the mate (or
        its reverse complement) is fully contained in ``novel_seq`` or
        overlaps it by more than ``min_olap``.

        Args:
            region: Indexable ``(reference, start, end)``; the three items are
                passed unchanged to ``self.bam.fetch``.
            novel_seq: The novel sequence to look for.
            breakpoint: Coordinate compared against ``region[1]`` and
                ``region[2]`` to decide which read orientation is accepted.
            min_olap: Overlap threshold; the value returned by ``seq_overlap``
                must be strictly greater than this.
            max_unmapped: Stop scanning the region once this many qualifying
                reads have been seen.

        Returns:
            Number of supporting pairs (``0`` when no read in the region has
            an unmapped mate).
        """
        chrom, start, end = region[0], region[1], region[2]

        # Capture anchored reads with unmapped mates, keyed by read name.
        # ``count`` tracks reads seen (not distinct names) so the scan is
        # bounded by ``max_unmapped``.
        unmapped = {}
        count = 0
        for read in self.bam.fetch(chrom, start, end):
            if read.is_paired and read.mate_is_unmapped:
                count += 1
                unmapped[read.qname] = read
                if count == max_unmapped:
                    break

        if not unmapped:
            return 0

        # This assumes unmapped mates are stored under the same location as
        # their mapped mates.  A real list is passed so the helper behaves the
        # same on Python 2 and Python 3 (where ``values()`` is only a view).
        anchors = list(unmapped.values())
        mate_seq = self.get_unmapped_mate_seq(chrom, anchors)

        # Containment is tested as a case-insensitive literal substring match.
        # Using the read sequence as a regular-expression pattern would let
        # characters such as '.' or '*' act as wildcards or raise re.error.
        novel_lower = novel_seq.lower()

        found_mates = 0
        for read in anchors:
            # Skip anchored reads that do not point towards the breakpoint.
            if (breakpoint == start and not read.is_reverse) or \
               (breakpoint == end and read.is_reverse):
                continue

            if read.qname not in mate_seq:
                continue
            mate = mate_seq[read.qname]

            # Mate entirely embedded in the novel sequence (forward strand).
            if mate.lower() in novel_lower:
                found_mates += 1
                continue

            # Same test on the reverse complement.
            mate_rc = reverse_complement(mate)
            if mate_rc.lower() in novel_lower:
                found_mates += 1
                continue

            # Not embedded: accept an overlap at the edges of the novel
            # sequence, forward strand first, then reverse complement.
            # NOTE(review): the comparison is strict (> min_olap) although the
            # original comment said "at least"; kept as-is -- confirm intent.
            if seq_overlap(mate, novel_seq) > min_olap or \
               seq_overlap(mate_rc, novel_seq) > min_olap:
                found_mates += 1

        return found_mates
예제 #2
0
파일: bam.py 프로젝트: jmeppley/transabyss
    def confirm_perfect(self, target, pos, allele, strand, seq=False, min_from_end=0, expansion=0):
        """Find reads that match ``allele`` exactly over ``pos`` on ``target``.

        Two strategies are used.  Reads at least as long as the allele that
        span the whole interval are compared with the allele directly.  When
        no read is long enough, or nothing was collected and the allele is
        longer than half the read length, the pileup over the interval is
        examined column by column and the allele is accepted if every position
        is covered by at least one matching base.

        This is an old method and probably needs to be re-written.

        Args:
            target: Reference name passed to ``self.bam.fetch``/``pileup``.
            pos: Mutable two-item sequence ``[start, end]`` of 1-based
                coordinates.  It is normalised IN PLACE (items converted to
                ``int``, start clamped to at least 1), as in the original.
            allele: Allele sequence, given on the + strand of the reference.
            strand: ``'-'`` makes read bases (direct path) or the allele
                (pileup path) get reverse-complemented before comparison.
            seq: When truthy, return the supporting reads instead of a count.
            min_from_end: Minimum distance required between the interval and
                either end of a read (direct path only).
            expansion: When greater than 1, the allele is repeated this many
                times before comparison (repeat expansion).

        Returns:
            A list of supporting reads when ``seq`` is truthy, otherwise the
            number of supporting reads.
        """
        # Make sure coordinates are integers and start from 1.
        pos[0] = int(pos[0])
        pos[1] = int(pos[1])
        pos[0] = max(pos[0], 1)
        start, end = pos[0], pos[1]

        num_reads = 0
        reads = []
        reads_dict = {}

        # Re-construct the allele if this is a repeat expansion.  A dedicated
        # local holds the repeat unit: reusing ``seq`` here would overwrite the
        # caller's return-mode flag and make the method return reads.
        if expansion > 1:
            unit = allele
            i = 1
            while i < expansion:
                allele = allele + unit
                i = i + 1

        allele_len = len(allele)
        allele_lower = allele.lower()

        # Per-position coverage, used for alleles longer than the read length.
        coverage = dict.fromkeys(range(start, end + 1), 0)

        shorter_than_read = False
        read_lens = {}
        for read in self.bam.fetch(target, start, end + 1):
            if allele_len > read.rlen:
                continue
            shorter_than_read = True

            # Only reads spanning the whole interval can be compared directly.
            if read.pos + 1 > start or read.pos + read.rlen < end:
                continue
            read_lens[read.rlen] = read_lens.get(read.rlen, 0) + 1

            from_end = min(start - read.pos - 1, read.pos + read.rlen - end)
            offset = start - read.pos - 1
            bases = read.seq[offset:offset + allele_len]

            # Needs to reverse-complement bases if contig is aligned to the
            # - strand, as allele is given as + strand of reference.
            if strand == '-':
                bases = reverse_complement(bases)

            if from_end < min_from_end:
                continue

            if bases.lower() == allele_lower:
                if seq:
                    reads.append(read)
                num_reads += 1

        # NOTE(review): this selects the LEAST frequent read length (first
        # entry when ordered by ascending count), exactly as the original did;
        # the most frequent length may have been intended -- confirm.
        read_len = None
        if read_lens:
            read_len = min(read_lens.items(), key=operator.itemgetter(1))[0]

        # None of the reads is longer than the allele (insertion), or no read
        # was collected within pos.  ``2 * len > read_len`` is the integer-safe
        # form of ``len > read_len / 2``.
        # NOTE(review): ``reads`` is only filled when ``seq`` is truthy, so
        # ``not reads`` is always true in counting mode -- confirm intent.
        if not shorter_than_read or \
           (not reads and read_len is not None and 2 * allele_len > read_len):
            allele_seq = allele
            if strand == '-':
                allele_seq = reverse_complement(allele)
            # Read bases are lower-cased before comparison, so the allele must
            # be too; otherwise an upper-case allele can never match.
            allele_seq = allele_seq.lower()

            # Check if allele is covered by reads.
            for pileupcolumn in self.bam.pileup(target, start - 1, end):
                if pileupcolumn.pos < start - 1 or pileupcolumn.pos > end - 1:
                    continue
                cov = pileupcolumn.n

                for pileupread in pileupcolumn.pileups:
                    alignment = pileupread.alignment
                    if read_len is None:
                        read_lens[alignment.rlen] = read_lens.get(alignment.rlen, 0) + 1

                    # pysam bug? qpos can point past the end of the sequence.
                    if pileupread.qpos >= len(alignment.seq):
                        continue

                    if alignment.seq[pileupread.qpos].lower() != allele_seq[pileupcolumn.pos - start + 1]:
                        cov = cov - 1
                    else:
                        reads_dict[alignment.qname] = alignment
                coverage[pileupcolumn.pos + 1] = cov

            # Determine if allele is entirely covered.
            gaps = any(coverage[i] == 0 for i in range(start, end + 1))

            if read_len is None and read_lens:
                read_len = min(read_lens.items(), key=operator.itemgetter(1))[0]

            if not gaps and (read_len is not None and 2 * allele_len > read_len):
                if seq:
                    reads = list(reads_dict.values())
                else:
                    num_reads = len(reads_dict)

        if seq:
            return reads
        return num_reads