示例#1
0
    def confirm_clipped_reads(self, region, breakpoint, novel_seq, min_clipped=5):
        """Count reads in the region whose clipping supports the novel sequence.

        Every read returned by ``self.bam.fetch`` for ``region`` (indexed as
        region[0], region[1], region[2]) is passed to ``self.get_clipped_seq``
        together with ``breakpoint``.  The result is indexed as a pair: element
        0 is compared against 'start', element 1 is the clipped sequence.

        A read is counted when
          * a clipped sequence was found and it is longer than ``min_clipped``
            (strictly greater), and
          * the overlap reported by ``seq_overlap`` between the read and
            ``novel_seq`` exceeds the threshold described below.

        Returns the number of reads counted.
        """
        supporting = 0

        for aln in self.bam.fetch(region[0], region[1], region[2]):
            clip = self.get_clipped_seq(aln, breakpoint)

            # nothing clipped at the breakpoint, or the clip is too short
            if not clip or len(clip[1]) <= min_clipped:
                continue

            # the side on which the read is clipped decides which sequence
            # is given to seq_overlap first
            if clip[0] == 'start':
                first, second = novel_seq, aln.seq
            else:
                first, second = aln.seq, novel_seq

            # NOTE(review): the threshold is the len() of the whole value
            # returned by get_clipped_seq, not of the clipped bases held in
            # element 1 -- confirm this is intended
            if seq_overlap(first, second) > len(clip):
                supporting += 1

        return supporting
示例#2
0
    def confirm_novel(self, region, novel_seq, breakpoint, min_olap=10, max_unmapped=1000):
        """Count read pairs whose unmapped mate supports the novel sequence.

        Reads fetched from ``self.bam`` for ``region`` (indexed as region[0],
        region[1], region[2]) that are paired and have an unmapped mate are
        collected, keyed by query name.  Collection stops once
        ``max_unmapped`` such reads have been seen.

        The mate sequences are obtained from ``self.get_unmapped_mate_seq``.
        A pair is counted when its mate sequence, or the reverse complement
        of it, either
          * occurs in ``novel_seq`` (case-insensitive, literal match), or
          * has a ``seq_overlap`` with ``novel_seq`` strictly greater than
            ``min_olap``.

        Reads that have no entry in the mate-sequence mapping are ignored.

        Returns the number of pairs counted (0 if no candidate was found).
        """
        # capture reads with unmapped mates, without going over the maximum
        anchors = {}
        examined = 0
        for read in self.bam.fetch(region[0], region[1], region[2]):
            if read.is_paired and read.mate_is_unmapped:
                examined += 1
                anchors[read.qname] = read

                if examined == max_unmapped:
                    break

        if not anchors:
            return 0

        # this assumes unmapped mates are stored under the same location as
        # their mapped mates
        anchored_reads = list(anchors.values())
        mate_seq = self.get_unmapped_mate_seq(region[0], anchored_reads)

        found_mates = 0
        for read in anchored_reads:
            # orientation filter: skip forward-strand reads when the
            # breakpoint is region[1] and reverse-strand reads when it is
            # region[2]
            if (breakpoint == region[1] and not read.is_reverse) or \
               (breakpoint == region[2] and read.is_reverse):
                continue

            if read.qname not in mate_seq:
                continue
            mate = mate_seq[read.qname]

            # the sequence is data, not a pattern: escape it so characters
            # such as '.' or '*' are matched literally
            if re.search(re.escape(mate), novel_seq, re.IGNORECASE):
                found_mates += 1
                continue

            mate_rc = reverse_complement(mate)
            if re.search(re.escape(mate_rc), novel_seq, re.IGNORECASE):
                found_mates += 1

            # mate (in either orientation) is not entirely embedded in the
            # novel sequence: accept it if it overlaps the novel sequence
            # by more than min_olap bases
            elif seq_overlap(mate, novel_seq) > min_olap:
                found_mates += 1
            elif seq_overlap(mate_rc, novel_seq) > min_olap:
                found_mates += 1

        return found_mates