예제 #1
0
    def output_adjacencies(self, adjs, out_file, format, header=None):
        """Write adjacencies to a text file, one adjacency per line.

        The optional ``header`` line is written first, followed by the column
        header line of the chosen format, followed by one line per adjacency.

        Args:
            adjs: (List) Adjacencies; each must provide ``as_tab()`` or
                ``as_bedpe()`` depending on ``format``
            out_file: (str) absolute path of output file
            format: (str) either "tab" or "bedpe"; any other value is ignored
                and no file is created
            header: (str) header string written before the column headers
        """
        # format -> (per-adjacency row method, Adjacency column-header method)
        writers = {
            'bedpe': ('as_bedpe', 'show_bedpe_headers'),
            'tab': ('as_tab', 'show_tab_headers'),
        }
        if format not in writers:
            return
        row_method, header_method = writers[format]

        # the context manager closes the file even if a row method raises
        with open(out_file, 'w') as out:
            if header is not None:
                out.write(header + '\n')
            out.write('%s\n' % getattr(Adjacency, header_method)())

            for adj in adjs:
                output = getattr(adj, row_method)()
                try:
                    out.write('%s\n' % output)
                except Exception:
                    # best effort: report the failure and carry on with the
                    # remaining adjacencies
                    sys.stdout.write("can't output Adjacency\n")
예제 #2
0
    def create_variants(self, adjs):
        """Create variants from adjacencies and store them in ``self.variants``.

        Adjacencies are claimed in stages; once an adjacency is part of a
        variant its id is recorded so that later stages skip it:

        1. insertions extracted from split alignments that are neither
           'trl' nor 'ins'
        2. imprecise insertions from the remaining non-inversion split
           alignments
        3. grouped inversions
        4. insertions extracted from translocations
        5. grouped reciprocal translocations
        6. every adjacency still unclaimed and not dubious becomes a
           single-adjacency variant named after its rearrangement

        Args:
            adjs: (List) Adjacencies
        """
        def track_adjs(used_ids, variants):
            """Record the ids of all adjacencies that belong to ``variants``."""
            if variants:
                for variant in variants:
                    used_ids.update(adj.id for adj in variant.adjs)

        self.variants = []
        adjs_ids_used = set()

        def claim(variants):
            """Store new variants and mark their adjacencies as used."""
            self.variants.extend(variants)
            track_adjs(adjs_ids_used, variants)

        def unused(rearrangement):
            """Return the unclaimed adjacencies of the given rearrangement."""
            return [adj for adj in adjs
                    if adj.rearrangement == rearrangement and adj.id not in adjs_ids_used]

        split_events = [adj for adj in adjs
                        if adj.rearrangement not in ('trl', 'ins') and adj.align_types[0] == 'split']
        ins_variants, _ = Adjacency.extract_interchrom_ins(split_events)
        claim(ins_variants)

        # special cases for imprecise insertions
        imprecise_candidates = [adj for adj in adjs
                                if adj.align_types[0] == 'split'
                                and adj.rearrangement != 'inv'
                                and adj.id not in adjs_ids_used]
        ins_variants, _ = Adjacency.extract_imprecise_ins(imprecise_candidates, debug=self.debug)
        claim(ins_variants)

        # handle inversions
        claim(Adjacency.group_inversions(unused('inv')))

        # convert translocations to insertions
        ins_variants, _ = Adjacency.extract_interchrom_ins(unused('trl'))
        claim(ins_variants)

        # group reciprocal translocations
        reciprocal_trls, _ = Adjacency.group_trls(unused('trl'))
        claim(reciprocal_trls)

        # Everything left over, including remaining non-dubious
        # translocations ('trl'.upper() gives 'TRL'), becomes its own variant.
        for adj in adjs:
            if adj.id not in adjs_ids_used and not adj.dubious:
                self.variants.append(Variant(adj.rearrangement.upper(), [adj]))
예제 #3
0
    def screen_realigns(self, use_realigns=False):
        """Realign probe sequences of adjacencies and screen the results.

        - genome and index_dir must have been set when the object was
          initialized, otherwise nothing is done
        - the original notes state the output is always "realign.fa" and
          "realign.bam"; that is decided inside Adjacency.realign and cannot
          be confirmed from this method
        - a variant is removed from ``self.variants`` as soon as the probe
          alignments of any one of its adjacencies fail
          gapped_align.screen_probe_alns (probe aligns completely to one
          location, or is not aligned with confidence)

        Args:
            use_realigns: (bool) forwarded unchanged to Adjacency.realign

        Returns:
            None. Exits the program via sys.exit if the realignment BAM
            cannot be opened.
        """
        if not self.genome or not self.index_dir:
            return None

        name_sep = '.'
        all_adjs = []
        for variant in self.variants:
            all_adjs.extend(variant.adjs)
        realign_bam_file = Adjacency.realign(all_adjs,
                                             self.out_dir,
                                             probe=True,
                                             contigs_fasta=self.contig_fasta,
                                             name_sep=name_sep,
                                             genome=self.genome,
                                             index_dir=self.index_dir,
                                             num_procs=self.num_procs,
                                             use_realigns=use_realigns,
                                             )
        try:
            bam = pysam.Samfile(realign_bam_file, 'rb')
        except Exception:
            sys.exit('Error parsing realignment BAM:%s' % realign_bam_file)

        # maps each query name (contig + separator + adjacency key) to the
        # variant and adjacency it was generated from
        query_to_variant = {}
        for variant in self.variants:
            for adj in variant.adjs:
                query = adj.contigs[0] + name_sep + adj.key()
                query_to_variant[query] = (variant, adj)

        def query_of(aln):
            """Rebuild the query key from the first two fields of the read name."""
            return name_sep.join(aln.qname.split(name_sep)[:2])

        failed_variants = set()
        try:
            # groupby only merges consecutive records, so alignments of one
            # query are assumed to be adjacent in the BAM -- TODO confirm
            for key, group in groupby(bam.fetch(until_eof=True), query_of):
                variant, adj = query_to_variant[key]

                # only read names that do not end in a digit are screened as
                # probe alignments
                probe_alns = [aln for aln in group if not aln.qname[-1].isdigit()]
                if not gapped_align.screen_probe_alns(adj.aligns[0], probe_alns, adj.align_types[0]):
                    if self.debug:
                        sys.stdout.write('probe align completely to one location or not aligned with confidence: %s\n' % key)
                    failed_variants.add(variant)
        finally:
            bam.close()

        for failed_var in failed_variants:
            self.variants.remove(failed_var)
예제 #4
0
    def find_adjs(self, min_ctg_cov, max_size=None, min_size=None, ins_as_ins=False, skip_acen=False, check_alt_paths=False, min_ctg_size=0, bad_coords=None, skip_contigs_file=None):
        """Go through the BAM file and identify adjacencies from split and gapped alignments.

        Alignments are grouped per contig. A contig with more than one
        alignment is checked for a chimeric (split) alignment path; events
        found across that path are screened, and each chimeric alignment is
        additionally scanned for small-scale events. The single unique best
        alignment of every contig is scanned for small-scale events as well.
        All adjacencies are then merged, optionally screened by coordinate
        and filtered by size.

        Args:
            min_ctg_cov: passed to split_align.find_chimera as ``min_coverage``
            max_size: (int) largest event size to keep, None for no limit
            min_size: (int) smallest event size to keep, None for no limit
            ins_as_ins: forwarded to gapped_align.find_adjs
            skip_acen: (bool) skip contigs whose chimeric alignments overlap
                the coordinates loaded from ``self.cytobands_file``
            check_alt_paths: forwarded to split_align.find_chimera
            min_ctg_size: (int) contigs with a shorter sequence are skipped
            bad_coords: (str) path handed to self.screen_by_coordinate; only
                used if the file exists
            skip_contigs_file: (str) path of a file listing one contig name
                per line; listed contigs are skipped. Only used if it exists

        Returns:
            (List) merged adjacencies, size-filtered when ``max_size`` and/or
            ``min_size`` is given
        """
        def find_events_in_single_align(align, contig_seq):
            """Find events within one alignment.

            Implemented as a sub-function so that small-scale events can be
            found on split alignments too. Events whose break region has low
            complexity are dropped when ``self.skip_simple_repeats`` is set.
            """
            adjs = gapped_align.find_adjs(align, contig_seq, False, ins_as_ins=ins_as_ins,
                                          query_fasta=self.contig_fasta, target_fasta=self.ref_fasta)

            kept = []
            for adj in adjs:
                if self.skip_simple_repeats and self.break_region_has_low_complexity(adj.chroms[0], adj.breaks):
                    if self.debug:
                        sys.stdout.write("remove contig %s %s potential simple-repeat %s:%s-%s\n" % (adj.contigs[0],
                                                                                                     adj.rearrangement,
                                                                                                     adj.chroms[0],
                                                                                                     adj.breaks[0],
                                                                                                     adj.breaks[1]))
                    continue
                kept.append(adj)

            return kept

        def is_align_in_acen(align, acen):
            """Check whether an alignment overlaps centromere ('acen') coordinates.

            Args:
                align: alignment (Alignment)
                acen: coordinates parsed from the UCSC cytobands file
                    (Dictionary) {chrom: [(start, end), (start, end)]}
            Returns True if overlapped, with every region widened by
            ``self.acen_buffer`` on both sides
            """
            if align.target not in acen:
                return False

            for (start, end) in acen[align.target]:
                padded_start = int(start) - self.acen_buffer
                padded_end = int(end) + self.acen_buffer
                if align.tstart <= padded_end and padded_start <= align.tend:
                    return True

            return False

        def create_set(list_file):
            """Create a set from the lines of a file, newline stripped."""
            with open(list_file, 'r') as handle:
                return set(line.strip('\n') for line in handle)

        def passes_size_filter(adj):
            """Apply the max_size/min_size limits to one adjacency.

            get_size() can return a non-integer (presumably when the size is
            undefined -- TODO confirm). Such adjacencies are rejected whenever
            max_size is given and kept when only min_size is given.
            """
            size = adj.get_size()
            has_int_size = type(size) is int

            if max_size is not None and min_size is not None:
                return has_int_size and min_size <= size <= max_size
            if max_size is not None:
                return has_int_size and size <= max_size
            return not has_int_size or size >= min_size

        acen_coords = None
        if skip_acen:
            acen_coords = get_acen_coords(self.cytobands_file)

        skip_contigs = None
        if skip_contigs_file and os.path.exists(skip_contigs_file):
            skip_contigs = create_set(skip_contigs_file)

        all_adjs = []
        # groupby only merges consecutive records, so alignments of one contig
        # are assumed to be adjacent in the BAM -- TODO confirm
        for contig, group in groupby(self.bam.fetch(until_eof=True), lambda x: x.qname):
            if self.debug:
                sys.stdout.write('contig %s\n' % contig)
            alns = list(group)
            contig_seq = self.contig_fasta.fetch(contig)

            if len(contig_seq) < min_ctg_size:
                if self.debug:
                    sys.stdout.write('%s(%d bp) less than min contig size %d bp\n' % (contig, len(contig_seq), min_ctg_size))
                continue

            if skip_contigs and contig in skip_contigs:
                if self.debug:
                    sys.stdout.write('%s skipped\n' % contig)
                continue

            if len(alns) > 1:
                chimeric_aligns, dubious = split_align.find_chimera(alns,
                                                                    self.bam,
                                                                    min_coverage=min_ctg_cov,
                                                                    check_alt_paths=check_alt_paths,
                                                                    debug=self.debug)
                if chimeric_aligns:
                    if acen_coords:
                        # the whole contig is skipped as soon as one of its
                        # chimeric alignments touches a centromere region
                        acen_hit = next((align for align in chimeric_aligns
                                         if is_align_in_acen(align, acen_coords)), None)
                        if acen_hit is not None:
                            if self.debug:
                                sys.stdout.write('skip contig %s because alignment is in centromere %s:%d-%d\n' % (contig,
                                                                                                                   acen_hit.target,
                                                                                                                   acen_hit.tstart,
                                                                                                                   acen_hit.tend
                                                                                                                   ))
                            continue

                    adjs = split_align.find_adjs(chimeric_aligns, contig_seq, dubious=dubious, debug=self.debug)

                    # indices of adjacencies to discard
                    bad = set()
                    for i, adj in enumerate(adjs):
                        # check if homol is simple repeat
                        if adj.homol_seq and adj.homol_seq[0] != '-' and self.is_homol_low_complexity(adj):
                            if self.debug:
                                sys.stdout.write("homol_seq is simple-repeat %s:%s\n" % (adj.contigs[0], adj.homol_seq[0]))
                            bad.add(i)

                        # check if event is simple repeat expansions
                        if self.skip_simple_repeats and self.is_novel_sequence_repeat(adj):
                            if self.debug:
                                sys.stdout.write("novel_seq is simple-repeat %s:%s\n" % (adj.contigs[0], adj.novel_seq))
                            bad.add(i)

                        # inversion with size of 1; discarded regardless of
                        # whether debug output is enabled
                        if adj.rearrangement == 'inv' and adj.get_size() <= 1:
                            if self.debug:
                                sys.stdout.write("inversion with unreasonable size %s:%d %s:%d-%d\n" % (adj.contigs[0], adj.get_size(),
                                                                                                        adj.chroms[0], adj.breaks[0], adj.breaks[1]))
                            bad.add(i)

                        # two consecutive adjacencies describing the same
                        # genomic event with different contig breaks: drop both
                        if i > 0:
                            prev = adjs[i - 1]
                            if adj.chroms == prev.chroms and\
                               adj.breaks == prev.breaks and\
                               adj.orients == prev.orients and\
                               adj.contig_breaks != prev.contig_breaks:
                                if self.debug:
                                    sys.stdout.write("%s has 2 contig_breaks for same event\n" % adj.contigs[0])
                                bad.add(i - 1)
                                bad.add(i)

                    all_adjs.extend(adj for i, adj in enumerate(adjs) if i not in bad)

                    # capture small-scale events within each chimeric alignment
                    for align in chimeric_aligns:
                        all_adjs.extend(find_events_in_single_align(align, contig_seq))

            best_align = gapped_align.find_single_unique(alns, self.bam, debug=self.debug)
            if best_align:
                all_adjs.extend(find_events_in_single_align(best_align, contig_seq))

        merged_adjs = Adjacency.merge(all_adjs)

        # screen out adjacencies that overlap segdups; the return value is not
        # used, so this presumably filters merged_adjs in place -- TODO confirm
        if bad_coords is not None and os.path.exists(bad_coords):
            self.screen_by_coordinate(merged_adjs, bad_coords)

        # size filtering
        if max_size is None and min_size is None:
            return merged_adjs
        return [adj for adj in merged_adjs if passes_size_filter(adj)]