예제 #1
0
def snp_parser(in_yaml, outfile_parsed, vcf_files, bam_file_list, N, IC, min_coverage, min_samples, filter_complex=True):
    """Append linked, well-supported, single-allele SNPs to ``outfile_parsed``.

    For every transcript in the YAML file, the SNPs flagged '1' by
    ``get_linkage`` are inspected, provided the transcript has more than one
    entry, its first linkage key is longer than one character and at least
    90% of its linkage flags are '1'.  A SNP is written when all of the
    following hold:

    * ``vcf[key][3][snp][0]`` lies in ``[N/2 - IC, N/2 + IC)``;
    * at least 90% of the values returned by ``quality_filter`` are >= 30;
    * ``vcf[key][3][snp]`` has exactly one element (not multiallelic);
    * ``filter_complex`` equals True and at least ``min_samples`` samples
      have a coverage of ``min_coverage`` or more at the position.

    Each accepted SNP becomes one tab-separated line: transcript, position,
    ``vcf[key][1][snp]``, ``vcf[key][2][snp][0]``, the coverage mapping and
    ``vcf[key][4][snp]``.

    Args:
        in_yaml: Path of the YAML file to parse.  Assumed to map a transcript
            name to parallel per-SNP lists (index 0 = positions) -- TODO
            confirm against the code that produces it.
        outfile_parsed: Path of the output file; created if missing and
            always opened in append mode.
        vcf_files: Passed through unchanged to ``quality_filter``.
        bam_file_list: Path of a text file with one BAM name per line.  The
            digits of each name form the sample id; ids shorter than two
            characters are dropped.
        N: Converted with ``int``; half of it is the centre of the accepted
            window (presumably the number of samples -- verify with caller).
        IC: Converted with ``int``; half-width of the accepted window.
        min_coverage: Minimum coverage a sample needs to count as covered.
        min_samples: Minimum number of covered samples per position.
        filter_complex: Nothing is written unless this compares equal to True.

    Returns:
        None.  Results are appended to ``outfile_parsed`` and progress
        messages are printed.
    """
    import re
    import yaml

    # Single-argument print() gives the same output on Python 2 and 3.
    print(' '.join([str(arg) for arg in (
        in_yaml, outfile_parsed, vcf_files, bam_file_list,
        N, IC, min_coverage, min_samples)]))

    # Make sure the output file exists even when no SNP passes the filters.
    with open(outfile_parsed, 'a'):
        pass

    # NOTE(review): yaml.load can build arbitrary Python objects; switch to
    # yaml.safe_load if in_yaml may come from an untrusted source.
    with open(in_yaml, 'r') as handle:
        vcf = yaml.load(handle.read())

    non_digit = re.compile(r'[\D]')
    with open(bam_file_list, 'r') as handle:
        sample_ids = [non_digit.sub('', bam) for bam in handle.read().split('\n')]
    samples = [sample_id for sample_id in sample_ids if len(sample_id) > 1]

    # Loop invariant: accepted values are N/2 - IC (inclusive) up to
    # N/2 + IC (exclusive).
    window = range(int(N) // 2 - int(IC), int(N) // 2 + int(IC))

    for key in vcf.keys():  # for all chromosomes (transcripts)
        linkage = get_linkage(key)
        if len(vcf[key]) <= 1:
            continue
        linkage_keys = list(linkage.keys())
        if len(linkage_keys) < 1 or len(linkage_keys[0]) <= 1:
            continue
        flags = list(linkage.values())
        # Require at least 90% of the SNPs to be under linkage.
        if sum(flag == '1' for flag in flags) < 0.9 * len(flags):
            continue

        positions = vcf[key][0]
        # Resolve all indices up front so an unknown position fails before
        # anything is written for this transcript.
        linked_indices = [positions.index(int(pos))
                          for pos in linkage_keys if linkage[pos] == '1']

        for idx in linked_indices:
            position = positions[idx]
            if vcf[key][3][idx][0] not in window:
                continue
            # Fetch the qualities once; the result is used twice.
            qualities = quality_filter(key, position, vcf_files)
            if sum(qual >= 30 for qual in qualities) < 0.9 * len(qualities):
                continue
            if len(vcf[key][3][idx]) != 1:  # skip multiallelic SNPs
                continue
            # Compared with ``== True`` on purpose: truthy values that are
            # not equal to True (e.g. a non-empty string) keep disabling
            # the output, as before.
            if not filter_complex == True:
                continue

            # Coverage of the position in every sample.
            coverage = position_coverage(key, position, samples, bam_file_list)
            threshold = int(min_coverage)
            covered = sum(cov >= threshold for cov in coverage.values())
            if covered >= int(min_samples):
                with open(outfile_parsed, 'a') as out:
                    out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        key, position, vcf[key][1][idx], vcf[key][2][idx][0],
                        coverage, vcf[key][4][idx]))
            else:
                print('min coverage not reached for all files or the variants are complex!')
예제 #2
0
def snp_parser(in_yaml, outfile_parsed, N, IC, min_coverage, min_samples, vcf_files, bam_file_list, linkageFile='linkageValidation.txt'):
    """Write haplotype-resolved SNP coverage per transcript and per sample.

    The linkage file determines whether the SNPs observed for a gene are in
    linkage over all samples, indicated as '1' for a SNP; '0' means no
    linkage over samples, which might be caused by reads from another
    transcript mapping to the same gene model sequence.  Such transcripts
    should be avoided, so a transcript is only processed when it has more
    than one entry, at least two SNPs flagged '1' and at least 90% of its
    linkage flags are '1'.

    For each processed transcript the two index sets returned by
    ``haplotyper`` are appended to ``haplotype_lists.out``.  For every SNP in
    those sets the samples listed in ``vcf[key][4][snp]`` (the carriers) and
    all remaining samples are measured with ``position_coverage``.  Carrier
    coverage is added to the per-sample "alt" totals and the coverage of the
    remaining samples to the "ref" totals.  For SNPs of the second set the
    carriers are stored under the "ref" label, because their REF nucleotide
    is the one linked to the ALT nucleotides of the first set.

    A transcript line is appended to ``outfile_parsed`` when at least one
    haplotype SNP exists and every haplotype position has at least
    ``min_samples`` samples with a coverage of ``min_coverage`` or more.
    Finally the per-sample totals are appended to
    ``alt_observations.haplotypes`` and ``ref_observations.haplotypes``.

    Args:
        in_yaml: Path of the YAML file to parse.  Assumed to map a transcript
            name to parallel per-SNP lists (index 0 = positions, index 4 =
            sample names) -- TODO confirm against the producer.
        outfile_parsed: Path of the output file; created if missing and
            always opened in append mode.
        N: Only echoed in the start-up message.
        IC: Only echoed in the start-up message.
        min_coverage: Minimum coverage a sample needs to count as covered.
        min_samples: Minimum number of covered samples per position.
        vcf_files: Passed through unchanged to ``haplotyper``.
        bam_file_list: Path of a text file with one BAM name per line.  The
            digits of each name form the sample id; ids shorter than two
            characters are dropped.
        linkageFile: Passed through unchanged to ``get_linkage``.

    Returns:
        None.  All results are written to files and progress is printed.
    """
    import re
    import yaml

    # Single-argument print() gives the same output on Python 2 and 3.
    print(' '.join([str(arg) for arg in (
        'running with arguments ' + in_yaml, outfile_parsed, vcf_files,
        bam_file_list, N, IC, min_coverage, min_samples)]))

    # Make sure the output file exists even when no transcript qualifies.
    with open(outfile_parsed, 'a'):
        pass

    # NOTE(review): yaml.load can build arbitrary Python objects; switch to
    # yaml.safe_load if in_yaml may come from an untrusted source.
    with open(in_yaml, 'r') as handle:
        vcf = yaml.load(handle.read())

    non_digit = re.compile(r'[\D]')
    with open(bam_file_list, 'r') as handle:
        sample_ids = [non_digit.sub('', bam) for bam in handle.read().split('\n')]
    samples = [sample_id for sample_id in sample_ids if len(sample_id) > 1]

    # Per-sample totals of alternative / reference observations.
    a_os = {sample: 0 for sample in samples}
    r_os = {sample: 0 for sample in samples}
    print(r_os)
    print(a_os)

    for key in vcf.keys():  # for all chromosomes (transcripts)
        linkage = get_linkage(key, linkageFile)
        print(key)
        print(linkage)
        if len(vcf[key]) <= 1 or not linkage:
            continue
        linked = [snp for snp in linkage.keys() if linkage[snp] == '1']
        if len(linked) < 2:
            continue
        flags = list(linkage.values())
        # Require at least 90% of the SNPs to be under linkage.
        if sum(flag == '1' for flag in flags) < 0.9 * len(flags):
            continue

        haplo = haplotyper(vcf, key, linked, vcf_files, samples)  # haplotype parsing function
        positions = vcf[key][0]
        with open('haplotype_lists.out', 'a') as out:
            out.write(key + '\t' + str([positions[idx] for idx in haplo[0]])
                      + '\t' + str([positions[idx] for idx in haplo[1]]) + '\n')

        alt_samples = dict()
        ref_samples = dict()
        alt_coverage = dict()
        ref_coverage = dict()

        # haplo[0]: positions whose ALT nucleotide defines the haplotype, so
        # the carriers are the "alt" samples.  haplo[1]: positions whose REF
        # nucleotide is linked to those ALT nucleotides, so the carriers are
        # stored as "ref" samples.  In both cases carrier reads count as
        # alternative observations and all other reads as reference ones.
        for indices, carriers_are_alt in ((haplo[0], True), (haplo[1], False)):
            for idx in indices:
                pos = positions[idx]
                carriers = [non_digit.sub('', sample) for sample in vcf[key][4][idx]]
                carrier_set = set(carriers)
                others = [sample for sample in samples if sample not in carrier_set]
                carrier_cov = position_coverage(key, pos, carriers, bam_file_list)
                other_cov = position_coverage(key, pos, others, bam_file_list)
                for sample in carriers:
                    a_os[sample] += carrier_cov[sample]
                for sample in others:
                    r_os[sample] += other_cov[sample]
                if carriers_are_alt:
                    alt_samples[pos], ref_samples[pos] = carriers, others
                    alt_coverage[pos], ref_coverage[pos] = carrier_cov, other_cov
                else:
                    ref_samples[pos], alt_samples[pos] = carriers, others
                    ref_coverage[pos], alt_coverage[pos] = carrier_cov, other_cov

        if not haplo[0] and not haplo[1]:
            continue

        # Every haplotype position must be covered by at least min_coverage
        # reads in at least min_samples samples.  Alt and ref samples
        # together make up all samples, so both counts are added up.
        threshold = int(min_coverage)
        required = int(min_samples)
        well_covered = all(
            sum(depth >= threshold for depth in alt_coverage[pos].values())
            + sum(depth >= threshold for depth in ref_coverage[pos].values())
            >= required
            for pos in alt_coverage)
        if not well_covered:
            continue

        snp_indices = haplo[0] | haplo[1]
        with open(outfile_parsed, 'a') as out:
            out.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                key,
                [len(vcf[key][4][idx]) for idx in snp_indices],
                [positions[idx] for idx in snp_indices],
                [vcf[key][2][idx] for idx in snp_indices],
                [alt_coverage[positions[idx]] for idx in snp_indices],
                [ref_coverage[positions[idx]] for idx in snp_indices],
                alt_samples,
                ref_samples))
        print('RUN COMPLETED!!!')

    # Only touch the summary files when there is something to report.
    if samples:
        with open('alt_observations.haplotypes', 'a') as out:
            out.write(''.join([str(sample) + ' ' + str(a_os[sample]) + '\n' for sample in samples]))
        with open('ref_observations.haplotypes', 'a') as out:
            out.write(''.join([str(sample) + ' ' + str(r_os[sample]) + '\n' for sample in samples]))