def processPhasedData (vcf_filename, chromosome):
	"""Pair the SNPs of a phased VCF file with deletions and write the result.

	The variant lines of `vcf_filename` are read after the VCF headers and the
	column description line have been discarded. A line is treated as a SNP
	when its fourth column is exactly one character long. For every SNP whose
	allele frequency `snp_af` satisfies both `snp_af <= MAJOR_AF_THRESHOLD`
	and `1 - snp_af <= MAJOR_AF_THRESHOLD`, `maximumR` is asked for a deletion;
	when it returns one, a row is collected. All rows are handed to
	`writeToOutputFile` once the file has been processed.

	Row layout:
		[chromosome, type_snp, snp_pos, snp_type, snp_af, d['pos'],
		 d['length'], d['af'], max_r, p, dist_snp_del, dist_tss]
	where `type_snp` is 1 if the position is listed in the hit SNP positions
	returned by `readHitSNPs` and 0 otherwise.

	Parameters:
		vcf_filename -- name of the VCF file, passed to the `vcf` helpers
		chromosome   -- chromosome identifier, passed to `readHitSNPs`,
		                `hg19.read`, `vcf.returnDeletions` and the writer

	Returns nothing; the result is written by `writeToOutputFile`.
	"""
	# Only the positions are needed here; the second element is unpacked so
	# that the shape of the readHitSNPs result is still checked.
	[hit_snp_positions, _hit_snps] = readHitSNPs(chromosome)
	genes = hg19.read(chromosome)
	indices_children = vcf.returnColumnsOfChildren(vcf_filename)
	deletions = vcf.returnDeletions(chromosome, indices_children, MAJOR_AF_THRESHOLD)

	output = [] # one row per SNP that was paired with a deletion

	vcf_file = vcf.openVCFFile(vcf_filename)
	try:
		vcf.discardVCFHeaders(vcf_file)
		vcf_file.readline() # discard the column description header

		# Stream the handle line by line rather than loading the complete
		# file into memory with readlines().
		for variant in vcf_file:
			data = variant.split()
			if not data: # blank line, nothing to parse
				continue
			if len(data[3]) != 1: # variant is not a SNP
				continue

			snp_pos = int(data[1])
			phase = vcf.returnPhase(data, indices_children)
			snp_af = vcf.determineAlleleFrequency(phase)
			if not (snp_af <= MAJOR_AF_THRESHOLD and 1 - snp_af <= MAJOR_AF_THRESHOLD):
				continue

			snp_type = hg19.determineType(snp_pos, genes)
			dist_tss = hg19.distanceToTSS(snp_pos, genes)
			[max_r, p, d] = maximumR(snp_pos, phase, deletions)
			if d is None: # no deletion was returned for this SNP
				continue

			type_snp = 1 if snp_pos in hit_snp_positions else 0
			dist_snp_del = distanceSNP_CNV(snp_pos, d['pos'], d['length'])
			output.append([chromosome, type_snp, snp_pos, snp_type, snp_af, d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss])

		writeToOutputFile(output, chromosome)
	finally:
		# Release the handle even when a helper raises on a malformed line.
		vcf_file.close()
def preprocessData (vcf_filename, chromosome, individuals=('parents', 'children')):
	"""Pair hit and non-hit SNPs of a phased VCF file with deletions.

	The required data (hit SNP positions and records, already discarded SNP
	positions, genes, individual column indices and deletions) comes from
	`gatherData`. Variant lines are read after the VCF headers and the column
	description line have been discarded; a line is treated as a SNP when its
	fourth column is exactly one character long.

	For a SNP whose position is in the hit SNP positions, phase and allele
	frequency come from `processHitSNP`. Any other SNP is skipped if its
	position is in the discarded positions; otherwise phase and allele
	frequency come from `vcf.returnPhase` / `vcf.determineAlleleFrequency`.
	`returnSnpDeletionPair` then supplies the deletion `d`:

	* `d` is None  -> the SNP is counted as discarded and its position is
	  appended to the discarded positions list;
	* otherwise    -> a row is collected and the hit / non-hit counter is
	  incremented.

	Row layout:
		[chromosome, flag, snp_pos, snp_type, snp_af, d['pos'], d['length'],
		 d['af'], max_r, p, dist_snp_del, dist_tss, min_dist_snp_del,
		 g, g_n, a, b, c, e]
	where `flag` is 1 for a hit SNP and 0 for a non-hit SNP, `g, g_n` are the
	values from `gamma.returnGamma` and `a, b, c, e` those from
	`vcf.return2x2Table`.

	Parameters:
		vcf_filename -- name of the VCF file, passed to the `vcf` helpers
		chromosome   -- chromosome identifier, passed to `gatherData` and
		                the writer
		individuals  -- forwarded unchanged to `gatherData`

	Returns nothing; rows go to `writeToOutputFile` and the three counters
	are printed afterwards.
	"""
	[hit_snps_positions, hit_snps, discarded_snps_positions, genes, indices_individuals, deletions] = gatherData(vcf_filename, chromosome, individuals)

	output = [] # results are stored here and later written to file
	n_hit_snps, n_discarded_snps, n_nonhit_snps = 0, 0, 0

	vcf_file = vcf.openVCFFile(vcf_filename)
	try:
		vcf.discardVCFHeaders(vcf_file)
		vcf_file.readline() # discard the column description header

		# Stream the handle line by line rather than loading the complete
		# file into memory with readlines().
		for variant in vcf_file:
			data = variant.split()
			if not data: # blank line, nothing to parse
				continue
			if len(data[3]) != 1: # variant is not a SNP
				continue

			snp_pos = int(data[1])
			snp_type = hg19.determineType(snp_pos, genes)
			dist_tss = hg19.distanceToTSS(snp_pos, genes)

			is_hit = snp_pos in hit_snps_positions
			if is_hit:
				[phase, snp_af] = processHitSNP(snp_pos, data, hit_snps, indices_individuals)
			elif snp_pos in discarded_snps_positions:
				continue # previously discarded, ignore
			else:
				phase = vcf.returnPhase(data, indices_individuals)
				snp_af = vcf.determineAlleleFrequency(phase)

			[max_r, p, d, dist_snp_del, min_dist_snp_del] = returnSnpDeletionPair(snp_pos, phase, deletions)
			if d is None:
				n_discarded_snps += 1
				discarded_snps_positions.append(snp_pos)
				continue

			[g, g_n] = gamma.returnGamma(phase, d['phase'], snp_af, d['af'])
			[a, b, c, e] = vcf.return2x2Table(phase, d['phase'])
			if is_hit:
				n_hit_snps += 1
			else:
				n_nonhit_snps += 1
			# The flag is written as the integer 1/0, not as a bool.
			output.append([chromosome, 1 if is_hit else 0, snp_pos, snp_type, snp_af, d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss, min_dist_snp_del, g, g_n, a, b, c, e])

		writeToOutputFile(output, chromosome)
	finally:
		# Release the handle even when a helper raises on a malformed line.
		vcf_file.close()

	# Single-argument print: joining with ' ' reproduces the spacing of the
	# former comma-separated print statement and also parses on Python 3.
	print(' '.join(['# hit SNPs: ', str(n_hit_snps), '\t#discarded SNPs:', str(n_discarded_snps), '\t# non hit SNPs: ', str(n_nonhit_snps)]))
# Example #3
# 0
def processSNPs (vcf_filename, chromosome, individuals=('parents', 'children')):
	"""Write, SNP by SNP, each qualifying SNP of a phased VCF with its deletions.

	Variant lines are read after the VCF headers and the column description
	line have been discarded; a line is treated as a SNP when its fourth
	column is exactly one character long.

	For a SNP whose position is in the hit SNP positions returned by
	`pr.readHitSNPs`, phase and allele frequency come from `pr.processHitSNP`
	and the record is labelled 'HITSNP'. Any other SNP is skipped if its
	position is in the discarded positions; otherwise phase and allele
	frequency come from `vcf.returnPhase` / `vcf.determineAlleleFrequency`
	and the record is labelled 'NONHITSNP'.

	Only SNPs with `1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD`
	are written. The value passed to `writeSNPToOutputFile` is a list whose
	first element is `[label, snp_pos, snp_af, snp_type, dist_tss]`, followed
	by the result of `returnDeletions` when that result is non-empty.

	Parameters:
		vcf_filename -- name of the VCF file, passed to the `vcf` helpers
		chromosome   -- chromosome identifier, passed to the readers and
		                the writer
		individuals  -- forwarded unchanged to `vcf.returnColumns`

	Returns nothing; output is produced by `writeSNPToOutputFile`.
	"""
	# gather the required data
	[hit_snps_positions, hit_snps, discarded_snps_positions] = pr.readHitSNPs(chromosome)
	genes = hg19.read(chromosome)
	indices_individuals = vcf.returnColumns(vcf_filename, individuals)
	deletions = vcf.returnDeletions(chromosome, indices_individuals, MAJOR_AF_THRESHOLD)

	vcf_file = vcf.openVCFFile(vcf_filename)
	try:
		vcf.discardVCFHeaders(vcf_file)
		vcf_file.readline() # discard the column description header

		# Stream the handle line by line rather than loading the complete
		# file into memory with readlines().
		for variant in vcf_file:
			data = variant.split()
			if not data: # blank line, nothing to parse
				continue
			if len(data[3]) != 1: # variant is not a SNP
				continue

			snp_pos = int(data[1])
			snp_type = hg19.determineType(snp_pos, genes)
			dist_tss = hg19.distanceToTSS(snp_pos, genes)

			if snp_pos in hit_snps_positions:
				label = 'HITSNP'
				[phase, snp_af] = pr.processHitSNP(snp_pos, data, hit_snps, indices_individuals)
			elif snp_pos in discarded_snps_positions:
				continue # discarded SNP, ignore
			else:
				label = 'NONHITSNP'
				phase = vcf.returnPhase(data, indices_individuals)
				snp_af = vcf.determineAlleleFrequency(phase)

			if not (1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD):
				continue

			snp_output = [[label, snp_pos, snp_af, snp_type, dist_tss]]
			dels = returnDeletions(snp_pos, snp_af, phase, deletions) # TODO implement
			if len(dels) > 0:
				snp_output.append(dels)
			writeSNPToOutputFile(snp_output, chromosome)
	finally:
		# Release the handle even when a helper raises on a malformed line.
		vcf_file.close()
def processHitSNP (snp_pos, data, hit_snps, indices_individuals):
	"""Return `[phase, snp_af]` for the hit SNP located at `snp_pos`.

	The first entry of `hit_snps` whose 'pos' equals `snp_pos` is used: its
	'hit_allele' is passed to `vcf.returnPhaseWithAllele` together with
	`data` and `indices_individuals`, and the allele frequency is derived
	from the resulting phase with `vcf.determineAlleleFrequency`.

	Returns None when no entry of `hit_snps` has the requested position.
	"""
	matching = next((candidate for candidate in hit_snps if candidate['pos'] == snp_pos), None)
	if matching is None:
		return None
	phase = vcf.returnPhaseWithAllele(data, indices_individuals, matching['hit_allele'])
	return [phase, vcf.determineAlleleFrequency(phase)]