Пример #1
0
def parse1KGvcf(vcffile, plines, genotypedboutput, refdboutput, altdboutput, arrayfile='mark/arraypool/25M1.1'):
	'''
	Extract per-sample allele counts from a VCF for the SNPs listed in an array file.

	parse1KGvcf('../1000GenomesData/CEU.low_coverage.2010_09.genotypes.vcf' , p1lines, 'testoutput', 'testoutputRef', 'testoutputAlt')

	DEFAULT: only take Illumina 2.5M snps from genotype, assume they are consistent enough

	vcffile: path of the VCF file, read through vcf.Reader
	plines: passed to gl.jsonload; the result is iterated as sample names
	genotypedboutput: path of the text output. One line per kept SNP:
		"chrom:pos,count,count,..." with one count per sample, in the order of
		the loaded sample list. A SNP is written only if the counts sum to > 0.
	refdboutput: passed to gl.jsondump with the {"chrom:pos": REF} dict
	altdboutput: passed to gl.jsondump with the {"chrom:pos": first ALT} dict
	arrayfile: tab-separated array file; fields 9 and 10 (0-based) of every line
		are joined with ':' to form the SNP keys that are kept
		(presumably chromosome and position -- confirm against the file format)

	Records without a 'GP' entry in INFO are skipped. The 'GP' value is split
	on ':' and its first two parts are used as chrom and pos.
	'''

	# Keys ("field9:field10") of the SNPs that are on the array.
	asnps = set()
	with open(arrayfile, 'r') as afile:
		for l in afile:
			fields = l.split('\t')
			asnps.add(fields[9] + ':' + fields[10])

	print(len(asnps))

	ref = {}
	alt = {}
	# The reader pulls records lazily, so the VCF handle has to stay open for
	# the whole loop; the with-blocks make sure both files get closed (and the
	# output flushed) even if a record fails to parse.
	with open(vcffile, 'r') as vfile:
		vcf_reader = vcf.Reader(vfile)
		poollines = gl.jsonload(plines)
		with open(genotypedboutput, 'w') as outputfile:
			for record in vcf_reader:
				try:
					gp = record.INFO['GP'].split(':')
				except KeyError:
					continue
				snp = gp[0] + ':' + gp[1]
				if snp not in asnps:
					continue

				ref[snp] = str(record.REF)
				alt[snp] = str(record.ALT[0])

				counts = []
				sm = 0
				for s in poollines:
					# Looked up outside the try on purpose: an unknown sample
					# name is not treated as "no genotype".
					call = record.genotype(s)
					try:
						geno = call['GT']
					except KeyError:
						# NOTE(review): the sample is skipped without a
						# placeholder, so later columns shift left -- confirm
						# this is intended.
						print("no genotype")
						continue
					if geno:
						# Normalise all accepted separators to '/' so a GT
						# string without any separator can no longer reuse the
						# alleles of the previous sample.
						alleles = geno.replace('|', '/').replace('\\', '/').split('/')
						# Sum of the first two allele indices, as before; a
						# single-allele call contributes just that allele.
						an = sum(int(a) for a in alleles[:2])
						sm = sm + an
						counts.append(str(an))
					else:
						counts.append('0')
				if sm > 0:
					outputfile.write(','.join([snp] + counts) + '\n')

	gl.jsondump(ref, refdboutput)
	gl.jsondump(alt, altdboutput)
Пример #2
0
def getarraysnps(report, fgenoref, fgenoalt, outname, kwargs):
	'''
	Write "chrom:pos<TAB>frequency" lines for array SNPs whose alleles match the reference/alternate tables.

	test this with MKReport1bysnps.txt and output of parse1KGvcf function
	getarraysnps('MKReport1bysnps.txt', 'testoutputRef', 'testoutputAlt')
	In this version we totally ignore complements

	Array can't have duplicates (it does). We take the last one mentioned in the array
	report, because those are the 1kg ones rather than rsid

	report: path of the tab-separated array report
	fgenoref, fgenoalt: passed to gl.jsonload; the results are indexed by
		"chrom:pos" and compared with the alleles parsed from the SNP column
	outname: path of the output file (always created, possibly empty)

	kwargs (a plain dict, not **kwargs); either
	header -- column names separated by a literal backslash followed by 't';
		must contain "SNP", "Chr", "Position" and "Theta"
	or all of the 0-based column numbers
	snp
	chr
	pos
	theta

	For each report row, read last to first: rows whose chromosome is not
	"1".."22" are skipped; the frequency is Theta when the row's alleles equal
	(ref, alt), 1 - Theta when they are swapped, otherwise the row is skipped.
	Frequencies equal to 0 or 1 are dropped and only the first row seen per
	position is written.
	'''

	print(report)
	print(kwargs)
	with open(report) as infile:
		lines = infile.readlines()
	# Reversed so that the last mention of a duplicated position wins.
	lines.reverse()

	genoref = gl.jsonload(fgenoref)
	genoalt = gl.jsonload(fgenoalt)

	with open(outname, 'w') as output:
		try:
			header = kwargs['header']
		except KeyError:
			try:
				snpi = int(kwargs['snp'])
				chri = int(kwargs['chr'])
				posi = int(kwargs['pos'])
				thetai = int(kwargs['theta'])
			except KeyError:
				print("No header or column numbers provided provided")
				# Without column indices no row can be parsed; stop here
				# instead of failing on every row and hiding the error.
				return
		else:
			h = header.split('\\t')
			print(h)
			snpi = h.index("SNP")
			chri = h.index("Chr")
			posi = h.index("Position")
			thetai = h.index("Theta")

		# Built once instead of once per row.
		autosomes = set(str(c) for c in range(1, 23))
		snps = set()
		for l in lines:
			t = l.split('\t')
			try:
				chrom = t[chri]
				if chrom not in autosomes:
					continue
				snppos = chrom + ':' + t[posi]
				alleles = t[snpi].split('/')
				# Second character before the '/' and first character after it
				# (presumably the SNP column looks like "[A/G]" -- confirm).
				ref = alleles[0][1]
				alt = alleles[1][0]
				if genoref[snppos] == ref and genoalt[snppos] == alt:
					f = float(t[thetai])
				elif genoref[snppos] == alt and genoalt[snppos] == ref:
					f = 1 - float(t[thetai])
				else:
					continue
			except (IndexError, KeyError, ValueError):
				# Short row, malformed SNP field, position missing from the
				# allele tables, or non-numeric Theta: skip the row. Anything
				# else (I/O errors, interrupts) is allowed to propagate.
				continue
			if f != 0 and f != 1 and snppos not in snps:
				snps.add(snppos)
				output.write(snppos + '\t' + str(f) + '\n')