예제 #1
0
def checkRef(name):
	"""Check stored Ref/Alt alleles of each SNP against the hg19 sequence.

	Loads the allele tables ``name + 'RefT'`` and ``name + 'AltT'`` through
	``glob.json``. Every key of the Ref table is split on the literal
	``'pos'``: the part before it indexes hg19 and the part after it is the
	position. The hg19 base at that position is compared (case-insensitively)
	with the Ref and Alt alleles.

	Side effects: prints progress, and dumps the two result lists to
	``name + 'flips'`` and ``name + 'errors'`` through ``glob.dump``.

	Returns:
		``[flip, errors]`` -- keys whose hg19 base equals the Alt allele, and
		keys whose hg19 base equals neither allele.
	"""
	ref_path = name + 'RefT'
	alt_path = name + 'AltT'
	genome = worldbase.Bio.Seq.Genome.HUMAN.hg19(download=True)
	print("Loaded hg19")
	ref_alleles = glob.json(ref_path)
	print("Loaded Ref")
	alt_alleles = glob.json(alt_path)
	print("Loaded Alt")

	to_flip, mismatched = [], []
	for key in ref_alleles.keys():
		print(key + '\t' + name)
		parts = key.split('pos')
		# The key's position is shifted down by one to index the sequence
		# (presumably 1-based positions in the key -- confirm with the writer).
		genome_base = str(genome[parts[0]][int(parts[1]) - 1]).upper()
		ref_base = ref_alleles[key].upper()
		alt_base = alt_alleles[key].upper()
		if genome_base != ref_base:
			if genome_base == alt_base:
				to_flip.append(key)
			else:
				print("Error: Neither Ref nor Alt of SNP corresponds to hg19 sequence")
				mismatched.append(key)

	glob.dump(to_flip, name + 'flips')
	glob.dump(mismatched, name + 'errors')
	return [to_flip, mismatched]
예제 #2
0
def main(argv):
	"""Dispatch the processing steps selected by command-line options.

	Args:
		argv: option list handed to ``getopt.getopt`` (short options
			``a:ghc:i``, long options ``report=`` and ``genonames=``).

	Options handled:
		-a REPORT  extract the array SNPs, check them against the reference,
		           flip/drop frequencies accordingly, filter zeros and write
		           the tab-separated output.
		-g         run ``processgenotypes()``.
		-h         run ``processhapmap()``.
		-c NAMES   combine genotypes for NAMES (``names1KG`` when NAMES is the
		           empty string) with the SNPs loaded from 'Array251Msnps'.
		-i         append 'hapmap' to ``names``.

	Raises:
		getopt.GetoptError: on an unknown option or a missing option argument.
	"""
	opts, args = getopt.getopt(argv, "a:ghc:i", ["report=", "genonames="])
	for opt, arg in opts:
		if opt == '-a':
			report = arg
			print("processing array, getting snps {0}".format(report))
			getarraysnps(arg)
			f, e = parsegenotypes.checkRef(report)
			# Pass the error list too: flipArray removes SNPs that matched
			# neither allele, and its signature requires the third argument.
			parsegenotypes.flipArray(report, f, e)
			parsegenotypes.filterzeros(report)
			parsegenotypes.printtabarray(report)

		elif opt == '-g':
			processgenotypes()
		elif opt == '-h':
			processhapmap()
		elif opt == '-c':
			snps = glob.json('Array251Msnps')
			genofiles = names1KG if arg == '' else arg
			combinegenos(genofiles, snps)

		elif opt == "-i":
			# list.append returns None, so append first and then bind the list.
			# NOTE(review): `names` is not defined in the visible code -- confirm
			# it is a module-level list (possibly meant to be names1KG).
			names.append('hapmap')
			genofiles = names
예제 #3
0
def getpoollines(genofile, pool, out = "poolgenotype"):
	"""Write the genotype columns of the pooled cell lines to a CSV file.

	Args:
		genofile: path of the genotype table. The first line is a header whose
			second tab-separated field is a comma-separated list of line
			names; every other line is ``<key>\\t<g1>,<g2>,...`` with integer
			genotype values.
		pool: name passed to ``glob.json``; the loaded value is iterated as
			the sequence of line names to keep.
		out: path of the file to write.

	Output format: one header row with the pooled names joined by commas, then
	one row ``<key>,<g...>`` for every input row that has at least one
	non-zero genotype (the test looks at all columns, not only pooled ones).

	Raises:
		ValueError: if a pool name is missing from the header or a genotype
			value is not an integer.
	"""
	pool = glob.json(pool)
	with open(genofile) as g:
		lines = g.readlines()

	ln = [x.strip('\n') for x in lines[0].split('\t')[1].split(',')]
	print(pool)
	poolinds = [ln.index(x) for x in pool]

	# The context manager guarantees the output is flushed and closed.
	with open(out, 'w') as output:
		output.write(','.join(ln[i] for i in poolinds) + '\n')

		for l in lines[1:]:
			if not l.strip():
				# tolerate blank lines (e.g. a trailing newline at end of file)
				continue
			t = l.split('\t')
			gs = [x.strip('\n') for x in t[1].split(',')]
			if any(int(x) != 0 for x in gs):
				# Select columns in pool order so that the values line up with
				# the header written above.
				output.write(t[0] + ',' + ','.join(gs[i] for i in poolinds) + '\n')
예제 #4
0
def printtabarray(arrayname):
	"""Write the array SNP frequencies as a tab-separated file.

	Loads the mapping stored under ``arrayname + 'freq'`` through ``glob.json``
	and writes one ``<snp>\\t<frequency>`` line per entry to
	``arrayname + 'Rinput'``. The output is meant to be analyzed by R to find
	cell line frequencies.
	"""
	freq = glob.json(arrayname + 'freq')
	# Use a context manager so the file is flushed and closed on every path.
	with open(arrayname + 'Rinput', 'w') as output:
		for snp in freq:
			output.write(snp + '\t' + str(freq[snp]) + '\n')
예제 #5
0
def filterzeros(arrayname):
	"""Drop SNPs whose frequency is 0 or NaN from the stored frequency table.

	Loads ``arrayname + 'freq'`` through ``glob.json``, removes every entry
	whose value equals 0 or is NaN, and dumps the result back under the same
	name through ``glob.dump``.
	"""
	freq = glob.json(arrayname + 'freq')
	# Iterate over a snapshot of the keys: deleting while walking a live
	# keys() view raises RuntimeError.
	for snp in list(freq.keys()):
		value = freq[snp]
		if value == 0 or math.isnan(value):
			del freq[snp]
	glob.dump(freq, arrayname + 'freq')
def parsehapmapchrom(chrom):
	"""Extract alt-allele counts of selected cell lines from a HapMap file.

	Reads ``'../genotypes/hapmapchr' + str(chrom)``, a space-separated table
	whose header row contains the columns ``rs#`` and ``alleles`` plus one
	column per individual. For every row whose rsID is present in the
	``rsid2poshash`` mapping, one line ``<snppos>\\t<c1>,<c2>,...,`` is written
	to ``<hapmapfile>genotype``. Each count is the number of characters in
	that individual's genotype equal to the alt allele (the second of the
	``ref/alt`` pair in the ``alleles`` column).

	Args:
		chrom: chromosome identifier appended to the input file name.

	Returns:
		``[refhash, althash]`` -- dicts mapping SNP position to the upper-cased
		ref and alt allele of every SNP that was written.
	"""
	hapmapfile = '../genotypes/hapmapchr' + str(chrom)
	if 'rsid2poshash' not in os.listdir('./'):
		# NOTE(review): assumes makerhash() writes 'rsid2poshash' into the
		# current directory -- confirm. The table is loaded on both paths so
		# that rshash is always bound.
		makerhash()
	rshash = glob.json('rsid2poshash')
	people = ['NA19140','NA19154','NA19173','NA19203','NA19206','NA19211','NA19222']

	with open(hapmapfile) as f:
		lines = f.readlines()
	# Strip the line ending so the last column name can be matched as well.
	header = lines[0].rstrip('\r\n').split(' ')

	rsidi = header.index('rs#')
	snpi = header.index('alleles')

	peoplewithgenos = [p for p in people if p in header]
	print(peoplewithgenos)
	peoplei = [header.index(p) for p in peoplewithgenos]

	refhash = {}
	althash = {}
	with open(hapmapfile + 'genotype', 'w') as out:
		for l in lines[1:]:
			t = l.rstrip('\r\n').split(' ')
			rsid = t[rsidi]
			snps = t[snpi].split('/')
			ref = snps[0].upper()
			alt = snps[1].upper()

			genocount = [
				sum(1 for allele in t[i].upper() if allele == alt)
				for i in peoplei
			]
			# SNPs without a known position are skipped.
			try:
				snppos = rshash[rsid]
			except KeyError:
				continue

			# Every count is followed by a comma (the trailing comma is kept).
			out.write(snppos + '\t' + ''.join(str(c) + ',' for c in genocount) + '\n')
			refhash[snppos] = ref
			althash[snppos] = alt
	return [refhash, althash]
예제 #7
0
def flipArray(arrayname, flip, error=()):
	"""Flip and prune the stored array SNP frequencies.

	Loads the frequency mapping ``arrayname + 'freq'`` through ``glob.json``
	(it is constructed by getarraysnps()), replaces the frequency of every SNP
	in ``flip`` by ``1 - freq``, deletes every SNP in ``error``, and dumps the
	mapping back under the same name through ``glob.dump``.

	Args:
		arrayname: prefix of the stored frequency table.
		flip: iterable of SNP keys whose frequency must be inverted.
		error: iterable of SNP keys to remove; defaults to none, so callers
			that pass only two arguments keep working.

	Raises:
		Whatever ``glob.json`` raises when the frequency table cannot be
		loaded (after printing a diagnostic), and KeyError when a listed SNP
		is absent from the table.
	"""
	try:
		arrayfreq = glob.json(arrayname + 'freq')
	except Exception:
		# Report the problem and propagate it; continuing would only fail
		# later with an unrelated NameError on arrayfreq.
		print("No array snp frequency file")
		raise
	for snp in flip:
		arrayfreq[snp] = 1 - arrayfreq[snp]
	for snp in error:
		del arrayfreq[snp]
	glob.dump(arrayfreq, arrayname + 'freq')
예제 #8
0
def filterSNPs(name):
	"""Remove SNPs whose alt allele is the complement of, or equal to, ref.

	Loads ``name + 'Ref'`` and ``name + 'Alt'`` through ``glob.json``, deletes
	from both tables every SNP whose upper-cased alt allele equals either
	``glob.compl`` of the ref allele or the ref allele itself, and dumps the
	filtered tables to ``name + 'RefT'`` and ``name + 'AltT'``.

	Returns:
		The list of removed SNP keys.
	"""
	reffile = name + 'Ref'
	altfile = name + 'Alt'
	ref = glob.json(reffile, '')
	alt = glob.json(altfile, '')
	print("Loaded Ref {0}, Alt {1}".format(len(ref), len(alt)))

	complsnps = []
	# Walk a snapshot of the keys: entries are deleted inside the loop, which
	# is not allowed while iterating a live keys() view.
	for snppos in list(ref.keys()):
		refallele = ref[snppos].upper()
		altallele = alt[snppos].upper()
		if glob.compl[refallele] == altallele or refallele == altallele:
			complsnps.append(snppos)
			del ref[snppos]
			del alt[snppos]

	print(len(ref))
	print(len(alt))
	glob.dump(ref, reffile + 'T')
	glob.dump(alt, altfile + 'T')
	return complsnps
예제 #9
0
	
	# Each selected option triggers its step; the checks are independent, so
	# several steps can run in one invocation.
	if args.a:
		# Array pipeline: extract SNPs, check them against the reference, flip
		# or drop frequencies, filter zeros, then write the tab-separated output.
		print "processing array, getting snps {0}".format(args.a)
		report = args.a
		getarraysnps(report)
		[f, e] = parsegenotypes.checkRef(report)
		parsegenotypes.flipArray(report, f, e)
		parsegenotypes.filterzeros(report)
		parsegenotypes.printtabarray(report)
	if args.g:
		processgenotypes()
	if args.hapmap:
		processhapmap()
	if args.init1KG:
		# Combine the names1KG genotypes with the SNP list loaded below.
		#snps = glob.json('25M1.1snps')
		snps = glob.json('MKReportbySNP1.txtsnps')
		#combinegenos(names1KG, snps, 'Genos1kgArray25M')
		combinegenos(names1KG, snps, 'Genos1kgArrayOmni')

	if args.inithapmap:
		# NOTE(review): the meaning of the trailing argument 1 is not visible
		# here -- check combinegenos.
		snps = glob.json('Array25M1snps')
		combinegenos('hapmap', snps, 'hapmapGenosArray25M', 1)
		
	if args.pool:
		# Write the genotype columns of pool 'pool1' to 'pool1genotypeOmni'.
		#getpoollines('intercomb','pool1', 'pool1genotype')
		
		# new arrays 
		#getpoollines('Genos1kgArray25M', 'pool1', 'pool1genotype')

		#old arrays
		getpoollines('Genos1kgArrayOmni', 'pool1', 'pool1genotypeOmni')