Exemplo n.º 1
0
def permer(perm,randind):
	"""Permute the genotype columns, build a BIMBAM file and run GEMMA on it.

	Args:
		perm: permutation number; added to ``int(blocknum)*100`` to seed the
			random number generator, so each (block, perm) pair is reproducible.
		randind: list of 1-based column numbers (strings or ints) of the
			genotype columns. It is shuffled IN PLACE, so the caller's list is
			reordered as a side effect.

	Raises:
		ValueError: if ``randind`` is empty (there would be nothing to permute).

	Uses the module-level names ``blocknum``, ``currfiles``, ``genodir``,
	``hmdir``, ``pheno`` and ``ifier``. Each command string is handed to
	``ifier`` (presumably a shell runner -- confirm against its definition).
	"""
	if not randind:
		raise ValueError('randind must contain at least one column number')
	# First we permute the genotypes and write them out to a file (bimbam format)
	random.seed(perm + (int(blocknum)*100))
	shuffle(randind)
	snper = currfiles + '.snps'
	aer = currfiles + '.as'
	ger = currfiles + '.gs'
	subfile = currfiles + '_perm_sub.bimbam'
	permfile = currfiles + '_perm.bimbam'
	# `cut -f` always writes fields in input order, whatever the order of its
	# list, so it cannot permute columns. awk prints the fields in exactly the
	# order listed. Tab is used as separator because that is cut's default.
	fields = ','.join('$' + str(index) for index in randind)
	shuffler = ('zcat ' + genodir + 'ByChr/*.all*.gz | grep -f ' + snper + ' - | '
		+ "awk 'BEGIN{FS=OFS=\"\\t\"}{print " + fields + "}'"
		+ ' > ' + subfile
		+ '; paste ' + snper + ' ' + aer + ' ' + ger + ' ' + subfile + ' > ' + permfile)
	ifier(shuffler)
	# This runs gemma on the permuted genotypes
	gemmer = (hmdir + 'Programs/gemma0.94 -g ' + permfile + ' -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates.txt' + ' -lmm 4 -maf 0.05 -o ' + blocknum + '.' + pheno)
	ifier(gemmer)
def permer(perm):
	"""Write one permuted BIMBAM genotype file for the current gene and run GEMMA.

	Args:
		perm: permutation number; added to ``int(blocknum)*100`` to seed the
			random number generator.

	Side effects: shuffles the module-level ``randind`` list in place,
	overwrites ``<genodir>perms/<blocknum>_perm_curr.bimbam`` and passes the
	GEMMA command line to ``ifier``.

	Uses the module-level names ``blocknum``, ``randind``, ``genodir``,
	``masterdic``, ``gene``, ``y``, ``hmdir``, ``currfiles`` and ``ifier``.
	"""
	# First we permute the genotypes and write them out to a file (bimbam format)
	random.seed(perm + (int(blocknum)*100))
	shuffle(randind)
	# Positions 0-2 of every row stay where they are; the remaining positions
	# are taken in the shuffled order.
	updateind = [0,1,2] + randind
	permpath = genodir + 'perms/' + blocknum + '_perm_curr.bimbam'
	# The context manager closes the file even if a row fails to serialise.
	with open(permpath,'w') as permbimbam:
		for snp in masterdic[gene]:
			row = y[snp]
			# Element-wise lookup works for plain lists as well as numpy arrays
			# (indexing a list with a list raises TypeError).
			yrand = [row[index] for index in updateind]
			print >> permbimbam, ", ".join(yrand)
	# This runs gemma on the permuted genotypes
	gemmer = (hmdir + 'Programs/gemma0.94 -g ' + permpath + ' -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covs.txt' + ' -lmm 4 -maf 0.05 -o perm_curr_' + blocknum)
	ifier(gemmer)
Exemplo n.º 3
0
def permer(gene):
	winnerperms = 0
	for perm in xrange(0,10000):
		shuffle(randind)
		updateind = [0,1,2] + randind
		permgenos = [", ".join([genodic[x][index] for index in updateind]) for x in masterdic[gene]]
		currbimbam = open(genodir + 'perm_curr_' + chrm + '_pc' + str(pcs) + '.bimbam','w')
		print >> currbimbam, "\n".join(permgenos)
		currbimbam.close()
		print 'Gene No. ' + str(len(winnerdic.keys()) + 1) + ' on chrm.'
		print str(winnerperms) + ' of ' + str(perm) + ' permutations lost.'
		gemmer = (hmdir + 'Programs/gemma0.94 -g ' + genodir + 'perm_curr_' + chrm + '_pc' + str(pcs) + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.pcs.txt' + ' -lmm 4 -maf 0.05 -o perm_curr_' + chrm + '_pc' + str(pcs))
		ifier(gemmer)
		permering = open(genodir + 'output/perm_curr_' + chrm + '_pc' + str(pcs) + '.assoc.txt','r')
		permers = [x.strip().split()[12] for x in permering.readlines()]
		permers = [float(x) for x in permers if x != 'nan' and x != 'p_lrt']
		permering.close()
		permlow = min(permers)
		if permlow <= pmin:
			winnerperms += 1
		if winnerperms == 10:
			return 11/uniform(perm+2,perm+3)
	return (winnerperms + 1)/float(10001)
Exemplo n.º 4
0
#raws = numpy.loadtxt(genodir + 'ByChr/hutt.imputed.chr' + chrm + '.raw',dtype='str')

print "Transposing genotypes..."
sys.stdout.flush()
genorfile = open(genor,'w')
for line in range(6,raws.shape[1]):
	print >> genorfile, "\t".join(list(raws[1:raws.shape[0],line]))

genorfile.close()

#traws = numpy.transpose(raws)
#trawsu = traws[6:traws.shape[0],1:traws.shape[1]]
#numpy.savetxt(genor,trawsu,delimiter="\t",fmt='%s')

print "Annotating SNPs..."
sys.stdout.flush()
traws1 = numpy.column_stack(([0]*len(snpids),['.']*len(snpids)))
traws2 = numpy.column_stack((snpids,traws1))
traws3 = numpy.column_stack((snppos,traws2))
traws4 = numpy.column_stack(([str(int(x)-1) for x in snppos],traws3))
traws5 = numpy.column_stack((['chr' + str(chrm)]*len(snpids),traws4))
numpy.savetxt(anoter,traws5,delimiter="\t",fmt='%s')

print "Finalizing files..."
sys.stdout.flush()
paster = '/bin/bash -c "paste <(cat ' + anoter + ') <(cat ' + genor + ') > ' + genodir + 'ByChr/' + outname + '.chr' + chrm + '.txt; rm ' + anoter + '; rm ' + genor + '"'
ifier(paster)

doner = open(genodir + 'ByChr/chr' + chrm + '.done','w')
doner.close()
Exemplo n.º 5
0
     chrm = numpy.array(chrm)
     if x == 0:
         genecounting = numpy.column_stack((numpy.array(chrm)[:,numpy.newaxis],numpy.array(start)[:,numpy.newaxis],
                                            numpy.array(end)[:,numpy.newaxis],numpy.array(gene)[:,numpy.newaxis],
                                            numpy.array(length)[:,numpy.newaxis],numpy.array(counts)[:,numpy.newaxis],))
     else:
         if z == 0:
             genecounting = numpy.array(counts)[:,numpy.newaxis]
             z = 1
         else:
             genecounting = numpy.column_stack((genecounting,numpy.array(counts)[:,numpy.newaxis]))
     x = 1
 #Code to remove corrupted exoncounts files and regenerate...
 except IOError:
     print 'Fixing ' + samp + '...'
     ifier('rm ' + sample + '.exoncounts.txt')
     cleanup = '/mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.quality.merged.bam ' + sample + '.quality.sort.bam ' + sample + '.saved.quality.sort.bam; \
                /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/bamToBed -i ' + sample + '.quality.merged.bam > ' + sample + '.bed; \
                /mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.junction.quality.merged.bam ' + sample + '.junction.quality.sort.bam ' + sample + '.saved.junction.quality.sort.bam; \
                /mnt/lustre/home/cusanovich/Programs/samtools/samtools view ' + sample + '.junction.quality.merged.bam > ' + sample + '.junction.quality.merged.sam; \
                python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/junctionreformatter.py ' + sample + '.junction.quality.merged.sam ' + sample + '.junction.bed; \
                cat ' + sample + '.junction.bed >> ' + sample + '.bed; \
                /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/coverageBed -a ' + sample + '.bed -b /mnt/lustre/data/users/cusanovich/References/hg19ProteinCodingEnsemblExonsMergedNonoverlapping.bed > ' + sample + '.exoncounts.txt; \
                python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/exoncombiner.py ' + sample + '.exoncounts.txt ' + sample + '.genecounts.txt; \
                rm ' + sample + '.quality.merged.bam; \
                rm ' + sample + '.junction.quality.merged.bam; \
                rm ' + sample + '.junction.quality.merged.sam; \
                rm ' + sample + '.junction.bed; \
                python 500HT/Scripts/RNAseq/exonmatrixmaker.py;'
     ifier(cleanup)
     print samp + ' fixed.'
Exemplo n.º 6
0
#step 2 - chrom raws
#step 3 - chrom bim (snpid + position)
#step 4 - load raw
#step 5 - transpose raw
#step 6 - add chrom, start, end, snpid, "0", "."
#step 7 - save table
#step 8 - compress with bgzip
#step 9 - index with tabix

genodir = '/mnt/lustre/home/cusanovich/500HT/Imputed1415/'
outname = 'hutt.all.imputed'
print 'Creating raw files...'
for j in range(1,23):
	#plinker = 'echo "plink --noweb --nonfounders --maf 0.05 --geno 0.05 --bfile ' + genodir + 'imputed_cgi --chr ' + str(j) + ' --make-bed --out ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + '; plink --bfile ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + ' --recodeA --out ' + genodir + 'ByChr/hutt.imputed.chr' + str(j) + '; touch ' + genodir + 'ByChr/chr' + str(j) + '.done" | qsub -l h_vmem=2g -o ~/dump/ -e ~/dump/'
	plinker = 'echo "plink --noweb --nonfounders --bfile ' + genodir + 'hutt.imputed.rename --chr ' + str(j) + ' --make-bed --out ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + '; plink --bfile ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + ' --recodeA --out ' + genodir + 'ByChr/' + outname + '.chr' + str(j) + '; touch ' + genodir + 'ByChr/chr' + str(j) + '.done" | qsub -l h_vmem=2g -o ~/dump/ -e ~/dump/'
	ifier(plinker)

while len(glob.glob(genodir + 'ByChr/*.done')) < 22:
	time.sleep(5)

cleanup = "rm " + genodir + "ByChr/*.done"
ifier(cleanup)

print 'Creating bed files...'
for j in range(1,23):
	converter = 'echo "python /mnt/lustre/home/cusanovich/500HT/Scripts/raw2txt.py ' + str(j) + ' ' + genodir + ' ' + outname + '" | qsub -l h_vmem=8g -o ~/dump/ -e ~/dump/'
	ifier(converter)

# Poll until at least 22 '*.done' marker files exist under <genodir>ByChr/,
# sleeping 5 seconds between checks.
# NOTE(review): everything after the sleep reads `chrm`, `snp`, `masterdic`,
# `genoinds`, `y` and `currbimbam`, none of which is assigned inside this loop.
# It looks like the body of a separate per-SNP loop that ended up here --
# confirm against the original script before relying on it.
while len(glob.glob(genodir + 'ByChr/*.done')) < 22:
	time.sleep(5)
	# Open the tabix-indexed genotype file for `chrm`.
	tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.all.imputed.' + chrm + '.txt.gz')
	# Fetch the records overlapping [masterdic[snp][1]-1, masterdic[snp][2]) and
	# keep only the first one, split on tabs.
	tempgenos = [x.split('\t') for x in tabixer.fetch(chrm,int(masterdic[snp][1])-1,int(masterdic[snp][2]))][0]
	# Keep the first six columns plus the columns listed in `genoinds`.
	genos = [tempgenos[x] for x in range(0,6) + genoinds]
	tabixer.close()
	# Row layout: column 3 of the record, the literal alleles 'A' and 'G', then
	# every selected column after the first six.
	y[snp] = [genos[3], 'A', 'G'] + genos[6:]
	# NOTE(review): this joins `y` itself (iterating the container), not
	# `y[snp]` -- verify that this is intended.
	print >> currbimbam, ", ".join(y)

#t1 = time.time()
#print t1-t0
currbimbam.close()

#genomat = matrix_reader(genodir + 'hutt.imputed.dhssnps.bimbam',sep=",")
print "Running GEMMA..."
gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates -lmm 4 -maf 0.05 -o curr_' + pheno)
t0 = time.time()
ifier(gemmer)
t1 = time.time()
print t1-t0
#currresults = open(genodir + 'output/curr_' + pheno + '.assoc.txt','r')
currresults = matrix_reader(genodir + 'output/curr_' + pheno + '.assoc.txt',dtype='f8')
currsort = curresults[currresults[:,12].argsort()]
currwins = currsort[0:100,]
currscores = [0]*len(dhsdic[dhsdic.keys()[0]])
for snp in currwins[:,1]:
	currscores = currscores + dhsdic[snp]
currperms = [0]*len(dhsdic[dhsdic.keys()[0]])
currpermwins = [0]*len(dhsdic[dhsdic.keys()[0]])
curractive = [0]*len(dhsdic[dhsdic.keys()[0]])

print "Running permutations..."
for perm in xrange(0,100):
Exemplo n.º 8
0
		masterdic[mastercols[i,0]].append(mastercols[i,1])
	except KeyError:
		masterdic[mastercols[i,0]] = [mastercols[i,1]]
		exprcoldic[mastercols[i,0]] = mastercols[i,2]
		chrmdic[mastercols[i,0]] = mastercols[i,4]

####Build a dictionary to reference the genomic coordinates of each SNP
print "Loading SNP annotations..."
snpdic = {}
snpbed = open('/mnt/lustre/home/cusanovich/500HT/hutt.imputed.coord.bed','r')
for line in snpbed:
	liner = line.strip().split()
	snpdic[liner[3]] = liner[0:3]

cover = ('cp ' + hmdir + '500HT/addSNP.500ht.ordered.square.txt ' + currfiles + '.square.txt')
ifier(cover)

if regressPCs:
	expressed = matrix_reader(hmdir + '500HT/qqnorm.500ht' + gccor + covcor + '.ordered.bimbam',dtype='float')
	pcmat = matrix_reader(hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.pc' + str(pcs),dtype='float')

if not regressPCs:
	cover = ('cp ' + hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.pc' + str(pcs) + ' ' + currfiles + '.pcs.txt')
	ifier(cover)

# Set up the matrices for regressing the PCs out of the expression data.
# Runs only when PC regression is requested and a non-zero number of PCs is
# configured.
if regressPCs and int(pcs) != 0:
	# mod1 is the full PC matrix and mod2 its transpose.
	mod1 = pcmat
	mod2 = mod1.T
	# Y is the transposed expression matrix.
	Y = expressed.T
	# W holds columns 1..pcs of pcmat; column 0 is excluded (presumably an
	# intercept column -- TODO confirm).
	W = pcmat[:,1:(int(pcs)+1)]
	# mods = pcmat^T . pcmat
	mods = mod2.dot(mod1)
Exemplo n.º 9
0
                        numpy.array(length)[:, numpy.newaxis],
                        numpy.array(counts)[:, numpy.newaxis],
                    ))
                else:
                    if z == 0:
                        genecounting = numpy.array(counts)[:, numpy.newaxis]
                        z = 1
                    else:
                        genecounting = numpy.column_stack(
                            (genecounting, numpy.array(counts)[:,
                                                               numpy.newaxis]))
                x = 1
            #Code to remove corrupted exoncounts files and regenerate...
            except IOError:
                print 'Fixing ' + samp + '...'
                ifier('rm ' + sample + '.exoncounts.txt')
                cleanup = '/mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.quality.merged.bam ' + sample + '.quality.sort.bam ' + sample + '.saved.quality.sort.bam; \
                           /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/bamToBed -i ' + sample + '.quality.merged.bam > ' + sample + '.bed; \
                           /mnt/lustre/home/cusanovich/Programs/samtools/samtools merge ' + sample + '.junction.quality.merged.bam ' + sample + '.junction.quality.sort.bam ' + sample + '.saved.junction.quality.sort.bam; \
                           /mnt/lustre/home/cusanovich/Programs/samtools/samtools view ' + sample + '.junction.quality.merged.bam > ' + sample + '.junction.quality.merged.sam; \
                           python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/junctionreformatter.py ' + sample + '.junction.quality.merged.sam ' + sample + '.junction.bed; \
                           cat ' + sample + '.junction.bed >> ' + sample + '.bed; \
                           /mnt/lustre/home/cusanovich/Programs/BEDTools/bin/coverageBed -a ' + sample + '.bed -b /mnt/lustre/data/users/cusanovich/References/hg19ProteinCodingEnsemblExonsMergedNonoverlapping.bed > ' + sample + '.exoncounts.txt; \
                           python /mnt/lustre/data/users/cusanovich/RNAseq_scripts/exoncombiner.py ' + sample + '.exoncounts.txt ' + sample + '.genecounts.txt; \
                           rm ' + sample + '.quality.merged.bam; \
                           rm ' + sample + '.junction.quality.merged.bam; \
                           rm ' + sample + '.junction.quality.merged.sam; \
                           rm ' + sample + '.junction.bed; \
                           python 500HT/Scripts/RNAseq/exonmatrixmaker.py;'

                ifier(cleanup)
Exemplo n.º 10
0
for snp in masterdic.keys():
#for snp in masterdic.keys()[0:1000]:
	print >> snplist, snp
	print >> alist, 'A'
	print >> glist, 'G'

snplist.close()
alist.close()
glist.close()

print "Running permutations..."
blocker = open(genodir + 'Block_' + blocknum + 'permwins.txt','w')
for perm in xrange(0,100):
	permer(perm,randind = genoinds)
	permresults = matrix_reader(genodir + 'output/perm_curr_' + blocknum + '.assoc.txt',dtype='f8')
	permsort = permresults[permresults[:,12].argsort()]
	permwins = permsort[0:100,]
	print >> blocker, '\t'.join(permwins)

blocker.close()
 
cleanup = 'rm ' + genodir + '*curr_' + pheno + '.*'
ifier(cleanup)

print "Writing results..."
aller = open('/mnt/lustre/home/cusanovich/500HT/Tissues/' + pheno + '.enrichmentps.txt','w')
for i in xrange(0,len(tissueps)):
	print >> aller, '{0}\t{2:.4g}'.format(dhsdic['rsID'][i],tissueps[i])

aller.close()
Exemplo n.º 11
0
#raws = numpy.loadtxt(genodir + 'ByChr/hutt.imputed.chr' + chrm + '.raw',dtype='str')

print "Transposing genotypes..."
sys.stdout.flush()
genorfile = open(genor, 'w')
for line in range(6, raws.shape[1]):
    print >> genorfile, "\t".join(list(raws[1:raws.shape[0], line]))

genorfile.close()

#traws = numpy.transpose(raws)
#trawsu = traws[6:traws.shape[0],1:traws.shape[1]]
#numpy.savetxt(genor,trawsu,delimiter="\t",fmt='%s')

print "Annotating SNPs..."
sys.stdout.flush()
traws1 = numpy.column_stack(([0] * len(snpids), ['.'] * len(snpids)))
traws2 = numpy.column_stack((snpids, traws1))
traws3 = numpy.column_stack((snppos, traws2))
traws4 = numpy.column_stack(([str(int(x) - 1) for x in snppos], traws3))
traws5 = numpy.column_stack((['chr' + str(chrm)] * len(snpids), traws4))
numpy.savetxt(anoter, traws5, delimiter="\t", fmt='%s')

print "Finalizing files..."
sys.stdout.flush()
paster = '/bin/bash -c "paste <(cat ' + anoter + ') <(cat ' + genor + ') > ' + genodir + 'ByChr/' + outname + '.chr' + chrm + '.txt; rm ' + anoter + '; rm ' + genor + '"'
ifier(paster)

doner = open(genodir + 'ByChr/chr' + chrm + '.done', 'w')
doner.close()