def main(args):
    """Run a meta-analysis of the top-ranked SNP pairs across the cohorts.

    Steps:
      1. From each cohort's ``<rDir>/<cohort>_<phenotype>.sortedpvals`` file,
         collect the SNP pairs whose last-column value does not exceed 1e-12.
      2. Keep only the pairs whose two SNP names are listed in the .bim file
         of every cohort.
      3. Call ``testPair`` on each remaining pair, once per cohort.
      4. Combine the per-cohort outputs into one z-score and one two-sided
         p-value per pair, weighting each estimate by the reciprocal of the
         second ``testPair`` output.
      5. Write the pairs, ordered by increasing p-value, to
         ``<rDir>/meta_logistic_<phenotype>.sortedpvals``.

    Args:
        args: command-line style list; ``args[0]`` is the program name and
            ``args[1]`` the intermediate phenotype name. With any other
            length the usage text is printed and the process exits with
            status 0.

    Relies on names defined outside this function: ``rDir``, ``dataDirRoot``,
    ``testPair``, ``sys``, ``np``, ``st`` and ``tables``.
    """
    usage = """python %s <intermediate phenotype>
Run a meta analysis of the top-ranked pairs for the different cohorts.
Save results in %s/meta_logistic_<intermediate phenotype>.sortedpvals.
""" % (args[0], rDir)
    if len(args) == 2:
        phenotypeName = args[1]
    else:
        print(usage)
        sys.exit(0)

    cohorts = ['Dutch_MA', 'Dutch_MO', 'Finnish_MA', 'German_MA']

    # Get the list of pairs to run
    threshold = 1e-12
    sigPairs = set()
    for cohort in cohorts:
        with open("%s/%s_%s.sortedpvals" % (rDir, cohort, phenotypeName)) as f:
            f.readline()  # skip the header line
            for line in f:
                ls = line.split()
                if not ls:
                    continue
                # Stop at the first value above the threshold; this only
                # finds every significant pair if the file is sorted by
                # increasing p-value (presumably so, given its name).
                if float(ls[-1]) > threshold:
                    break
                # Sort the two "name_chr_pos" ids so that a pair gets the
                # same key whichever SNP comes first in a cohort's file.
                snpIds = sorted(['%s_%s_%s' % (ls[0], ls[1], ls[2]),
                                 '%s_%s_%s' % (ls[3], ls[4], ls[5])])
                sigPairs.add("/".join(snpIds))

    # Only keep pairs of SNPs that appear in all studies
    allSnps = None  # None = no cohort read yet (an empty set is a valid result)
    for cohort in cohorts:
        bimPath = '%s/%s/%s_clean_%s.bim' % (dataDirRoot, cohort, cohort,
                                             phenotypeName)
        with open(bimPath) as f:
            cohortSnps = set(line.split()[1] for line in f if line.strip())
        if allSnps is None:
            allSnps = cohortSnps
        else:
            allSnps &= cohortSnps
    print("%d SNPs appear in all studies" % len(allSnps))

    # rsplit("_", 2) peels off the trailing chr and pos fields, so SNP names
    # that themselves contain underscores are still recovered intact.
    # Sorting makes the processing order reproducible between runs.
    sigPairsList = sorted(
        pair for pair in sigPairs
        if all(snpId.rsplit("_", 2)[0] in allSnps
               for snpId in pair.split("/")))
    print("%d pairs to run." % len(sigPairsList))

    # Test pairs for each study
    sigPairsDict = {pair: [] for pair in sigPairsList}
    for cohort in cohorts:
        print("Testing for %s" % cohort)
        dataDir = '%s/%s' % (dataDirRoot, cohort)
        phenoF = '%s/%s_clean_%s.phenoGlide' % (dataDir, cohort, phenotypeName)
        bimF = '%s/%s_clean_%s.bim' % (dataDir, cohort, phenotypeName)
        h5fname = '%s/%s_clean_%s.h5' % (dataDir, cohort, phenotypeName)

        pheno = np.loadtxt(phenoF)

        # Map "name_chr_pos" -> line number in the .bim file; that number is
        # used below as the row index into the HDF5 genotype array.
        snpsDict = {}
        with open(bimF) as f:
            for idx, line in enumerate(f):
                fields = line.split()
                snpsDict['%s_%s_%s' % (fields[1], fields[0], fields[3])] = idx

        # NOTE: tables.openFile is the PyTables 2.x spelling (open_file in
        # PyTables 3); kept as in the rest of this file.
        with tables.openFile(h5fname) as h5f:
            for pair in sigPairsList:
                snp1, snp2 = pair.split("/")
                # Entries equal to 3 are masked out (presumably the
                # missing-genotype code -- confirm against the data format).
                snp1x = np.ma.masked_values(
                    h5f.root.genotype[snpsDict[snp1]], 3)
                snp2x = np.ma.masked_values(
                    h5f.root.genotype[snpsDict[snp2]], 3)
                sigPairsDict[pair].append(testPair(snp1x, snp2x, pheno))

    # Combine into a meta-analysis
    results = []  # (pval, pair, zmeta)
    for pair, outputs in sigPairsDict.items():
        # testPair outputs are read as (beta, squared standard error, ...),
        # going by the original variable names -- confirm against testPair.
        # dtype=float so the 1e-10 replacements are not truncated to 0.
        betas = np.array([x[0] for x in outputs], dtype=float)
        sesqs = np.array([x[1] for x in outputs], dtype=float)
        betas[betas == 0] = 1e-10
        sesqs[sesqs <= 0] = 1e-10  # avoid division by zero / negative weights
        weights = 1. / sesqs
        betam = np.sum(betas * weights) / np.sum(weights)
        sem = np.sqrt(1. / np.sum(weights))
        zmeta = betam / sem
        # Two-sided p-value: use |z| so that negative z-scores do not yield
        # values above 1.
        pval = 2 * st.norm.sf(abs(zmeta))
        results.append((pval, pair, zmeta))

    # sort p-values and save (stable sort: ties keep their computation order)
    results.sort(key=lambda res: res[0])
    with open('%s/meta_logistic_%s.sortedpvals' % (rDir, phenotypeName),
              'w') as f:
        f.write(
            "SNP1 & chr & pos & SNP2 & chr & pos & meta z-score & meta pval \n"
        )
        for pval, pair, zmeta in results:
            snp1, snp2 = pair.split("/")
            f.write("%s & %s & %.2e & %.2e\n" %
                    (" & ".join(snp1.rsplit("_", 2)),
                     " & ".join(snp2.rsplit("_", 2)),
                     zmeta, pval))
def main(args):
    """Re-evaluate significant SNP pairs with confounders (glideIn input).

    Reads the SNP pairs listed in the significant-pairs file, looks both SNPs
    up in the SNP list, reads their data with ``readSnp`` and calls
    ``testPair`` with the phenotype and the confounders. Each input line is
    written to the output file with the string returned by ``testPair``
    appended, ordered by the p-value returned by ``testPair``.

    Args:
        args: command-line style list of exactly seven items:
            ``[program, glideIn, snps list, confounders, pheno,
            significant pairs, output]``. With any other length the usage
            text is written to stderr and the process exits with status 0.

    Exits with status -1 (after printing a message) when a SNP of a pair is
    not in the SNP list.

    Relies on names defined outside this function: ``readSnp``, ``testPair``,
    ``sys`` and ``np``.
    """
    # The synopsis lists confounders before pheno, which is the order the
    # arguments are actually read in (and the order used in the example).
    usage = """python %s <glideIn> <snps list> <confounders> <pheno> <significant pairs of SNPs> <reevaluated pairs of SNPs>
Rerun linear regression evaluation on the significant pairs of SNPs.
E.g.: py rerun_confounder.py mydata_final_clean.glideIn mydata_final_clean.snpNames mydata_samples.sex mydata_final_clean.pheno mydata_clean.sortedpvals mydata_final_clean.sortedpvals.rerun_sex\n""" % args[0]
    if len(args) != 7:
        sys.stderr.write(usage)
        sys.exit(0)

    glideInF, snpsLstF, cfdrsF, phenoF, sigPairF, outputF = args[1:7]

    pheno = np.loadtxt(phenoF)
    confounders = np.loadtxt(cfdrsF)

    # SNP name (first column) -> line number in the SNP list
    with open(snpsLstF) as f:
        snpsDict = {line.split()[0]: idx for (idx, line) in enumerate(f)}

    results = []  # (pvalue, line to write)
    with open(sigPairF) as f:
        f.readline()  # skip the header line
        for line in f:
            ls = line.split()
            if not ls:
                continue
            # The header written below labels the copied columns as
            # SNP1, chr1, pos1, SNP2, chr2, pos2, so the SNP names sit in
            # columns 0 and 3.
            for col in (0, 3):
                if ls[col] not in snpsDict:
                    print("Didn't find %s in SNPs list" % ls[col])
                    sys.exit(-1)
            snp1x = readSnp(snpsDict[ls[0]], glideInF)
            snp2x = readSnp(snpsDict[ls[3]], glideInF)
            pval, pvalStr = testPair(snp1x, snp2x, pheno, confounders)
            results.append((pval, "%s\t%s\n" % ("\t".join(ls), pvalStr)))

    # Stable sort: lines with equal p-values keep their input order.
    results.sort(key=lambda res: res[0])

    with open(outputF, 'w') as g:
        g.write("SNP1\tchr1\tpos1\tSNP2\tchr2\tpos2\t")
        g.write(
            "t-testGLIDE\tpvalGLIDE(intercept)\tpvalGLIDE(x1)\tpvalGLIDE(x2)\tpvalGLIDE(x1:x2)"
        )
        g.write(
            "\tpval(intercept)\tpval(x1)\tpval(x2)\tpval(confounder(s))\tpval(x1:x2)\n"
        )
        for _, line_to_write in results:
            g.write(line_to_write)
def main(args):
    """Re-evaluate significant SNP pairs with confounders (HDF5/.bim input).

    Reads the SNP pairs listed in the significant-pairs file, looks both SNPs
    up in the .bim file by (name, chromosome, position), reads the matching
    rows of the HDF5 ``genotype`` array and calls ``testPair`` with the
    phenotype and the confounders. Each input line is written to the output
    file with the string returned by ``testPair`` appended, ordered by the
    p-value returned by ``testPair``.

    Args:
        args: command-line style list of exactly seven items:
            ``[program, h5 file, bim file, confounders, pheno,
            significant pairs, output]``. With any other length the usage
            text is written to stderr and the process exits with status 0.

    Exits with status -1 (after printing a message) when a SNP of a pair is
    not in the .bim file.

    Relies on names defined outside this function: ``testPair``, ``sys``,
    ``np`` and ``tables``.
    """
    # The synopsis names the arguments in the order they are actually read
    # (which is also the order used in the example).
    usage = """python %s <h5 file> <bim file> <confounders> <pheno> <significant pairs of SNPs> <reevaluated pairs of SNPs>
Rerun linear regression evaluation on the significant pairs of SNPs.
E.g.: py rerun_confounder.py mydata_final_clean.h5 mydata_final_clean.bim mydata_samples.sex mydata_final_clean.phenoGlide mydata_final_clean.sortedpvals mydata_final_clean.sortedpvals.rerun_sex\n""" % args[0]
    if len(args) != 7:
        sys.stderr.write(usage)
        sys.exit(0)

    h5fname, bimF, cfdrsF, phenoF, sigPairF, outputF = args[1:7]

    pheno = np.loadtxt(phenoF)
    confounders = np.loadtxt(cfdrsF)

    # (name, chr, pos) -> line number in the .bim file; that number is used
    # below as the row index into the HDF5 genotype array.
    snpsDict = {}
    with open(bimF) as f:
        for idx, line in enumerate(f):
            fields = line.split()
            snpsDict[(fields[1], fields[0], fields[3])] = idx

    results = []  # (pvalue, line to write)
    # NOTE: tables.openFile is the PyTables 2.x spelling (open_file in
    # PyTables 3); kept as in the rest of this file.
    with tables.openFile(h5fname) as h5f, open(sigPairF) as f:
        print("%d SNPs x %d samples" % (h5f.root.genotype.shape[0],
                                        h5f.root.genotype.shape[1]))
        f.readline()  # skip the header line
        for line in f:
            ls = line.split()
            if not ls:
                continue
            snp1key = tuple(ls[0:3])
            snp2key = tuple(ls[3:6])
            # Only the lookups are checked here, so an unrelated KeyError
            # raised inside testPair is no longer reported as a missing SNP.
            if snp1key not in snpsDict:
                print("Didn't find %s in .bim file" % ls[0])
                sys.exit(-1)
            if snp2key not in snpsDict:
                print("Didn't find %s in .bim file" % ls[3])
                sys.exit(-1)
            # Entries equal to 3 are masked out (presumably the
            # missing-genotype code -- confirm against the data format).
            snp1x = np.ma.masked_values(
                h5f.root.genotype[snpsDict[snp1key]], 3)
            snp2x = np.ma.masked_values(
                h5f.root.genotype[snpsDict[snp2key]], 3)
            pval, pvalStr = testPair(snp1x, snp2x, pheno, confounders)
            results.append((pval, "%s\t%s\n" % ("\t".join(ls), pvalStr)))

    # Stable sort: lines with equal p-values keep their input order.
    results.sort(key=lambda res: res[0])

    with open(outputF, 'w') as g:
        g.write("SNP1\tchr1\tpos1\tSNP2\tchr2\tpos2\t")
        g.write(
            "t-testGLIDE\tpvalGLIDE(intercept)\tpvalGLIDE(x1)\tpvalGLIDE(x2)\tpvalGLIDE(x1:x2)"
        )
        g.write(
            "\tpval(intercept)\tpval(x1)\tpval(x2)\tpval(confounder(s))\tpval(x1:x2)\n"
        )
        for _, line_to_write in results:
            g.write(line_to_write)