Example #1
import sys

import numpy as np
import scipy.stats as st
import tables  # PyTables (this script uses the 2.x API: tables.openFile)


# Assumes the module-level globals rDir and dataDirRoot, and a helper
# testPair(snp1x, snp2x, pheno), all defined elsewhere in the script.
def main(args):
    usage = """python %s <intermediate phenotye>
    Run a meta analysis of the top-ranked pairs for the different cohorts.
    Save results in %s/meta_<intermediate phenotype>.sortedpvals.
    """ % (args[0], rDir)

    if len(args) == 2:
        phenotypeName = args[1]
    else:
        print usage
        sys.exit(0)

    # Get the list of pairs to run
    threshold = 1e-12
    sigPairsList = set()  # pairs below the p-value threshold in at least one cohort
    for cohort in ['Dutch_MA', 'Dutch_MO', 'Finnish_MA', 'German_MA']:
        with open("%s/%s_%s.sortedpvals" % (rDir, cohort, phenotypeName)) as f:
            f.readline()  # skip the header line
            for line in f:
                ls = line.split()
                # the file is sorted by p-value, so stop at the first pair
                # whose p-value (last column) exceeds the threshold
                if float(ls[-1]) > threshold:
                    break
                pair = [
                    '%s_%s_%s' % (ls[0], ls[1], ls[2]),
                    '%s_%s_%s' % (ls[3], ls[4], ls[5])
                ]
                pair.sort()
                pair = "/".join(pair)
                sigPairsList.add(pair)
    sigPairsList = list(sigPairsList)

    # Only keep pairs of SNPs that appear in all studies
    allSnps = None
    for cohort in ['Dutch_MA', 'Dutch_MO', 'Finnish_MA', 'German_MA']:
        f = open('%s/%s/%s_clean_%s.bim' %
                 (dataDirRoot, cohort, cohort, phenotypeName))
        if allSnps is None:
            allSnps = set([line.split()[1] for line in f])
        else:
            allSnps = allSnps.intersection(set([line.split()[1]
                                                for line in f]))
        f.close()

    print len(allSnps), "SNPs appear in all studies"

    sigPairsList = [x for x in sigPairsList if \
                    ((x.split("/")[0].split("_")[0] in allSnps) \
                     and (x.split("/")[1].split("_")[0] in allSnps))]
    print len(sigPairsList), "pairs to run."

    # Test pairs for each study
    sigPairsDict = {pair: [] for pair in sigPairsList}
    for cohort in ['Dutch_MA', 'Dutch_MO', 'Finnish_MA', 'German_MA']:
        print "Testing for %s" % cohort
        dataDir = '%s/%s' % (dataDirRoot, cohort)
        phenoF = '%s/%s_clean_%s.phenoGlide' % (dataDir, cohort, phenotypeName)
        bimF = '%s/%s_clean_%s.bim' % (dataDir, cohort, phenotypeName)
        h5fname = '%s/%s_clean_%s.h5' % (dataDir, cohort, phenotypeName)

        pheno = np.loadtxt(phenoF)

        with open(bimF) as f:
            snpsDict = {('%s_%s_%s' % \
                         (line.split()[1], line.split()[0], line.split()[3])):idx \
                        for (idx, line) in enumerate(f)}

        with tables.openFile(h5fname) as h5f:
            for pair in sigPairsList:
                pairSplit = pair.split("/")
                snp1idx = snpsDict[pairSplit[0]]
                snp2idx = snpsDict[pairSplit[1]]
                # mask genotypes coded as 3 (missing calls)
                snp1x = np.ma.masked_values(h5f.root.genotype[snp1idx], 3)
                snp2x = np.ma.masked_values(h5f.root.genotype[snp2idx], 3)
                sigPairsDict[pair].append(testPair(snp1x, snp2x, pheno))

    # Combine into a meta-analysis
    pvalsDict = {}
    for pair, outputs in sigPairsDict.iteritems():
        betas = np.array([x[0] for x in outputs])
        sesqs = np.array([x[1] for x in outputs])

        # avoid exact-zero effect sizes and non-positive variances before weighting
        betas[np.where(betas == 0)] = 1e-10
        sesqs[np.where(sesqs <= 0)] = 1e-10
        sesqinvs = 1. / sesqs

        # fixed-effect (inverse-variance weighted) combination across cohorts
        betam = np.sum(betas * sesqinvs) / np.sum(sesqinvs)
        sesqm = np.sqrt(1. / np.sum(sesqinvs))
        zmeta = betam / sesqm
        pval = 2 * st.norm.sf(np.abs(zmeta))  # two-sided p-value

        try:
            pvalsDict[pval].append([pair, zmeta])
        except KeyError:
            pvalsDict[pval] = [[pair, zmeta]]

    # sort p-values and save
    pvals = pvalsDict.keys()
    pvals.sort()
    with open('%s/meta_logistic_%s.sortedpvals' % (rDir, phenotypeName),
              'w') as f:
        f.write(
            "SNP1 & chr & pos & SNP2 & chr & pos & meta z-score & meta pval \n"
        )
        for pval in pvals:
            for [pair, zmeta] in pvalsDict[pval]:
                [snp1idx, snp2idx] = pair.split("/")
                snp1 = snp1idx
                snp2 = snp2idx
                f.write("%s & %s & %.2e & %.2e\n" % \
                        (" & ".join(snp1.split("_")),
                         " & ".join(snp2.split("_")),
                         zmeta, pval))
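The per-pair combination above is a standard fixed-effect (inverse-variance weighted) meta-analysis: each cohort's effect size is weighted by the inverse of its variance, and the pooled estimate is converted into a z-score and a two-sided p-value. A minimal, self-contained sketch of that calculation; the function name and the numbers are illustrative only, not part of the original script:

import numpy as np
import scipy.stats as st


def inverse_variance_meta(betas, sesqs):
    """Fixed-effect meta-analysis of per-cohort effect sizes.

    betas -- per-cohort regression coefficients
    sesqs -- corresponding squared standard errors (variances)
    Returns the meta z-score and its two-sided p-value.
    """
    betas = np.asarray(betas, dtype=float)
    weights = 1. / np.asarray(sesqs, dtype=float)  # inverse-variance weights
    beta_meta = np.sum(betas * weights) / np.sum(weights)
    se_meta = np.sqrt(1. / np.sum(weights))
    z_meta = beta_meta / se_meta
    return z_meta, 2 * st.norm.sf(abs(z_meta))     # two-sided p-value


# Illustrative values for four cohorts, not real data
print(inverse_variance_meta([0.12, 0.08, 0.15, 0.10], [0.002, 0.003, 0.004, 0.002]))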
Example #2
import sys

import numpy as np


# Assumes helpers readSnp(snpIdx, glideInF) and
# testPair(snp1x, snp2x, pheno, confounders) defined elsewhere in the script.
def main(args):
    usage = """python %s <glideIn> <snps list> <pheno> <confounders> <significant pairs of SNPs> <reevaluated pairs of SNPs>
    Rerun linear regression evaluation on the significant pairs of SNPs.
    E.g.: py rerun_confounder.py mydata_final_clean.glideIn mydata_final_clean.snpNames mydata_samples.sex mydata_final_clean.pheno nydata_clean.sortedpvals mydata_final_clean.sortedpvals.rerun_sex\n""" % args[
        0]
    if len(args) != 7:
        sys.stderr.write(usage)
        sys.exit(0)

    glideInF = args[1]
    snpsLstF = args[2]
    cfdrsF = args[3]
    phenoF = args[4]
    sigPairF = args[5]
    outputF = args[6]

    threshold = 1e-11

    pheno = np.loadtxt(phenoF)
    confounders = np.loadtxt(cfdrsF)

    f = open(snpsLstF)
    snpsDict = {line.split()[0]: idx for (idx, line) in enumerate(f)}
    f.close()

    newPvalsDict = {}  # pvalue, line to write
    with open(sigPairF) as f:
        f.readline()
        for i, line in enumerate(f):
            ls = line.split()
            try:
                snp1idx = snpsDict[ls[0]]
                try:
                    snp2idx = snpsDict[ls[1]]
                    snp1x = readSnp(snp1idx, glideInF)
                    snp2x = readSnp(snp2idx, glideInF)

                    pval, pvalStr = testPair(snp1x, snp2x, pheno, confounders)
                    line_to_write = "%s\t%s\n" % ("\t".join(ls), pvalStr)

                    if pval not in newPvalsDict:
                        newPvalsDict[pval] = [line_to_write]
                    else:
                        newPvalsDict[pval].append(line_to_write)

                except KeyError:
                    print "Didn't find %s in SNPs list" % ls[1]
                    sys.exit(-1)
            except KeyError:
                print "Didn't find %s in SNPs list" % ls[0]
                sys.exit(-1)

    with open(outputF, 'w') as g:
        g.write("SNP1\tchr1\tpos1\tSNP2\tchr2\tpos2\t")
        g.write(
            "t-testGLIDE\tpvalGLIDE(intercept)\tpvalGLIDE(x1)\tpvalGLIDE(x2)\tpvalGLIDE(x1:x2)"
        )
        g.write(
            "\tpval(intercept)\tpval(x1)\tpval(x2)\tpval(confounder(s)\tpval(x1:x2)\n"
        )

        sortedPvals = newPvalsDict.keys()
        sortedPvals.sort()
        for pval in sortedPvals:
            for line_to_write in newPvalsDict[pval]:
                g.write(line_to_write)
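The helper readSnp() used above is defined elsewhere in the original script. As a rough sketch only, assuming the .glideIn file stores one SNP per line as whitespace-separated genotype codes (SNPs as rows, samples as columns) with missing calls coded as 3 as in the HDF5-based variant below, it could look like this:

import numpy as np


def readSnp(snpIdx, glideInF):
    """Hypothetical reader: genotypes of the snpIdx-th SNP (0-based) from a
    text glideIn file, with missing calls (assumed to be coded as 3) masked."""
    with open(glideInF) as f:
        for idx, line in enumerate(f):
            if idx == snpIdx:
                return np.ma.masked_values(np.array(line.split(), dtype=float), 3)
    raise IndexError("SNP index %d not found in %s" % (snpIdx, glideInF))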
import sys

import numpy as np
import tables  # PyTables (2.x API: tables.openFile)


# Assumes a helper testPair(snp1x, snp2x, pheno, confounders) defined elsewhere.
def main(args):
    usage = """python %s <glideIn> <snps list> <pheno> <confounders> <significant pairs of SNPs> <reevaluated pairs of SNPs>
    Rerun linear regression evaluation on the significant pairs of SNPs.
    E.g.: py rerun_confounder.py mydata_final_clean.h5 mydata_final_clean.bim mydata_samples.sex mydata_final_clean.phenoGlide mydata_final_clean.sortedpvals mydata_final_clean.sortedpvals.rerun_sex\n""" % args[
        0]
    if len(args) != 7:
        sys.stderr.write(usage)
        sys.exit(0)

    h5fname = args[1]
    bimF = args[2]
    cfdrsF = args[3]
    phenoF = args[4]
    sigPairF = args[5]
    outputF = args[6]

    threshold = 1e-11

    pheno = np.loadtxt(phenoF)
    confounders = np.loadtxt(cfdrsF)

    with open(bimF) as f:
        snpsDict = {('%s\t%s\t%s' % \
                     (line.split()[1], line.split()[0], line.split()[3])):idx \
                    for (idx, line) in enumerate(f)}

    newPvalsDict = {}  # pvalue, line to write
    with tables.openFile(h5fname) as h5f, open(sigPairF) as f:
        print "%d SNPs x %d samples" % (h5f.root.genotype.shape[0],
                                        h5f.root.genotype.shape[1])
        f.readline()
        for i, line in enumerate(f):
            ls = line.split()
            try:
                snp1idx = snpsDict['%s\t%s\t%s' % (ls[0], ls[1], ls[2])]
                try:
                    snp2idx = snpsDict['%s\t%s\t%s' % (ls[3], ls[4], ls[5])]
                    snp1x = np.ma.masked_values(h5f.root.genotype[snp1idx], 3)
                    snp2x = np.ma.masked_values(h5f.root.genotype[snp2idx], 3)

                    pval, pvalStr = testPair(snp1x, snp2x, pheno, confounders)
                    line_to_write = "%s\t%s\n" % ("\t".join(ls), pvalStr)

                    if pval not in newPvalsDict:
                        newPvalsDict[pval] = [line_to_write]
                    else:
                        newPvalsDict[pval].append(line_to_write)

                except KeyError:
                    print "Didn't find %s in .bim file" % ls[3]
                    sys.exit(-1)
            except KeyError:
                print "Didn't find %s in .bim file" % ls[0]
                sys.exit(-1)

    with open(outputF, 'w') as g:
        g.write("SNP1\tchr1\tpos1\tSNP2\tchr2\tpos2\t")
        g.write(
            "t-testGLIDE\tpvalGLIDE(intercept)\tpvalGLIDE(x1)\tpvalGLIDE(x2)\tpvalGLIDE(x1:x2)"
        )
        g.write(
            "\tpval(intercept)\tpval(x1)\tpval(x2)\tpval(confounder(s))\tpval(x1:x2)\n"
        )

        sortedPvals = newPvalsDict.keys()
        sortedPvals.sort()
        for pval in sortedPvals:
            for line_to_write in newPvalsDict[pval]:
                g.write(line_to_write)
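testPair() itself is defined elsewhere in the original scripts. As an illustration of the model being re-evaluated (the phenotype regressed on SNP1, SNP2, the confounder(s) and the SNP1:SNP2 interaction), here is a minimal sketch using statsmodels, which the original does not necessarily use; the return values and column order are assumptions chosen to match the output header above, not the original implementation:

import numpy as np
import statsmodels.api as sm


def testPair(snp1x, snp2x, pheno, confounders):
    """Illustrative version: regress the phenotype on SNP1, SNP2, the
    confounder(s) and the SNP1:SNP2 interaction; return the interaction
    p-value and a tab-separated string of all p-values."""
    # keep only samples with a genotype call for both SNPs
    keep = ~np.ma.getmaskarray(snp1x) & ~np.ma.getmaskarray(snp2x)
    x1 = np.asarray(snp1x)[keep].astype(float)
    x2 = np.asarray(snp2x)[keep].astype(float)
    cfd = np.atleast_2d(np.asarray(confounders, dtype=float))
    if cfd.shape[0] == 1:              # a single confounder given as a 1-D vector
        cfd = cfd.T
    cfd = cfd[keep]
    X = sm.add_constant(np.column_stack([x1, x2, cfd, x1 * x2]))
    res = sm.OLS(np.asarray(pheno, dtype=float)[keep], X).fit()
    pvals = res.pvalues                # intercept, x1, x2, confounder(s), x1:x2
    return pvals[-1], "\t".join("%.3e" % p for p in pvals)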