예제 #1
0
def main(args):
    """Simulate genotypes and a phenotype, write GRM files, and fit AI-REML.

    Steps performed:
      1. Draw an ``n x m`` genotype matrix from Binomial(2, f) and save it
         with ``np.save`` to ``--output``.
      2. Standardize the polymorphic SNPs, sample per-SNP effects, and build
         a standardized phenotype with the requested heritability.
      3. Write ``<prefix>.phen``, ``<prefix>.grm.bin``, ``<prefix>.grm.N.bin``
         and ``<prefix>.grm.id``.
      4. Run ``reml.aiREML`` on the GRM and phenotype and write a variance
         component table to ``--output2``.

    Args:
        args: Command-line argument list (without the program name).

    Returns:
        0 on success.

    Raises:
        SystemExit: Via ``argp.error`` when ``h2`` is outside ``(0, 1]`` or
            when no simulated SNP is polymorphic.
    """
    argp = ap.ArgumentParser(description="Generate a random genotype")
    argp.add_argument("n", type=int, help="Pop number.")
    argp.add_argument("m", type=int, help="Genome length.")
    argp.add_argument("f", type=float, help="Ref allele freq")
    argp.add_argument("h2", type=float, help="Narrow-sense Heritability")
    argp.add_argument("prefix", help="Output prefix")
    # np.save emits bytes, so the matrix sink has to be opened in binary mode.
    argp.add_argument("-o",
                      "--output",
                      type=ap.FileType("wb"),
                      default=sys.stdout,
                      help="Where to write numpy matrix")
    argp.add_argument("-o2",
                      "--output2",
                      type=ap.FileType("w"),
                      default=sys.stdout)

    args = argp.parse_args(args)

    # h2 <= 0 would divide by zero below; h2 > 1 would give a negative
    # residual variance and a math domain error in sqrt.
    if not 0.0 < args.h2 <= 1.0:
        argp.error("h2 must be in the interval (0, 1]")

    geno = np.random.binomial(2, args.f, size=(args.n, args.m))
    n, m = geno.shape

    # A text stream such as sys.stdout exposes its byte stream as `.buffer`;
    # handles that are already binary are used as they are.
    np.save(getattr(args.output, "buffer", args.output), geno)

    geno = geno.astype(float)
    f = np.mean(geno, axis=0) / 2
    v = np.var(geno, axis=0)

    # Monomorphic SNPs have zero variance; standardizing them is 0/0 and the
    # resulting NaNs would contaminate the phenotype and the whole GRM.
    poly = v > 0
    if not poly.any():
        argp.error("no polymorphic SNPs were simulated; increase n or m, "
                   "or move f away from 0 and 1")

    # standardize the genotype using the sample variance of each SNP
    z = (geno[:, poly] - (2 * f[poly])) / np.sqrt(v[poly])
    m_used = z.shape[1]

    # sample effects
    betas = np.random.normal(0, math.sqrt(args.h2 / float(m_used)), m_used)

    # genetic values, and the noise variance implied by h2
    gv = z.dot(betas)
    s2g = np.var(gv)
    s2e = s2g * ((1.0 / args.h2) - 1)

    # create phenotypes
    e = np.random.normal(0, math.sqrt(s2e), n)
    y = gv + e

    # standardize
    y = (y - np.mean(y)) / np.std(y)

    # Text-mode files translate "\n" themselves; writing os.linesep would
    # produce "\r\r\n" on Windows.
    with open("{}.phen".format(args.prefix), "w") as phenfile:
        for idx, p in enumerate(y):
            phenfile.write("FID{0} IID{0} {1}\n".format(idx, p))

    # compute GRM
    w = (1.0 / float(m_used)) * z.dot(z.T)

    # output GRM files in GCTA bin format: the lower triangle (diagonal
    # included) in row-major order, one 4-byte float per pair.
    lower = np.tril_indices(n)
    with open("{}.grm.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(w[lower].astype(np.float32).tobytes())

    # The per-pair SNP counts are stored as 4-byte floats as well (GCTA reads
    # this file as floats), not as integers.
    with open("{}.grm.N.bin".format(args.prefix), "wb") as grmfile:
        counts = np.full(lower[0].size, m_used, dtype=np.float32)
        grmfile.write(counts.tobytes())

    with open("{}.grm.id".format(args.prefix), "w") as grmfile:
        for idx in range(n):
            grmfile.write("FID{0}\tIID{0}\n".format(idx))

    # compute h2g estimates with AI-REML GCTA-style
    initial = np.array([.5, .5])
    h2g = reml.aiREML(w, y, initial, X=None, calc_se=True, max_iter=500)

    var, se, s = h2g
    total = sum(var)
    out = args.output2
    out.write("\t".join(["Source", "Variance", "SE"]) + "\n")
    out.write("\t".join(
        ["V(G)", fformat(var[0]),
         fformat(math.sqrt(s[0, 0]))]) + "\n")
    out.write("\t".join(
        ["V(e)", fformat(var[1]),
         fformat(math.sqrt(s[1, 1]))]) + "\n")
    # NOTE(review): sqrt is applied to se[0] here, which is only right if
    # aiREML returns `se` as a variance -- confirm against reml.aiREML.
    out.write("\t".join(
        ["V(G)/Vp",
         fformat(var[0] / total),
         fformat(math.sqrt(se[0]))]) + "\n")
    out.write("Variance/Covariance Matrix" + "\n")
    out.write(str(s) + "\n")

    return 0
예제 #2
0
파일: genGeno.py 프로젝트: quattro/labtools
def main(args):
    """Simulate genotypes and a phenotype, write GRM files, and fit AI-REML.

    Steps performed:
      1. Draw an ``n x m`` genotype matrix from Binomial(2, f) and save it
         with ``np.save`` to ``--output``.
      2. Standardize the polymorphic SNPs, sample per-SNP effects, and build
         a standardized phenotype with the requested heritability.
      3. Write ``<prefix>.phen``, ``<prefix>.grm.bin``, ``<prefix>.grm.N.bin``
         and ``<prefix>.grm.id``.
      4. Run ``reml.aiREML`` on the GRM and phenotype and write a variance
         component table to ``--output2``.

    Args:
        args: Command-line argument list (without the program name).

    Returns:
        0 on success.

    Raises:
        SystemExit: Via ``argp.error`` when ``h2`` is outside ``(0, 1]`` or
            when no simulated SNP is polymorphic.
    """
    argp = ap.ArgumentParser(description="Generate a random genotype")
    argp.add_argument("n", type=int, help="Pop number.")
    argp.add_argument("m", type=int, help="Genome length.")
    argp.add_argument("f", type=float, help="Ref allele freq")
    argp.add_argument("h2", type=float, help="Narrow-sense Heritability")
    argp.add_argument("prefix", help="Output prefix")
    # np.save emits bytes, so the matrix sink has to be opened in binary mode.
    argp.add_argument("-o", "--output", type=ap.FileType("wb"), default=sys.stdout,
                      help="Where to write numpy matrix")
    argp.add_argument("-o2", "--output2", type=ap.FileType("w"), default=sys.stdout)

    args = argp.parse_args(args)

    # h2 <= 0 would divide by zero below; h2 > 1 would give a negative
    # residual variance and a math domain error in sqrt.
    if not 0.0 < args.h2 <= 1.0:
        argp.error("h2 must be in the interval (0, 1]")

    geno = np.random.binomial(2, args.f, size=(args.n, args.m))
    n, m = geno.shape

    # A text stream such as sys.stdout exposes its byte stream as `.buffer`;
    # handles that are already binary are used as they are.
    np.save(getattr(args.output, "buffer", args.output), geno)

    geno = geno.astype(float)
    f = np.mean(geno, axis=0) / 2
    v = np.var(geno, axis=0)

    # Monomorphic SNPs have zero variance; standardizing them is 0/0 and the
    # resulting NaNs would contaminate the phenotype and the whole GRM.
    poly = v > 0
    if not poly.any():
        argp.error("no polymorphic SNPs were simulated; increase n or m, "
                   "or move f away from 0 and 1")

    # standardize the genotype using the sample variance of each SNP
    z = (geno[:, poly] - (2 * f[poly])) / np.sqrt(v[poly])
    m_used = z.shape[1]

    # sample effects
    betas = np.random.normal(0, math.sqrt(args.h2 / float(m_used)), m_used)

    # genetic values, and the noise variance implied by h2
    gv = z.dot(betas)
    s2g = np.var(gv)
    s2e = s2g * ((1.0 / args.h2) - 1)

    # create phenotypes
    e = np.random.normal(0, math.sqrt(s2e), n)
    y = gv + e

    # standardize
    y = (y - np.mean(y)) / np.std(y)

    # Text-mode files translate "\n" themselves; writing os.linesep would
    # produce "\r\r\n" on Windows.
    with open("{}.phen".format(args.prefix), "w") as phenfile:
        for idx, p in enumerate(y):
            phenfile.write("FID{0} IID{0} {1}\n".format(idx, p))

    # compute GRM
    w = (1.0 / float(m_used)) * z.dot(z.T)

    # output GRM files in GCTA bin format: the lower triangle (diagonal
    # included) in row-major order, one 4-byte float per pair.
    lower = np.tril_indices(n)
    with open("{}.grm.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(w[lower].astype(np.float32).tobytes())

    # The per-pair SNP counts are stored as 4-byte floats as well (GCTA reads
    # this file as floats), not as integers.
    with open("{}.grm.N.bin".format(args.prefix), "wb") as grmfile:
        counts = np.full(lower[0].size, m_used, dtype=np.float32)
        grmfile.write(counts.tobytes())

    with open("{}.grm.id".format(args.prefix), "w") as grmfile:
        for idx in range(n):
            grmfile.write("FID{0}\tIID{0}\n".format(idx))

    # compute h2g estimates with AI-REML GCTA-style
    initial = np.array([.5, .5])
    h2g = reml.aiREML(w, y, initial, X=None, calc_se=True, max_iter=500)

    var, se, s = h2g
    total = sum(var)
    out = args.output2
    out.write("\t".join(["Source", "Variance", "SE"]) + "\n")
    out.write("\t".join(["V(G)", fformat(var[0]), fformat(math.sqrt(s[0, 0]))]) + "\n")
    out.write("\t".join(["V(e)", fformat(var[1]), fformat(math.sqrt(s[1, 1]))]) + "\n")
    # NOTE(review): sqrt is applied to se[0] here, which is only right if
    # aiREML returns `se` as a variance -- confirm against reml.aiREML.
    out.write("\t".join(["V(G)/Vp", fformat(var[0] / total), fformat(math.sqrt(se[0]))]) + "\n")
    out.write("Variance/Covariance Matrix" + "\n")
    out.write(str(s) + "\n")

    return 0
예제 #3
0
def main(args):
    """Simulate sequencing reads from genotypes and estimate h2g by REML.

    For every individual the function draws a Poisson read depth per SNP,
    simulates reference/alternate read counts given the true genotype and
    the sequencing error rate, turns the three genotype log-likelihoods into
    normalized weights, and takes the weighted mean genotype as a dosage.
    The dosages are standardized into a GRM, which is passed together with
    the phenotype to ``reml.emREML`` or ``reml.aiREML``; the resulting
    variance component table is written to ``--output``.

    Args:
        args: Command-line argument list (without the program name).

    Returns:
        0 on success.

    Raises:
        SystemExit: Via ``argp.error`` when the error rate is outside
            ``(0, 1)`` or when every dosage column has zero variance.
    """
    argp = ap.ArgumentParser(description="Generate Likelihoods from a Genotype.")
    # np.load reads bytes, so the genotype file has to be opened in binary mode.
    argp.add_argument("geno", type=ap.FileType("rb"), help="Genotype numpy matrix.")
    argp.add_argument("pheno", type=ap.FileType("r"), help="Phenotype")
    argp.add_argument("cov", type=float, help="The mean coverage amount.")
    argp.add_argument("-m", "--method", choices=["EM", "REML"], default="REML")
    argp.add_argument("-s", "--use_sample_var", action="store_true", default=False)
    argp.add_argument("-v", "--verbose", action="store_true", default=False)
    argp.add_argument("-e", "--errorrate", type=float, help="Sequencing error rate.",
                      default=0.01)
    argp.add_argument("-o", "--output", type=ap.FileType("w"),
                      default=sys.stdout)

    args = argp.parse_args(args)

    # log(errorrate) and log(1 - errorrate) are both needed below.
    if not 0.0 < args.errorrate < 1.0:
        argp.error("errorrate must be strictly between 0 and 1")

    geno = np.load(args.geno)

    # the phenotype value is the third whitespace-separated column
    pheno = np.loadtxt(args.pheno, dtype=str)
    pheno = np.array(pheno.T[2], dtype=float)

    n, m = geno.shape

    # genotype values 0/1/2 as a column so they broadcast across SNPs
    gvals = np.arange(3).reshape(3, 1)

    freqs = np.mean(geno, axis=0) / 2.0

    log_err = math.log(args.errorrate)
    log_ok = math.log(1 - args.errorrate)
    log_half = math.log(0.5)

    # compute coverage per person per snp
    rows = []
    for idx in range(n):
        sgeno = geno[idx]

        covs = np.random.poisson(args.cov, size=m)
        covered = covs != 0

        # older version of scipy has bug if you pass 0... only sample where
        # coverage is positive, and skip the call when nothing is covered
        num1s = np.zeros(m)
        if covered.any():
            num1s[covered] = sts.binom.rvs(covs[covered], 1 - args.errorrate)

        # flip num1s for the genotype-0 homozygous case
        hom0 = sgeno == 0
        num1s[hom0] = covs[hom0] - num1s[hom0]

        # simulate reads for heterozygous case where coverage is positive
        het = np.logical_and(sgeno == 1, covered)
        if het.any():
            num1s[het] = sts.binom.rvs(covs[het], 0.5)

        num0s = covs - num1s

        loglikes = np.empty((3, m))
        loglikes[0] = (log_err * num1s) + (log_ok * num0s)
        loglikes[1] = log_half * covs
        loglikes[2] = (log_err * num0s) + (log_ok * num1s)

        # Subtract the per-SNP maximum before exponentiating so that at
        # least one genotype keeps weight 1; at high coverage the raw
        # exponentials can all underflow to 0 and normalize to NaN.
        loglikes -= loglikes.max(axis=0)
        likes = np.exp(loglikes)
        likes /= np.sum(likes, axis=0)

        # expected-genotype dosage; the weights are already normalized and
        # no genotype-frequency prior is applied
        rows.append(np.sum(likes * gvals, axis=0))

    d = np.array(rows)

    # standardize genotype: center on twice the true allele frequency and
    # scale by the sample standard deviation of the dosages.  Columns with
    # zero spread would give 0/0, so they are left out of the GRM.
    sd = np.std(d, axis=0)
    keep = sd > 0
    if not keep.any():
        argp.error("every dosage column has zero variance; cannot build a GRM")
    z = (d[:, keep] - 2 * freqs[keep]) / sd[keep]

    # create GRM
    w = (1 / float(z.shape[1])) * z.dot(z.T)

    initial = np.array([.5, .5])
    # `choices` restricts --method to "EM" or "REML"
    fit = reml.emREML if args.method == "EM" else reml.aiREML
    h2g = fit(w, pheno, initial, X=None, calc_se=True, max_iter=500, verbose=args.verbose)

    var, se, s = h2g
    total = sum(var)
    out = args.output
    # Text-mode files translate "\n" themselves; writing os.linesep would
    # produce "\r\r\n" on Windows.
    out.write("\t".join(["Source", "Variance", "SE"]) + "\n")
    out.write("\t".join(["V(G)", fformat(var[0]), fformat(math.sqrt(s[0, 0]))]) + "\n")
    out.write("\t".join(["V(e)", fformat(var[1]), fformat(math.sqrt(s[1, 1]))]) + "\n")
    # NOTE(review): se[0] is printed without a sqrt, which is only right if
    # the REML routines return `se` as a standard error -- confirm.
    out.write("\t".join(["V(G)/Vp", fformat(var[0] / total), fformat(se[0])]) + "\n")
    out.write("Variance/Covariance Matrix" + "\n")
    out.write(str(s) + "\n")

    return 0