def main(args):
    """Simulate genotypes and a phenotype, write GCTA-style files, run AI-REML.

    Draws an ``n x m`` genotype matrix from Binomial(2, f), saves it with
    ``np.save``, simulates a standardized phenotype with heritability ``h2``,
    writes ``<prefix>.phen``, ``<prefix>.grm.bin``, ``<prefix>.grm.N.bin`` and
    ``<prefix>.grm.id``, then fits variance components with ``reml.aiREML``
    and writes a tab-separated report to ``--output2``.

    Args:
        args: Sequence of command-line argument strings handed to
            ``ArgumentParser.parse_args``.

    Returns:
        0 on completion. Invalid ``n``, ``m`` or ``h2`` terminate the program
        through ``ArgumentParser.error``.
    """
    argp = ap.ArgumentParser(description="Generate a random genotype")
    argp.add_argument("n", type=int, help="Pop number.")
    argp.add_argument("m", type=int, help="Genome length.")
    argp.add_argument("f", type=float, help="Ref allele freq")
    argp.add_argument("h2", type=float, help="Narrow-sense Heritability")
    argp.add_argument("prefix", help="Output prefix")
    # np.save emits bytes, so the target must be a binary stream; on Python 3
    # that means sys.stdout.buffer rather than the text-mode sys.stdout.
    argp.add_argument("-o", "--output", type=ap.FileType("wb"),
                      default=getattr(sys.stdout, "buffer", sys.stdout),
                      help="Where to write numpy matrix")
    argp.add_argument("-o2", "--output2", type=ap.FileType("w"),
                      default=sys.stdout)
    args = argp.parse_args(args)

    # Fail early with a readable message instead of a ZeroDivisionError or a
    # math-domain error further down.
    if args.n < 1 or args.m < 1:
        argp.error("n and m must both be at least 1")
    if not 0.0 < args.h2 <= 1.0:
        argp.error("h2 must lie in the interval (0, 1]")

    geno = np.random.binomial(2, args.f, size=(args.n, args.m))
    n, m = geno.shape
    np.save(args.output, geno)

    # standardize the genotype using the per-SNP sample mean and variance
    geno = geno.astype(float)
    freq = np.mean(geno, axis=0) / 2
    sd = np.sqrt(np.var(geno, axis=0))
    centered = geno - (2 * freq)
    # A monomorphic SNP has zero variance; dividing would give 0/0 = NaN and
    # contaminate the GRM and phenotype, so such columns are left at zero.
    z = np.divide(centered, sd, out=np.zeros_like(centered), where=sd > 0)

    # sample effects
    betas = np.random.normal(0, math.sqrt(args.h2 / float(m)), m)

    # compute sample variances for genetic values and noise given h2
    genetic = z.dot(betas)
    s2g = np.var(genetic)
    s2e = s2g * ((1.0 / args.h2) - 1)

    # create phenotypes and standardize them
    noise = np.random.normal(0, math.sqrt(s2e), n)
    y = genetic + noise
    y = (y - np.mean(y)) / np.std(y)

    # output phenotype mapping
    with open("{}.phen".format(args.prefix), "w") as phenfile:
        for idx, p in enumerate(y):
            phenfile.write("FID{0} IID{0} {1}{2}".format(idx, p, os.linesep))

    # compute GRM
    w = (1.0 / float(m)) * z.dot(z.T)

    # output GRM files in GCTA bin format: lower triangle including the
    # diagonal, row by row, as native 4-byte floats.
    lower = np.tril_indices(n)
    with open("{}.grm.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(w[lower].astype(np.float32).tobytes())

    # GCTA stores the per-pair SNP counts as 4-byte floats as well (its
    # reader uses the same float type for both files), not as integers.
    with open("{}.grm.N.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(np.full(lower[0].size, m, dtype=np.float32).tobytes())

    with open("{}.grm.id".format(args.prefix), "w") as grmfile:
        for idx in range(n):
            grmfile.write("FID{0}\tIID{0}{1}".format(idx, os.linesep))

    # compute h2g estimates with AI-REML GCTA-style
    initial = np.array([.5, .5])
    var, se, s = reml.aiREML(w, y, initial, X=None, calc_se=True,
                             max_iter=500)
    total = sum(var)

    out = args.output2
    out.write("\t".join(["Source", "Variance", "SE"]) + os.linesep)
    out.write("\t".join(
        ["V(G)", fformat(var[0]), fformat(math.sqrt(s[0, 0]))]) + os.linesep)
    out.write("\t".join(
        ["V(e)", fformat(var[1]), fformat(math.sqrt(s[1, 1]))]) + os.linesep)
    # NOTE(review): the square root is applied to se[0] here; confirm whether
    # reml.aiREML returns a variance or an already-rooted standard error.
    out.write("\t".join(
        ["V(G)/Vp", fformat(var[0] / total), fformat(math.sqrt(se[0]))])
        + os.linesep)
    out.write("Variance/Covariance Matrix" + os.linesep)
    out.write(str(s) + os.linesep)

    return 0
def main(args):
    """Simulate genotypes and a phenotype, write GCTA-style files, run AI-REML.

    Draws an ``n x m`` genotype matrix from Binomial(2, f), saves it with
    ``np.save``, simulates a standardized phenotype with heritability ``h2``,
    writes ``<prefix>.phen``, ``<prefix>.grm.bin``, ``<prefix>.grm.N.bin`` and
    ``<prefix>.grm.id``, then fits variance components with ``reml.aiREML``
    and writes a tab-separated report to ``--output2``.

    Args:
        args: Sequence of command-line argument strings handed to
            ``ArgumentParser.parse_args``.

    Returns:
        0 on completion. Invalid ``n``, ``m`` or ``h2`` terminate the program
        through ``ArgumentParser.error``.
    """
    argp = ap.ArgumentParser(description="Generate a random genotype")
    argp.add_argument("n", type=int, help="Pop number.")
    argp.add_argument("m", type=int, help="Genome length.")
    argp.add_argument("f", type=float, help="Ref allele freq")
    argp.add_argument("h2", type=float, help="Narrow-sense Heritability")
    argp.add_argument("prefix", help="Output prefix")
    # np.save emits bytes, so the target must be a binary stream; on Python 3
    # that means sys.stdout.buffer rather than the text-mode sys.stdout.
    argp.add_argument("-o", "--output", type=ap.FileType("wb"),
                      default=getattr(sys.stdout, "buffer", sys.stdout),
                      help="Where to write numpy matrix")
    argp.add_argument("-o2", "--output2", type=ap.FileType("w"),
                      default=sys.stdout)
    args = argp.parse_args(args)

    # Fail early with a readable message instead of a ZeroDivisionError or a
    # math-domain error further down.
    if args.n < 1 or args.m < 1:
        argp.error("n and m must both be at least 1")
    if not 0.0 < args.h2 <= 1.0:
        argp.error("h2 must lie in the interval (0, 1]")

    geno = np.random.binomial(2, args.f, size=(args.n, args.m))
    n, m = geno.shape
    np.save(args.output, geno)

    # standardize the genotype using the per-SNP sample mean and variance
    geno = geno.astype(float)
    freq = np.mean(geno, axis=0) / 2
    sd = np.sqrt(np.var(geno, axis=0))
    centered = geno - (2 * freq)
    # A monomorphic SNP has zero variance; dividing would give 0/0 = NaN and
    # contaminate the GRM and phenotype, so such columns are left at zero.
    z = np.divide(centered, sd, out=np.zeros_like(centered), where=sd > 0)

    # sample effects
    betas = np.random.normal(0, math.sqrt(args.h2 / float(m)), m)

    # compute sample variances for genetic values and noise given h2
    genetic = z.dot(betas)
    s2g = np.var(genetic)
    s2e = s2g * ((1.0 / args.h2) - 1)

    # create phenotypes and standardize them
    noise = np.random.normal(0, math.sqrt(s2e), n)
    y = genetic + noise
    y = (y - np.mean(y)) / np.std(y)

    # output phenotype mapping
    with open("{}.phen".format(args.prefix), "w") as phenfile:
        for idx, p in enumerate(y):
            phenfile.write("FID{0} IID{0} {1}{2}".format(idx, p, os.linesep))

    # compute GRM
    w = (1.0 / float(m)) * z.dot(z.T)

    # output GRM files in GCTA bin format: lower triangle including the
    # diagonal, row by row, as native 4-byte floats.
    lower = np.tril_indices(n)
    with open("{}.grm.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(w[lower].astype(np.float32).tobytes())

    # GCTA stores the per-pair SNP counts as 4-byte floats as well (its
    # reader uses the same float type for both files), not as integers.
    with open("{}.grm.N.bin".format(args.prefix), "wb") as grmfile:
        grmfile.write(np.full(lower[0].size, m, dtype=np.float32).tobytes())

    with open("{}.grm.id".format(args.prefix), "w") as grmfile:
        for idx in range(n):
            grmfile.write("FID{0}\tIID{0}{1}".format(idx, os.linesep))

    # compute h2g estimates with AI-REML GCTA-style
    initial = np.array([.5, .5])
    var, se, s = reml.aiREML(w, y, initial, X=None, calc_se=True,
                             max_iter=500)
    total = sum(var)

    out = args.output2
    out.write("\t".join(["Source", "Variance", "SE"]) + os.linesep)
    out.write("\t".join(
        ["V(G)", fformat(var[0]), fformat(math.sqrt(s[0, 0]))]) + os.linesep)
    out.write("\t".join(
        ["V(e)", fformat(var[1]), fformat(math.sqrt(s[1, 1]))]) + os.linesep)
    # NOTE(review): the square root is applied to se[0] here; confirm whether
    # reml.aiREML returns a variance or an already-rooted standard error.
    out.write("\t".join(
        ["V(G)/Vp", fformat(var[0] / total), fformat(math.sqrt(se[0]))])
        + os.linesep)
    out.write("Variance/Covariance Matrix" + os.linesep)
    out.write(str(s) + os.linesep)

    return 0
def main(args):
    """Estimate variance components from simulated sequencing-read dosages.

    Loads a genotype matrix (``np.load``) and a phenotype table (third
    whitespace-separated column is used), simulates per-individual, per-SNP
    read counts, converts them into expected-genotype dosages, builds a GRM
    from the standardized dosages, and fits variance components with
    ``reml.emREML`` or ``reml.aiREML``. A tab-separated report is written to
    ``--output``.

    Args:
        args: Sequence of command-line argument strings handed to
            ``ArgumentParser.parse_args``.

    Returns:
        0 on completion.
    """
    argp = ap.ArgumentParser(
        description="Generate Likelihoods from a Genotype.")
    # np.load needs a seekable byte stream, hence binary mode.
    argp.add_argument("geno", type=ap.FileType("rb"),
                      help="Genotype numpy matrix.")
    argp.add_argument("pheno", type=ap.FileType("r"), help="Phenotype")
    argp.add_argument("cov", type=float, help="The mean coverage amount.")
    argp.add_argument("-m", "--method", choices=["EM", "REML"],
                      default="REML")
    argp.add_argument("-s", "--use_sample_var", action="store_true",
                      default=False)
    argp.add_argument("-v", "--verbose", action="store_true", default=False)
    argp.add_argument("-e", "--errorrate", type=float,
                      help="Sequencing error rate.", default=0.01)
    argp.add_argument("-o", "--output", type=ap.FileType("w"),
                      default=sys.stdout)
    args = argp.parse_args(args)

    geno = np.load(args.geno)
    pheno = np.loadtxt(args.pheno, dtype=str)
    pheno = np.array(pheno.T[2], dtype=float)

    n, m = geno.shape
    freqs = np.mean(geno, axis=0) / 2.0

    # Loop invariants: per-read log-probabilities and the genotype values
    # 0/1/2 as a column so they broadcast across SNPs.
    log_err = math.log(args.errorrate)
    log_ok = math.log(1 - args.errorrate)
    log_half = math.log(0.5)
    dose = np.arange(3).reshape(3, 1)

    # compute coverage per person per snp
    rows = []
    for sgeno in geno:
        covs = np.random.poisson(args.cov, size=m)
        covered = covs != 0

        # older version of scipy has bug if you pass 0...
        # do this for workaround
        num1s = np.zeros(m)
        num1s[covered] = sts.binom.rvs(covs[covered], 1 - args.errorrate)

        # flip num1s where the genotype is 0
        hom0 = sgeno == 0
        num1s[hom0] = covs[hom0] - num1s[hom0]

        # simulate reads for heterozygous case where coverage is positive
        het = np.logical_and(sgeno == 1, covered)
        num1s[het] = sts.binom.rvs(covs[het], 0.5)

        num0s = covs - num1s

        loglik = np.empty((3, m))
        loglik[0] = (log_err * num1s) + (log_ok * num0s)
        loglik[1] = log_half * covs
        loglik[2] = (log_err * num0s) + (log_ok * num1s)

        # Shift by the per-SNP maximum before exponentiating so that at least
        # one genotype has likelihood 1; otherwise very deep coverage makes
        # all three underflow to 0 and the normalization yields NaN.
        loglik -= loglik.max(axis=0)
        lik = np.exp(loglik)

        # get expected-genotype dosages (likelihood-weighted mean of 0/1/2)
        rows.append(np.sum(lik * dose, axis=0) / np.sum(lik, axis=0))

    d = np.array(rows)

    # standardize the dosages; a column with zero spread would divide by
    # zero, so it is left at zero instead of becoming inf/NaN.
    sd = np.std(d, axis=0)
    centered = d - 2 * freqs
    z = np.divide(centered, sd, out=np.zeros_like(centered), where=sd > 0)

    # create GRM
    w = (1 / float(m)) * z.dot(z.T)

    initial = np.array([.5, .5])
    # argparse restricts --method to "EM" or "REML".
    estimator = reml.emREML if args.method == "EM" else reml.aiREML
    var, se, s = estimator(w, pheno, initial, X=None, calc_se=True,
                           max_iter=500, verbose=args.verbose)
    total = sum(var)

    out = args.output
    out.write("\t".join(["Source", "Variance", "SE"]) + os.linesep)
    out.write("\t".join(
        ["V(G)", fformat(var[0]), fformat(math.sqrt(s[0, 0]))]) + os.linesep)
    out.write("\t".join(
        ["V(e)", fformat(var[1]), fformat(math.sqrt(s[1, 1]))]) + os.linesep)
    # NOTE(review): se[0] is reported as-is here; confirm against the reml
    # module whether it is already a standard error.
    out.write("\t".join(
        ["V(G)/Vp", fformat(var[0] / total), fformat(se[0])]) + os.linesep)
    out.write("Variance/Covariance Matrix" + os.linesep)
    out.write(str(s) + os.linesep)

    return 0