def main(args):
    """Draw per-SNP effect sizes and build the noiseless phenotype vector.

    Seeds numpy with args.beta_num for reproducibility, accumulates
    X.dot(beta) over genotype slices, rescales Y and beta so that the
    phenotypic variance matches sim.h2g, then pickles both.
    """
    np.random.seed(args.beta_num)
    sim = SumstatSimulation(args.sim_name)
    arch = Architecture(sim.architecture)
    d = Dataset(sim.dataset)

    # sample the effect sizes; keep only the first column of the draw
    beta = arch.draw_effect_sizes(sim.dataset, sim.h2g)[:, 0]

    # accumulate the noiseless phenotypes one genotype slice at a time
    # to avoid holding the full N x M matrix in memory
    Y = np.zeros(d.N)
    start = time()
    for sl in d.slices():
        # genotype matrix for this slice is N x (slice width)
        print(int(time() - start), ": getting genotypes from file. SNPs", sl)
        geno = d.get_standardized_genotypes(sl)
        print("computing phenotypes. SNPs", sl)
        Y += geno.dot(beta[sl[0]:sl[1]])
        del geno  # free the slice before loading the next one

    # rescale so var(Y) == h2g; guard against a degenerate all-zero Y
    scale = np.std(Y) / np.sqrt(sim.h2g)
    if scale == 0:
        scale = 1  # just in case we have some 0s...
    Y /= scale
    beta /= scale

    # persist the (rescaled) betas and noiseless phenotypes
    pickle.dump(beta, sim.beta_file(args.beta_num, "wb"), 2)
    pickle.dump(Y, sim.noiseless_Y_file(args.beta_num, "wb"), 2)
def main(args):
    """Simulate noiseless phenotypes for one effect-size draw.

    Draws beta (seeded by args.beta_num), streams genotype slices to
    compute Y = X.dot(beta), normalizes Y and beta to heritability
    sim.h2g, and writes both to the simulation's pickle files.
    """
    np.random.seed(args.beta_num)
    sim = SumstatSimulation(args.sim_name)
    arch = Architecture(sim.architecture)
    d = Dataset(sim.dataset)

    # sample the beta (first column of the architecture's draw)
    beta = arch.draw_effect_sizes(sim.dataset, sim.h2g)[:, 0]

    Y = np.zeros(d.N)
    t0 = time()
    # slice-wise accumulation keeps memory bounded by one N x M chunk
    for s in d.slices():
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)
        print('computing phenotypes. SNPs', s)
        lo, hi = s[0], s[1]
        Y += X.dot(beta[lo:hi])
        del X

    # bring var(Y) to h2g; a zero std would mean Y is identically zero,
    # in which case dividing by 1 leaves everything untouched
    normalization = np.std(Y) / np.sqrt(sim.h2g)
    normalization = normalization if normalization != 0 else 1
    Y /= normalization
    beta /= normalization

    # write the betas and the noiseless phenotypes (pickle protocol 2)
    pickle.dump(beta, sim.beta_file(args.beta_num, 'wb'), 2)
    pickle.dump(Y, sim.noiseless_Y_file(args.beta_num, 'wb'), 2)
def main(args):
    """Build noisy summary statistics for one (beta, sample) replicate.

    Loads the noiseless phenotypes, subsamples individuals (with
    replacement), adds environmental noise so total variance is ~1,
    optionally projects out covariates, computes marginal effect
    estimates alphahat = X.T Y / N per slice, and pickles the results.
    """
    np.random.seed(args.beta_num + args.sample_num * 10000)
    sim = SumstatSimulation(args.sim_name)
    d = Dataset(sim.dataset)
    pretty.print_namespace(sim)
    print()

    # read in noiseless phenotypes
    Y = pickle.load(sim.noiseless_Y_file(args.beta_num))

    # choose individuals (with replacement) and create ensemble of Ys
    indices = np.random.choice(Y.shape[0], size=(sim.sample_size,))
    Y = Y[indices]

    # environmental noise variance so that h2g + sigma2e == 1
    sigma2e = 1 - sim.h2g
    print('adding noise. sigma2e =', sigma2e)
    Y += np.sqrt(sigma2e) * np.random.randn(*Y.shape)

    if sim.condition_on_covariates:
        print('projecting covariates out of Y')
        Y = d.project_out_covariates(Y, covariates=d.covariates[indices])

    alphahat = np.zeros(d.M)
    t0 = time()

    def compute_sumstats_for_slice(s):
        # X will be N x M for this slice, restricted to sampled individuals
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)[indices]
        if sim.condition_on_covariates:
            print(int(time() - t0), ': projecting out covariates')
            X = d.project_out_covariates(X, covariates=d.covariates[indices])
        print(int(time() - t0), ': computing sumstats. SNPs', s)
        alphahat[s[0]:s[1]] = X.T.dot(Y) / sim.sample_size
        del X

    # BUG FIX: the original used `map(compute_sumstats_for_slice, d.slices())`.
    # In Python 3, map() returns a lazy iterator that was never consumed, so
    # the slice function never ran and alphahat stayed all zeros. Iterate
    # explicitly so the side effects actually happen.
    for s in d.slices():
        compute_sumstats_for_slice(s)

    # write output (pickle protocol 2, as elsewhere in this project)
    pickle.dump(indices,
                sim.individuals_file(args.beta_num, args.sample_num, 'wb'), 2)
    pickle.dump(Y,
                sim.noisy_Y_file(args.beta_num, args.sample_num, 'wb'), 2)
    pickle.dump(alphahat,
                sim.sumstats_file(args.beta_num, args.sample_num, 'wb'), 2)
def main(args):
    """Convolve an annotation with the LD matrix of a reference panel.

    Reads the last column v of the annot file for one chromosome,
    computes Rv = X.T X v / N slice by slice (R is the in-sample LD
    matrix), writes the convolved annotation (.cannot.gz) and the
    normalization scalar v' R v (.cannot.norm).
    """
    d = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    cannot_filename = '{}.{}.cannot.gz'.format(args.annot_stem, args.chrnum)
    cannot_norm_filename = '{}.{}.cannot.norm'.format(args.annot_stem, args.chrnum)

    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t', header=0)
    name = annot.columns[-1]
    # FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # .loc is the label-based equivalent for selecting the column.
    v = annot.loc[:, name].values

    #TODO: use ld blocks, possibly just those that have non-trivial intersection with the
    # nonzero entries of v
    print('computing Xv')
    Xv = np.zeros(d.N)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        Xv += X.dot(v[s[0]:s[1]])

    print('computing XTXv')
    XTXv = np.zeros(d.M)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        XTXv[s[0]:s[1]] = X.T.dot(Xv)

    print('computing V^TRv')
    Rv = XTXv / d.N  # R = X.T X / N, so this is R v
    vTRv = v.dot(Rv)

    # write output
    print('writing output')
    annot[name + '.CONV'] = Rv
    with gzip.open(cannot_filename, 'wt') as f:
        annot.to_csv(f, index=False, sep='\t')
    with open(cannot_norm_filename, 'w') as f:
        f.write(str(vTRv))
def main(args):
    """Compute the LD-convolved annotation Rv and its norm v'Rv.

    Streams reference-panel genotypes to form Xv then X.T(Xv)/N without
    materializing the full LD matrix; appends the convolved column to
    the annot table and writes it plus the scalar v'Rv to disk.
    """
    d = Dataset(args.refpanel + '.' + str(args.chrnum))
    annot_filename = '{}.{}.annot.gz'.format(args.annot_stem, args.chrnum)
    cannot_filename = '{}.{}.cannot.gz'.format(args.annot_stem, args.chrnum)
    cannot_norm_filename = '{}.{}.cannot.norm'.format(args.annot_stem, args.chrnum)

    annot = pd.read_csv(annot_filename, compression='gzip', sep='\t', header=0)
    name = annot.columns[-1]
    # FIX: replaced removed pandas DataFrame.ix indexer with .loc
    # (.ix was deprecated in 0.20 and dropped in pandas 1.0).
    v = annot.loc[:, name].values

    #TODO: use ld blocks, possibly just those that have non-trivial intersection with the
    # nonzero entries of v
    print('computing Xv')
    Xv = np.zeros(d.N)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        Xv += X.dot(v[s[0]:s[1]])

    print('computing XTXv')
    XTXv = np.zeros(d.M)
    for s in d.slices():
        print(s)
        X = d.get_standardized_genotypes(s)
        XTXv[s[0]:s[1]] = X.T.dot(Xv)

    print('computing V^TRv')
    Rv = XTXv / d.N  # in-sample LD matrix applied to v
    vTRv = v.dot(Rv)

    # write output
    print('writing output')
    annot[name+'.CONV'] = Rv
    with gzip.open(cannot_filename, 'wt') as f:
        annot.to_csv(f, index=False, sep='\t')
    with open(cannot_norm_filename, 'w') as f:
        f.write(str(vTRv))
def main(args):
    """Generate noisy phenotypes and marginal sumstats for one replicate.

    Subsamples individuals from the noiseless phenotype vector, adds
    N(0, 1 - h2g) noise, optionally projects out covariates, computes
    alphahat = X.T Y / sample_size over genotype slices, and pickles
    indices, noisy Y, and alphahat.
    """
    np.random.seed(args.beta_num + args.sample_num * 10000)
    sim = SumstatSimulation(args.sim_name)
    d = Dataset(sim.dataset)
    pretty.print_namespace(sim)
    print()

    # read in noiseless phenotypes
    Y = pickle.load(sim.noiseless_Y_file(args.beta_num))

    # choose individuals (sampling with replacement) for this replicate
    indices = np.random.choice(Y.shape[0], size=(sim.sample_size,))
    Y = Y[indices]

    # noise variance chosen so that total phenotypic variance is ~1
    sigma2e = 1 - sim.h2g
    print('adding noise. sigma2e =', sigma2e)
    Y += np.sqrt(sigma2e) * np.random.randn(*Y.shape)

    if sim.condition_on_covariates:
        print('projecting covariates out of Y')
        Y = d.project_out_covariates(Y, covariates=d.covariates[indices])

    alphahat = np.zeros(d.M)
    t0 = time()

    def compute_sumstats_for_slice(s):
        # slice of the genotype matrix restricted to the sampled individuals
        print(int(time() - t0), ': getting genotypes from file. SNPs', s)
        X = d.get_standardized_genotypes(s)[indices]
        if sim.condition_on_covariates:
            print(int(time() - t0), ': projecting out covariates')
            X = d.project_out_covariates(X, covariates=d.covariates[indices])
        print(int(time() - t0), ': computing sumstats. SNPs', s)
        alphahat[s[0]:s[1]] = X.T.dot(Y) / sim.sample_size
        del X

    # BUG FIX: `map(compute_sumstats_for_slice, d.slices())` produced a lazy
    # Python 3 iterator that was never consumed, so no slice was processed
    # and alphahat remained all zeros. Use an explicit loop instead.
    for s in d.slices():
        compute_sumstats_for_slice(s)

    # write output
    def write_output():
        pickle.dump(indices, sim.individuals_file(
            args.beta_num, args.sample_num, 'wb'), 2)
        pickle.dump(Y, sim.noisy_Y_file(
            args.beta_num, args.sample_num, 'wb'), 2)
        pickle.dump(alphahat, sim.sumstats_file(
            args.beta_num, args.sample_num, 'wb'), 2)
    write_output()