def test_diagKtoN(self):
    """
    make sure standardization on SNPs results in sum(diag(K))=N
    """
    np.random.seed(42)
    m = np.random.random((100, 1000))

    from pysnptools.standardizer import DiagKtoN
    s = DiagKtoN()
    s.standardize(m)

    K = m.dot(m.T)
    sum_diag = np.sum(np.diag(K))
    np.testing.assert_almost_equal(100, sum_diag)
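# A minimal sketch (an assumption, not the library's implementation) of the
# arithmetic the test above relies on: for K = m.dot(m.T), sum(diag(K)) is the
# sum of squares of all entries of m, so a single scalar factor suffices to
# make it equal N, the number of rows.
import numpy as np

def diag_k_to_n_sketch(m):  # hypothetical helper name
    factor = np.sqrt(float(m.shape[0]) / np.sum(m * m))
    return m * factor

m = np.random.random((100, 1000))
m2 = diag_k_to_n_sketch(m)
np.testing.assert_almost_equal(np.sum(np.diag(m2.dot(m2.T))), 100)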
def test_some_std(self):
    k0 = self.snpdata.read_kernel(standardizer=Unit()).val
    from pysnptools.kernelreader import SnpKernel
    k1 = self.snpdata.read_kernel(standardizer=Unit())
    np.testing.assert_array_almost_equal(k0, k1.val, decimal=10)

    from pysnptools.snpreader import SnpData
    snpdata2 = SnpData(iid=self.snpdata.iid,
                       sid=self.snpdata.sid,
                       pos=self.snpdata.pos,
                       val=np.array(self.snpdata.val))
    s = str(snpdata2)
    snpdata2.standardize()
    s = str(snpdata2)

    snpreader = Bed(self.currentFolder + "/examples/toydata", count_A1=False)
    k2 = snpreader.read_kernel(standardizer=Unit(), block_size=500).val
    np.testing.assert_array_almost_equal(k0, k2, decimal=10)

    from pysnptools.standardizer.identity import Identity
    from pysnptools.standardizer.diag_K_to_N import DiagKtoN
    for dtype in [sp.float64, sp.float32]:
        for std in [Unit(), Beta(1, 25), Identity(), DiagKtoN()]:
            s = str(std)
            np.random.seed(0)
            x = np.array(np.random.randint(3, size=[60, 100]), dtype=dtype)
            x2 = x[:, ::2]
            x2b = np.array(x2)
            # LATER what's this about? It doesn't do non-contiguous?
            #assert not x2.flags['C_CONTIGUOUS'] and not x2.flags['F_CONTIGUOUS'] # set up to test non-contiguous
            #assert x2b.flags['C_CONTIGUOUS'] or x2b.flags['F_CONTIGUOUS']        # set up to test non-contiguous
            #a, b = std.standardize(x2b), std.standardize(x2)
            #np.testing.assert_array_almost_equal(a, b)
    logging.info("done")
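# A minimal sketch (with assumed shapes, not the library's code) of why the
# block_size path tested above matches the all-at-once kernel: K = X.dot(X.T)
# can be accumulated one SNP block at a time.
import numpy as np

np.random.seed(0)
X = np.random.random((60, 100))          # 60 iids x 100 sids, already standardized
K_full = X.dot(X.T)
K_blocked = np.zeros((60, 60))
for start in range(0, 100, 30):          # block_size = 30
    block = X[:, start:start + 30]
    K_blocked += block.dot(block.T)      # each block contributes a rank-30 piece
np.testing.assert_array_almost_equal(K_full, K_blocked, decimal=10)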
def __init__(self, GB_goal=None, force_full_rank=False, force_low_rank=False,
             snp_standardizer=Unit(), covariate_standardizer=Unit(),
             kernel_standardizer=DiagKtoN()):
    self.GB_goal = GB_goal
    self.force_full_rank = force_full_rank
    self.force_low_rank = force_low_rank
    self.snp_standardizer = snp_standardizer
    self.covariate_standardizer = covariate_standardizer
    self.kernel_standardizer = kernel_standardizer
    self.is_fitted = False
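# A hypothetical usage sketch for the constructor above; the class name
# FastLMM and the import path are assumptions inferred from the signature,
# not confirmed by this file.
from pysnptools.standardizer import Unit, Beta, DiagKtoN
from fastlmm.inference import FastLMM    # assumed location of the class

predictor = FastLMM(GB_goal=2,                       # target working memory, in GB
                    snp_standardizer=Beta(1, 25),    # Beta weighting instead of Unit
                    covariate_standardizer=Unit(),
                    kernel_standardizer=DiagKtoN())  # rescale kernel so sum(diag(K)) == N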
def _internal_single(K0, test_snps, pheno, covar, K1,
                     mixing, h2, log_delta,
                     cache_file, force_full_rank, force_low_rank,
                     output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.c_[covar.read(view_ok=True, order='A').val, np.ones((test_snps.iid_count, 1))]  # view_ok because np.c_ will allocate new memory
    y = pheno.read(view_ok=True, order='A').val  # view_ok because this code already did a fresh read to look for any missing values

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2,
                                                   force_full_rank=force_full_rank,
                                                   force_low_rank=force_low_rank,
                                                   kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            #print(covar.sum(), y.sum(), K.val.sum(), covar[0], y[0], K.val[0,0])
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U, lmm.S, np.array([h2, mixing]))  # using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp < covar.shape[1] - 1, "interact_with_snp is out of range"
        interact = covar[:, interact_with_snp].copy()
        interact -= interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size)  # find the work count based on block size (rounding up)

    # We define three closures, that is, functions defined inside a function so
    # that the inner functions have access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count

    def mapper_closure(work_index):
        if work_count > 1:
            logging.info("single_snp: Working on snp block {0} of {1}".format(work_index, work_count))

        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index + 1)

        snps_read = test_snps[:, start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:, np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        chi2stats = beta * beta / res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - (lmm.linreg.D + 1))[:, 0]  # note that lmm.U.shape[0] is the number of individuals

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start, end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:, 0]
        dataframe['GenDist'] = snps_read.pos[:, 1]
        dataframe['ChrPos'] = snps_read.pos[:, 2]
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:, 0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:, 0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:, 0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time() - do_work_time))
        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(xrange(work_count),
                       mapper=mapper_closure, reducer=reducer_closure,
                       input_files=[test_snps], output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
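# A quick check (with made-up numbers) of the two idioms used above: the
# ceiling division that sets work_count, and debatch_closure's integer
# partition of sid_count into near-equal, gap-free blocks.
sid_count, block_size = 1005, 500
work_count = -(sid_count // -block_size)                   # == 3, i.e. ceil(1005/500)
starts = [sid_count * i // work_count for i in range(work_count + 1)]
assert starts == [0, 335, 670, 1005]                       # consecutive pairs cover every sid exactly once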
f_handle.write("\n") #for i in range(1, 5): for i in range(1, res.shape[0]+1): ro.globalenv['i'] = i #keep_list2 = ro.r('keep_list2<-c(snp_list[i], keep_list)') #keep_list2 = ro.r('keep_list2<-c(snp_list2[snp_list2[,1]==snp_list2[snp_list2[,3]==snp_list[i],][1],3], keep_list)') keep_list2 = ro.r('keep_list2<-c(snp_list[i], colnames(X_data)[which(colnames(X_data)==snp_list[i])+1],colnames(X_data)[which(colnames(X_data)==snp_list[i])-1], keep_list)') G1 = np.array(ro.r('XX<-as.matrix(X_data[,colnames(X_data)%in%keep_list2])/(sqrt(length(keep_list2)))')) #norm_factor = 1./np.sqrt((G1**2).sum() / float(G1.shape[0])) #G1_standardized_val = norm_factor * G1 from pysnptools.standardizer import DiagKtoN G1_standardized_val = DiagKtoN(G1.shape[0]).standardize(G1) #G1_standardized_val = G1 lmmB = lmm W = G1_standardized_val.copy() UGup,UUGup = lmmB.rotate(W) i_up = np.zeros((W.shape[1]), dtype=np.bool) i_G1 = np.ones((W.shape[1]), dtype=np.bool) result = lmmB.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup) m2 = result['nLL'][0]*-1 if result['h2'] > -1: h2 = result['h2'] else: h2 = result['h2'][0] if result['h2_1'] > -1: h2_1 = result['h2_1']
def compute_core(input_tuple):
    """
    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on

          T1    Pow   [      cond       ]
        =====  =====  =====  ....  =====
                 x      x  x   x     xx
    """

    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        np.random.seed(int(seed % sys.maxint))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5
    y = generate_phenotype(Bed(snp_fn).read(order='C').standardize(),
                           causal_idx, genetic_var, noise_var)
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    G_pc_ = G_pc[:, 0:num_pcs]
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {"causal_idx": causal_idx,
               "chr1_idx": chr1_idx,
               "chr2_idx": chr2_idx,
               "input_tuple": input_tuple,
               "fs_result": fs_result}
    #test_idx

    return result, indices
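# A minimal sketch of the phenotype simulation step (generate_phenotype's body
# is not shown above, so this is an assumed implementation): draw random effect
# sizes for the causal SNPs, then scale the genetic and noise components to
# the requested 50/50 variance split.
import numpy as np

def generate_phenotype_sketch(X_std, causal_idx, genetic_var, noise_var):
    # X_std: an (iid x sid) standardized array; causal_idx: indices of causal SNPs
    n = X_std.shape[0]
    w = np.random.randn(len(causal_idx))         # per-causal-SNP effect sizes
    g = X_std[:, causal_idx].dot(w)
    g *= np.sqrt(genetic_var) / g.std()          # empirical genetic variance -> genetic_var
    e = np.random.randn(n) * np.sqrt(noise_var)  # environmental noise
    return g + e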
def _internal_single(G0_standardized, test_snps, pheno, covar,
                     G1_standardized, mixing,  #!! test mixing and G1
                     h2, log_delta, cache_file, interact_with_snp=None):
    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.hstack((covar['vals'], np.ones((test_snps.iid_count, 1))))  # we always add 1's to the end
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN
    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count,
                          G0_standardized.sid_count + G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        interact = covar[:, interact_with_snp]
        interact -= interact.mean()
        interact /= interact.std()
        variables_to_test = snps_read.val * interact[:, np.newaxis]
    else:
        variables_to_test = snps_read.val
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U, lmm.S)  # using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']
    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  # note that lmm.U.shape[0] is the number of individuals and 3 is the number of fixed effects (covariates + SNP)

    items = [('SNP', snps_read.sid),
             ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]),
             ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values),
             ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:, 0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
    frame = pd.DataFrame.from_items(items)

    return frame
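# Round-trip check of the h2 <-> log_delta reparameterization used in both
# _internal_single variants above: with delta = sigma_e^2 / sigma_g^2,
# h2 = sigma_g^2 / (sigma_g^2 + sigma_e^2) = 1 / (1 + exp(log_delta)).
import numpy as np

log_delta = 0.7
h2 = 1.0 / (np.exp(log_delta) + 1)
assert abs(np.log((1.0 - h2) / h2) - log_delta) < 1e-12  # inverts back to log_delta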