def set_snps0(SNPs0, sample_size, i_exclude=None, forcefullrank=False, blocksize=10000):
    """Prepare the background SNPs as a low-rank factor or a full-rank kernel.

    In the full rank case the SNPs are loaded in blocks and the kernel is
    constructed; in the low rank case all SNPs are loaded into memory and
    standardized. Whatever gets built is stored back into ``SNPs0`` (under
    ``"K"`` or ``"data"``) so later calls reuse it.

    Parameters
    ----------
    SNPs0 : dict-like or None
        Keys read here: ``"K"``, ``"data"``, ``"snp_set"``, ``"reader"`` and
        ``"num_snps"``. ``None`` short-circuits to ``(None, None)``.
    sample_size : int
        The kernel is built when ``len(SNPs0["snp_set"]) > sample_size``.
    i_exclude : optional
        Mask of SNPs to leave out; it is used with ``~`` and ``.sum()``
        (presumably a boolean numpy array -- confirm against callers).
    forcefullrank : bool
        Forces the kernel path; also forwarded as ``allowlowrank``.
    blocksize : int
        Block size forwarded to ``psd.build_kernel_blocked``.

    Returns
    -------
    (G0, K0)
        ``G0`` divided by ``sqrt(num_snps)`` or ``K0`` divided by
        ``num_snps``; the one that does not apply is ``None``.
    """
    if SNPs0 is None:
        return None, None

    kernel = None
    genotypes = None

    if "K" in SNPs0:
        # a kernel is already available
        kernel = SNPs0["K"]
    elif "data" in SNPs0:
        # SNP data is already in memory
        genotypes = SNPs0["data"]["snps"]
    elif len(SNPs0["snp_set"]) > sample_size or forcefullrank:
        # full rank: build the kernel block by block and remember it
        SNPs0["K"] = psd.build_kernel_blocked(
            snpreader=SNPs0["reader"],
            snp_idx=SNPs0["snp_set"].to_index,
            blocksize=blocksize,
            allowlowrank=forcefullrank,
        )
        kernel = SNPs0["K"]
    else:
        # low rank: read every SNP, standardize, and remember the data
        SNPs0["data"] = SNPs0["snp_set"].read()
        SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
        genotypes = SNPs0["data"]["snps"]

    # lrt_up should never do exclusion, because set_snps0 should only get
    # called once, in run_once, without exclusion. So exclusion is only for
    # the score test and lrt.
    excluding = i_exclude is not None
    if excluding and kernel is not None:
        # In the full rank case with exclusion, for score, one could in
        # principle use low rank updates to make this faster when the number
        # of excluded SNPs is small.
        kernel = kernel - psd.build_kernel_blocked(
            snpreader=SNPs0["reader"],
            snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude],
            blocksize=blocksize,
            allowlowrank=forcefullrank,
        )
    elif excluding and genotypes is not None:
        genotypes = genotypes[:, ~i_exclude]

    num_snps = SNPs0["num_snps"]
    if excluding:
        num_snps = num_snps - i_exclude.sum()

    # normalize by the number of SNPs that remain
    if kernel is not None:
        kernel = kernel / num_snps
    elif genotypes is not None:
        genotypes = genotypes / np.sqrt(num_snps)

    return genotypes, kernel
def set_snps0(SNPs0, sample_size, i_exclude=None, forcefullrank=False, blocksize=10000):
    '''
    In full rank case, loads up the SNPs in blocks, and construct the kernel.
    In low rank case, loads up all SNPs in to memory

    SNPs0       : dict-like (or None) read via the keys "K", "data",
                  "snp_set", "reader" and "num_snps". A freshly built kernel
                  is stored back under "K", freshly read SNPs under "data".
    sample_size : the kernel is built when len(SNPs0["snp_set"]) exceeds it
    i_exclude   : optional mask of SNPs to leave out; it is used with ``~``
                  and ``.sum()`` (presumably a boolean numpy array -- confirm
                  against callers)
    forcefullrank : force the kernel path; also forwarded as ``allowlowrank``
    blocksize   : block size forwarded to psd.build_kernel_blocked

    Returns (G0, K0): G0 scaled by 1/sqrt(num_snps) or K0 scaled by
    1/num_snps; the one that does not apply is None. Returns (None, None)
    when SNPs0 is None.
    '''
    if SNPs0 is None:
        return None, None

    # Membership is tested with ``in``: dict.has_key() no longer exists in
    # Python 3, while ``in`` works on both Python 2 and 3.
    if "K" in SNPs0:
        K0 = SNPs0["K"]
        G0 = None
    elif "data" in SNPs0:
        K0 = None
        G0 = SNPs0["data"]["snps"]
    else:
        if len(SNPs0["snp_set"]) > sample_size or forcefullrank:
            # full rank
            SNPs0["K"] = psd.build_kernel_blocked(
                snpreader=SNPs0["reader"],
                snp_idx=SNPs0["snp_set"].to_index,
                blocksize=blocksize,
                allowlowrank=forcefullrank)
            K0 = SNPs0["K"]
            G0 = None
        else:
            # low rank
            K0 = None
            SNPs0["data"] = SNPs0["snp_set"].read()
            SNPs0["data"]["snps"] = up.standardize(SNPs0["data"]["snps"])
            G0 = SNPs0["data"]["snps"]

    # lrt_up should never do exclusion, because set_snps0 should only get
    # called once, in run_once, without exclusion.
    # So exclusion is only for score test and lrt.
    if i_exclude is not None:
        if K0 is not None:
            # Also note in the full rank case with exclusion, for score, one
            # could in principle use low rank updates to make it faster when
            # the number of excluded SNPs is small: it would be cubic in
            # num_excluded * num_inner * num_outer iterations, versus now
            # where it is cubic in N in the outer loop only once.
            K_up = psd.build_kernel_blocked(
                snpreader=SNPs0["reader"],
                snp_idx=np.array(SNPs0["snp_set"].to_index)[i_exclude],
                blocksize=blocksize,
                allowlowrank=forcefullrank)
            K0 = K0 - K_up
        elif G0 is not None:
            G0 = G0[:, ~i_exclude]
        num_snps = SNPs0["num_snps"] - i_exclude.sum()
    else:
        num_snps = SNPs0["num_snps"]

    # normalize:
    if K0 is not None:
        K0 = K0 / num_snps
    elif G0 is not None:
        # dividing G by sqrt(num_snps) scales K = G G^T by 1/num_snps
        G0 = G0 / np.sqrt(num_snps)
    return G0, K0
def computePC(file, filepath=None, numpc=[5]):
    """Compute the eigendecomposition of the SNP kernel and save leading PCs.

    The kernel is built with ``dist.build_kernel_blocked`` from a ``Bed``
    reader and decomposed with ``LA.eigh``. For every requested count, the
    leading eigenvectors are written to ``getEigvecs_fn(fn, numpcs)`` with the
    reader's ``original_iids`` in the first two columns, and the matching
    eigenvalues are written to ``"<fn>_pc<numpcs>.vals"``.

    Parameters
    ----------
    file : str
        Name of the input file set handed to ``Bed``.
    filepath : str, optional
        Directory that is joined in front of ``file`` when given.
    numpc : number or iterable of numbers
        How many PCs to save; a bare number is treated as a one-element list.
        The list default is never mutated here (only rebound).

    Returns
    -------
    (s_all, u_all)
        All eigenvalues and eigenvectors in reversed ``LA.eigh`` order
        (largest first, assuming ``LA.eigh`` returns ascending eigenvalues as
        numpy/scipy do).
    """
    if filepath is not None:
        fn = os.path.join(filepath, file)
    else:
        fn = file
    if isinstance(numpc, (int, float)):
        numpc = [numpc]

    alt_snpreader = Bed(fn)
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print("computing K")
    K = dist.build_kernel_blocked(fn, alt_snpreader=alt_snpreader)
    print("computing the Eigenvalue decomposition of K")
    s_all, u_all = LA.eigh(K)
    # reverse the order of eigenvalues and the matching eigenvector columns
    s_all = s_all[::-1]
    u_all = u_all[:, ::-1]

    for numpcs in numpc:
        print("saving %i PCs from %s" % (numpcs, fn))
        s = s_all[0:numpcs]
        u = u_all[:, 0:numpcs]
        # two id columns followed by one column per principal component
        # NOTE(review): on Python 3 a bytes ("S") array written with
        # fmt="%s" may be rendered as b'...' -- confirm the output format.
        outu = np.zeros((u_all.shape[0], numpcs + 2), dtype="|S20")
        outu[:, 0:2] = alt_snpreader.original_iids
        outu[:, 2::] = u
        fnout = getEigvecs_fn(fn, numpcs)
        np.savetxt(fnout, outu, fmt="%s", delimiter="\t")
        fnout = "%s_pc%i.vals" % (fn, numpcs)
        np.savetxt(fnout, s, fmt="%.5f", delimiter="\t")
    return s_all, u_all
def computePC(file, filepath=None, numpc=[5]):
    """Compute the eigendecomposition of the SNP kernel and save leading PCs.

    The kernel is built with ``dist.build_kernel_blocked`` from a ``Bed``
    reader and decomposed with ``LA.eigh``. For every requested count, the
    leading eigenvectors are written to ``getEigvecs_fn(fn, numpcs)`` with the
    reader's ``original_iids`` in the first two columns, and the matching
    eigenvalues are written to ``"<fn>_pc<numpcs>.vals"``.

    Parameters
    ----------
    file : str
        Name of the input file set handed to ``Bed``.
    filepath : str, optional
        Directory that is joined in front of ``file`` when given.
    numpc : number or iterable of numbers
        How many PCs to save; a bare number is treated as a one-element list.
        The list default is never mutated here (only rebound).

    Returns
    -------
    (s_all, u_all)
        All eigenvalues and eigenvectors in reversed ``LA.eigh`` order
        (largest first, assuming ``LA.eigh`` returns ascending eigenvalues as
        numpy/scipy do).
    """
    fn = file if filepath is None else os.path.join(filepath, file)
    if isinstance(numpc, (int, float)):
        numpc = [numpc]

    alt_snpreader = Bed(fn)
    # print() with a single argument behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print("computing K")
    K = dist.build_kernel_blocked(fn, alt_snpreader=alt_snpreader)
    print("computing the Eigenvalue decomposition of K")
    s_all, u_all = LA.eigh(K)
    # reverse the order of eigenvalues and the matching eigenvector columns
    s_all = s_all[::-1]
    u_all = u_all[:, ::-1]

    for numpcs in numpc:
        print("saving %i PCs from %s" % (numpcs, fn))
        s = s_all[0:numpcs]
        u = u_all[:, 0:numpcs]
        # two id columns followed by one column per principal component
        # NOTE(review): on Python 3 a bytes ("S") array written with
        # fmt="%s" may be rendered as b'...' -- confirm the output format.
        outu = np.zeros((u_all.shape[0], numpcs + 2), dtype="|S20")
        outu[:, 0:2] = alt_snpreader.original_iids
        outu[:, 2::] = u
        fnout = getEigvecs_fn(fn, numpcs)
        np.savetxt(fnout, outu, fmt="%s", delimiter="\t")
        fnout = "%s_pc%i.vals" % (fn, numpcs)
        np.savetxt(fnout, s, fmt="%.5f", delimiter="\t")
    return s_all, u_all
def create_feature_selection_distributable(snp_reader, phen_fn, pc_fn, num_pcs_kernel, output_prefix, cov_fn=None, include_all=True):
    """Build a ``PerformSelectionDistributable`` with a fixed search grid.

    The grid has 7 values of k spaced in log2 between 2**0 and 2**10
    (truncated to int), optionally followed by ``snp_reader.sid_count``, and
    7 values of delta spaced in natural-log space between e**-5 and e**10.
    Selection uses the "insample_cv" strategy, selects by log-likelihood,
    10 folds and random state 42.

    Parameters
    ----------
    snp_reader, phen_fn, cov_fn
        Forwarded to ``FeatureSelectionStrategy``.
    pc_fn
        When ``None``, ``num_pcs_kernel`` must be 0 (checked with ``assert``)
        and an informational message is logged.
    num_pcs_kernel : int
        Only consulted by the check described above.
    output_prefix
        Forwarded to ``PerformSelectionDistributable``.
    include_all : bool
        Whether to append the total SNP count to the k grid.

    Returns
    -------
    The constructed ``psd.PerformSelectionDistributable`` instance.
    """
    from fastlmm.feature_selection import FeatureSelectionStrategy
    import fastlmm.feature_selection.PerformSelectionDistributable as psd

    grid_points_k = 7
    grid_points_delta = 7

    # k grid: log_2 space, plus all SNPs when requested
    k_grid = np.logspace(0, 10, base=2, num=grid_points_k, endpoint=True)
    k_values = list(map(int, k_grid))
    if include_all:
        k_values.append(snp_reader.sid_count)

    delta_values = np.logspace(-5, 10, endpoint=True, num=grid_points_delta, base=np.exp(1))

    if pc_fn is None:
        assert num_pcs_kernel == 0
        logging.info(
            "feature selection: no PCs specified, disabling loop over mixing parameter"
        )

    selector = FeatureSelectionStrategy(
        snp_reader,
        phen_fn,
        10,
        random_state=42,
        num_snps_in_memory=1000000,
        interpolate_delta=False,
        cov_fn=cov_fn,
    )
    return psd.PerformSelectionDistributable(
        selector, k_values, delta_values, "insample_cv", output_prefix, True
    )
def perform_selection(self, k_values, delta_values, strategy="lmm_full_cv", output_prefix=None, select_by_ll=False, runner=Local(), penalty=0.0, create_pdf=True):
    """Perform feature selection

    Parameters
    ----------
    k_values : array-like, shape = [n_steps_k]
        Array of k values to test
    delta_values : array-like, shape = [n_steps_delta]
        Array of delta values to test
    strategy : {'lmm_full_cv', 'insample_cv'}
        Strategy to perform feature selection:
        - 'lmm_full_cv' perform cross-validation over grid of k and delta
          using LMM
        - 'insample_cv' perform cross-validation over grid of k, estimate
          delta in sample using maximum likelihood.
        Any other value is logged as a warning and raises ``Exception``.
    output_prefix : str, optional, default=None
        Prefix for output files
    select_by_ll : bool, default=False
        If set to True, negative log-likelihood will be used to select best
        k and delta
    runner : optional
        Object whose ``run`` method executes the distributable job.
    penalty, create_pdf : optional
        Forwarded to ``psd.PerformSelectionDistributable``.

    Returns
    -------
    best_k : int
        best subset size k
    best_delta : float
        best regularization parameter delta for ridge regression
    best_obj : float
        best objective at optimum (default MSE, nLL if select_by_ll flag is
        set)
    best_snps : list[str]
        list of ids of best snps (univariate selection done on whole data
        set using best_k, best_delta)
    """
    # ARRAY_MODULE is pinned to 'numpy' in os.environ for the whole selection
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}):
        self.biggest_k = max(k_values)

        if strategy not in ("lmm_full_cv", "insample_cv"):
            complaint = "strategies other than lmm_full_cv and insample_cv are experimental!"
            logging.warning(complaint)
            raise Exception(complaint)

        job = psd.PerformSelectionDistributable(
            self,
            k_values,
            delta_values,
            strategy,
            output_prefix,
            select_by_ll,
            penalty=penalty,
            create_pdf=create_pdf,
        )
        return runner.run(job)
def blocking(self, snpreader, cov_fn=None, num_pcs=0, output_prefix=None, strategy="lmm_full_cv"):
    """
    compare three different cases:

    To control memory use, we've introduced a parameter called
    "num_snps_in_memory", which defaults to 100000. Here are the interesting
    cases to consider (and choose num_snps_in_memory accordingly):

    1) num_snps_in_memory > total_num_snps
       In this case, the same code as before should be executed (except the
       kernel matrix on all SNPs is now cached).

    2) num_snps_in_memory < total_num_snps
       num_snps_in_memory > k (excluding all_snps)
       Here, the linear regression will be blocked, while the data for
       cross-validation is cached, saving time for loading and re-indexing.

    3) num_snps_in_memory < total_num_snps
       num_snps_in_memory < k (excluding all_snps)
       Finally, both operations - linear regression and building the kernel
       will be blocked.

    4,5,6) Same as #1,2,3, but with a phenos that has extra iids and for
       which the iids are shuffled.
    """
    # set up grid
    num_steps_delta = 5
    num_folds = 2
    random_state = 42
    k_values = [0, 1, 5, 10, 100, 500, 700, 10000]
    delta_values = np.logspace(-3, 3, endpoint=True, num=num_steps_delta, base=np.exp(1))

    def _run_case(pheno_fn, num_snps_in_memory):
        # Build one selector, run the selection, and keep the values compared below.
        selector = FeatureSelectionStrategy(
            snpreader, pheno_fn, num_folds,
            cov_fn=cov_fn, random_state=random_state, num_pcs=num_pcs,
            interpolate_delta=True, num_snps_in_memory=num_snps_in_memory,
            count_A1=False)
        best_k, best_delta, best_obj, _best_snps = selector.perform_selection(
            k_values, delta_values, output_prefix=output_prefix,
            select_by_ll=True, strategy=strategy, create_pdf=False)
        return selector, {"k": best_k, "delta": best_delta, "obj": best_obj}

    found = {}

    # case 1
    fss_1, found[1] = _run_case(self.pheno_fn, 20000)

    # some misc testing: exercise the distributable's attributes and its
    # string representations
    from fastlmm.feature_selection import PerformSelectionDistributable as psd
    distributable = psd.PerformSelectionDistributable(
        fss_1, k_values, delta_values, strategy, output_prefix,
        select_by_ll=True, penalty=0.0, create_pdf=False)
    self.assertEqual(distributable.work_count, 3)
    _ = distributable.tempdirectory
    _ = str(distributable)
    _ = "%r" % distributable
    from fastlmm.feature_selection.feature_selection_cv import GClass
    _ = "%r" % GClass.factory(snpreader, 1000000, Unit(), 50, count_A1=False)

    # cases 2 and 3: same phenotype file, smaller memory budgets
    _, found[2] = _run_case(self.pheno_fn, 5000)
    _, found[3] = _run_case(self.pheno_fn, 600)

    # cases 4, 5 and 6: the shuffled phenotype file with extra iids
    _, found[4] = _run_case(self.pheno_shuffleplus_fn, 20000)
    _, found[5] = _run_case(self.pheno_shuffleplus_fn, 5000)
    _, found[6] = _run_case(self.pheno_shuffleplus_fn, 600)

    # Results must agree within each group; comparisons between case 1 and
    # cases 4-6 are deliberately not asserted.
    for other in (2, 3):
        self.assertEqual(int(found[1]["k"]), int(found[other]["k"]))

    agreeing_pairs = [(1, 2), (1, 3), (4, 5), (4, 6)]
    for left, right in agreeing_pairs:
        self.assertAlmostEqual(found[left]["obj"], found[right]["obj"])

    if strategy != "insample_cv":
        for left, right in agreeing_pairs:
            self.assertAlmostEqual(found[left]["delta"], found[right]["delta"])