def lmm_from_cache_file(self):
    logging.info("Loading precomputation from {0}".format(self.cache_file))
    lmm = LMM()
    with np.load(self.cache_file) as data:
        lmm.U = data['arr_0']
        lmm.S = data['arr_1']
    return lmm
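# A minimal round-trip sketch (not part of the original source) of the cache-file
# convention assumed above: np.savez stores positional arrays under the default
# keys 'arr_0', 'arr_1', ..., so the eigenvectors U and eigenvalues S written by
# fill_in_cache_file come back under those keys. All names below are illustrative.
def _cache_roundtrip_example(path="cache_file.npz"):
    import numpy as np
    U = np.linalg.qr(np.random.randn(10, 10))[0]   # stand-in eigenvectors
    S = np.sort(np.random.rand(10))[::-1]          # stand-in eigenvalues
    np.savez(path, U, S)                           # positional -> 'arr_0', 'arr_1'
    with np.load(path) as data:
        assert np.allclose(data['arr_0'], U)
        assert np.allclose(data['arr_1'], S)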
def fill_in_cache_file(self):
    self._run_once()

    logging.info("filling in the cache_file and log_delta, as needed")

    if self.G1_or_none is None:
        self.G1val_or_none = None
    else:
        self.G1val_or_none = self.G1_or_none.read().standardize().val

    # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
    if self.cache_file is None:
        self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
        if os.path.exists(self.cache_file):
            # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
            os.remove(self.cache_file)

    lmm = None
    if not os.path.exists(self.cache_file):
        logging.info("Precomputing eigen")
        lmm = LMM()
        G0_standardized = self.G0.read().standardize()
        lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
        logging.info("Saving precomputation to {0}".format(self.cache_file))
        pstutil.create_directory_if_necessary(self.cache_file)
        # using np.savez instead of pickle because it seems to be faster to read and write
        np.savez(self.cache_file, lmm.U, lmm.S)

    if self.external_log_delta is None:
        if lmm is None:
            lmm = self.lmm_from_cache_file()

        logging.info("searching for internal delta")
        lmm.setX(self.covar)
        lmm.sety(self.pheno['vals'])
        # log delta is used here. Might be better to use findH2, but if so will need to normalize G so that its K's diagonal would sum to iid_count
        # As per the paper, we optimize delta with REML=True, but
        # we will later optimize beta and find the log likelihood with ML (REML=False)
        result = lmm.find_log_delta(REML=True, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta)  #!! what about findA2H2? minH2=0.00001
        self.external_log_delta = result['log_delta']
        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))
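# A small illustrative sketch (not from the original source) of the delta
# bookkeeping above: find_log_delta works on an "external" log scale, and the
# value used internally is rescaled by the number of SNPs (sid_count), i.e.
# internal_delta = exp(external_log_delta) * sid_count.
def _delta_conversion_example(external_log_delta=-1.5, sid_count=5000):
    import numpy as np
    internal_delta = np.exp(external_log_delta) * sid_count
    return internal_delta  # e.g. exp(-1.5) * 5000 is about 1115.7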
def RealVar(self, y, X):
    lmmg = LMM()
    m = np.shape(X)[1]
    n = len(y)
    lmmg.setG(X / math.sqrt(m))
    lmmg.sety(y)
    lmmg.setX(np.ones([n, 1]))
    try:
        dct = lmmg.findH2()
    except:
        # fall back to a fixed heritability and the empirical phenotype variance
        dct = {}
        dct['h2'] = .5
        mn = sum(y) / float(n)
        dct['sigma2'] = sum([(i - mn)**2 for i in y]) / float(n)
    h2 = dct['h2']
    s2 = dct['sigma2']
    sg2 = h2 * s2
    se2 = s2 - sg2
    return [se2, sg2]
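# Illustrative check (not from the original source) of what RealVar returns:
# findH2 estimates a heritability h2 and a total variance sigma2, which are
# split into a genetic component sg2 = h2 * sigma2 and a residual component
# se2 = sigma2 - sg2, so se2 + sg2 always reconstructs sigma2.
def _realvar_decomposition_example(h2=0.4, sigma2=2.0):
    sg2 = h2 * sigma2        # genetic variance: 0.8
    se2 = sigma2 - sg2       # residual variance: 1.2
    assert abs((se2 + sg2) - sigma2) < 1e-12
    return se2, sg2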
def fill_in_cache_file(self):
    self._run_once()

    logging.info("filling in the cache_file and log_delta, as needed")

    if self.G1_or_none is None:
        self.G1val_or_none = None
    else:
        self.G1val_or_none = self.G1_or_none.read().val

    # The S and U are always cached, in case they are needed for the cluster or for multi-threaded runs
    if self.cache_file is None:
        self.cache_file = os.path.join(self.__tempdirectory, "cache_file.npz")
        if os.path.exists(self.cache_file):
            # If there is already a cache file in the temp directory, it must be removed because it might be out-of-date
            os.remove(self.cache_file)

    lmm = None
    if not os.path.exists(self.cache_file):
        logging.info("Precomputing eigen")
        lmm = LMM()
        G0_standardized = self.G0.read().standardize()
        lmm.setG(G0_standardized.val, self.G1val_or_none, a2=self.mixing)
        logging.info("Saving precomputation to {0}".format(self.cache_file))
        util.create_directory_if_necessary(self.cache_file)
        # using np.savez instead of pickle because it seems to be faster to read and write
        np.savez(self.cache_file, lmm.U, lmm.S)

    if self.external_log_delta is None:
        if lmm is None:
            lmm = self.lmm_from_cache_file()

        logging.info("searching for internal delta")
        lmm.setX(self.covar)
        lmm.sety(self.pheno['vals'])
        # log delta is used here. Might be better to use findH2, but if so will need to normalize G so that its K's diagonal would sum to iid_count
        result = lmm.find_log_delta(REML=False, sid_count=self.G0.sid_count, min_log_delta=self.min_log_delta, max_log_delta=self.max_log_delta)  #!! what about findA2H2? minH2=0.00001
        self.external_log_delta = result['log_delta']
        self.internal_delta = np.exp(self.external_log_delta) * self.G0.sid_count
        logging.info("internal_delta={0}".format(self.internal_delta))
        logging.info("external_log_delta={0}".format(self.external_log_delta))
class GwasPrototype(object):
    """
    class to perform genome-wide scan
    """

    def __init__(self, train_snps, test_snps, phen, delta=None, cov=None, REML=False, train_pcs=None, mixing=0.0):
        """
        set up GWAS object
        """
        self.REML = REML
        self.train_snps = train_snps
        self.test_snps = test_snps
        self.phen = phen
        if delta is None:
            self.delta = None
        else:
            self.delta = delta * train_snps.shape[1]
        self.n_test = test_snps.shape[1]
        self.n_ind = len(self.phen)
        self.train_pcs = train_pcs
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1]

        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = None
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))

    def precompute_UX(self, X):
        '''
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''
        logging.info("precomputing UX")
        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k < self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)
        logging.info("done.")

    def train_null(self):
        """
        train model under null hypothesis
        """
        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.train_snps, self.train_pcs, a2=self.mixing)
        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")
        if self.delta is None:
            result = self.lmm.findH2(REML=self.REML, minH2=0.00001)
            self.delta = 1.0 / result['h2'] - 1.0

        # UX = lmm_null.U.dot(test_snps)
        self.res_null = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
        self.ll_null = -self.res_null["nLL"]

    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """
        si = idx + self.n_cov
        self.lmm.X = np.hstack((self.X[:, 0:self.n_cov], self.X[:, si:si + 1]))
        self.lmm.UX = np.hstack((self.UX[:, 0:self.n_cov], self.UX[:, si:si + 1]))
        if (self.k < self.N):
            self.lmm.UUX = np.hstack((self.UUX[:, 0:self.n_cov], self.UUX[:, si:si + 1]))

    def train_alt(self):
        """
        train alternative model
        """
        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in xrange(self.n_test):
            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.info("processing snp {0}".format(idx))

    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """
        # from C++ (?)
        # real df = rank_beta[snp] - ((real)1.0 * rank_beta_0[snp]);
        # pvals[snp] = PvalFromLikelihoodRatioTest(LL[snp] - LL_0[snp], ((real)0.5 * df));

        degrees_of_freedom = 1
        assert len(self.res_alt) == self.n_test

        for idx in xrange(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)

        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]

    def plot_result(self):
        """
        plot results
        """
        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in xrange(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()

    def run_gwas(self):
        """
        invoke all steps in the right order
        """
        self.train_null()
        self.train_alt()
        self.compute_p_values()
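# A hypothetical end-to-end sketch (not from the original source) of driving
# GwasPrototype on synthetic data; it assumes LMM, np, stats, and logging are
# already imported as in the snippets above, and that SNP matrices and the
# phenotype are plain numpy arrays with one row per individual.
def _gwas_prototype_example():
    import numpy as np
    rng = np.random.RandomState(0)
    n_ind, n_train, n_test = 200, 500, 50
    train_snps = rng.randn(n_ind, n_train)   # background SNPs building the kernel
    test_snps = rng.randn(n_ind, n_test)     # SNPs to be tested one at a time
    phen = rng.randn(n_ind)                  # phenotype vector
    gwas = GwasPrototype(train_snps, test_snps, phen, delta=None, REML=False)
    gwas.run_gwas()                          # train_null -> train_alt -> compute_p_values
    return gwas.sorted_p_values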
class WindowingGwas(object):
    """
    class to perform genome-wide scan with single-snp windowing
    """

    def __init__(self, G0, phen, delta=None, cov=None, REML=False, G1=None, mixing=0.0):
        """
        set up GWAS object
        """
        self.REML = REML
        self.G0 = G0
        self.test_snps = G0
        self.phen = phen
        if delta is None:
            self.delta = None
        else:
            self.delta = delta * G0.shape[1]
        self.n_test = self.test_snps.shape[1]
        self.n_ind = len(self.phen)
        self.G1 = G1
        self.mixing = mixing

        # add bias if no covariates are used
        if cov is None:
            self.cov = np.ones((self.n_ind, 1))
        else:
            self.cov = cov
        self.n_cov = self.cov.shape[1]

        self.lmm = None
        self.res_null = None
        self.res_alt = []

        self.ll_null = np.zeros(self.n_test)
        self.ll_alt = np.zeros(self.n_test)
        self.p_values = np.zeros(self.n_test)
        self.sorted_p_values = np.zeros(self.n_test)

        # merge covariates and test snps
        self.X = np.hstack((self.cov, self.test_snps))
        self.N = self.X.shape[0]

    def precompute_UX(self, X):
        '''
        precompute UX for all snps to be tested
        --------------------------------------------------------------------------
        Input:
        X : [N*D] 2-dimensional array of covariates
        --------------------------------------------------------------------------
        '''
        logging.info("precomputing UX")
        self.UX = self.lmm.U.T.dot(X)
        self.k = self.lmm.S.shape[0]
        self.N = self.lmm.X.shape[0]
        if (self.k < self.N):
            self.UUX = X - self.lmm.U.dot(self.UX)
        logging.info("done.")

    def train_null(self):
        """
        find delta on all snps
        """
        logging.info("training null model")

        # use LMM
        self.lmm = LMM()
        self.lmm.setG(self.G0, self.G1, a2=self.mixing)
        self.lmm.setX(self.cov)
        self.lmm.sety(self.phen)

        logging.info("finding delta")
        #result = self.lmm.find_log_delta(self, self.N)
        #self.delta = np.exp(result['log_delta'])
        if self.delta is None:
            result = self.lmm.find_log_delta_chris()
            self.delta = result['delta']

    def set_current_UX(self, idx):
        """
        set the current UX to pre-trained LMM
        """
        si = idx + self.n_cov
        self.lmm.X = np.hstack((self.X[:, 0:self.n_cov], self.X[:, si:si + 1]))
        self.lmm.UX = np.hstack((self.UX[:, 0:self.n_cov], self.UX[:, si:si + 1]))
        if (self.k < self.N):
            self.lmm.UUX = np.hstack((self.UUX[:, 0:self.n_cov], self.UUX[:, si:si + 1]))

    def set_null_UX(self):
        """
        reset UX to covariates only
        """
        self.lmm.X = self.X[:, 0:self.n_cov]
        self.lmm.UX = self.UX[:, 0:self.n_cov]
        if (self.k < self.N):
            self.lmm.UUX = self.UUX[:, 0:self.n_cov]

    def train_windowing(self):
        """
        train null and alternative model
        """
        assert self.lmm is not None
        self.precompute_UX(self.X)

        for idx in range(self.n_test):
            # TODO: this can be generalized to a bigger window
            self.lmm.set_exclude_idx([idx])

            # null model
            self.set_null_UX()
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
            self.ll_null[idx] = -res["nLL"]

            # alternative model
            self.set_current_UX(idx)
            res = self.lmm.nLLeval(delta=self.delta, REML=self.REML)
            self.res_alt.append(res)
            self.ll_alt[idx] = -res["nLL"]

            if idx % 1000 == 0:
                logging.warning("processing snp {0}".format(idx))

    def compute_p_values(self):
        """
        given trained null and alt models, compute p-values
        """
        degrees_of_freedom = 1
        assert len(self.res_alt) == self.n_test

        for idx in range(self.n_test):
            test_statistic = self.ll_alt[idx] - self.ll_null[idx]
            self.p_values[idx] = stats.chi2.sf(2.0 * test_statistic, degrees_of_freedom)

        self.p_idx = np.argsort(self.p_values)
        self.sorted_p_values = self.p_values[self.p_idx]

        return self.p_values

    def plot_result(self):
        """
        plot results
        """
        import pylab
        pylab.semilogy(self.p_values)
        pylab.show()

        dummy = [self.res_alt[idx]["nLL"] for idx in range(self.n_test)]
        pylab.hist(dummy, bins=100)
        pylab.title("neg likelihood")
        pylab.show()

        pylab.hist(self.p_values, bins=100)
        pylab.title("p-values")
        pylab.show()

    def run_gwas(self):
        """
        invoke all steps in the right order
        """
        self.train_null()
        self.train_windowing()
        return self.compute_p_values()
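# A hypothetical usage sketch (not from the original source): WindowingGwas
# differs from GwasPrototype in that the tested SNP is excluded from the kernel
# (self.lmm.set_exclude_idx([idx])) before both the null and alternative fits,
# so the same matrix G0 serves as background kernel and test SNPs. Array shapes
# below are illustrative assumptions only.
def _windowing_gwas_example():
    import numpy as np
    rng = np.random.RandomState(1)
    G0 = rng.randn(150, 300)            # one SNP matrix: kernel and test SNPs
    phen = rng.randn(150)
    gwas = WindowingGwas(G0, phen)      # delta found via find_log_delta_chris()
    p_values = gwas.run_gwas()          # per-SNP likelihood-ratio p-values
    return p_values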
def run_select(self, G0, G_bg, y, cov=None):
    """set up two kernel feature selection

    Parameters
    ----------
    G0 : numpy array of shape (num_ind, num_snps)
        Data matrix from which foreground snps will be selected

    G_bg : numpy array of shape (num_ind, num_snps)
        Data matrix containing background snps on which will be conditioned

    y : numpy vector of shape (num_ind, )
        Vector of phenotypes

    cov : numpy array of shape (num_ind, num_covariates) or None
        Covariates to be used as fixed effects

    Returns
    -------
    best_k, feat_idx, best_mix, best_delta : tuple(int, np.array(int), float, float)
        best_k is the best number of SNPs selected,
        feat_idx is a np.array of integers denoting the indices of these snps,
        best_mix is the best mixing coefficient between foreground and background kernel,
        best_delta is the best regularization coefficient
    """
    num_ind = len(y)

    if cov is None:
        cov = np.ones((num_ind, 1))
    else:
        logging.info("normalizing covariates")
        cov = cov.copy()
        cov = 1. / np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
    cov.flags.writeable = False

    # normalize to diag(K) = N
    norm_factor = 1. / np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

    # we copy in case G and G_bg are pointing to the same object
    G_bg = norm_factor * G_bg

    K_bg_full = G_bg.dot(G_bg.T)
    K_bg_full.flags.writeable = False

    # some asserts
    np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
    if self.debug:
        norm_factor_check = 1. / np.sqrt(G_bg.shape[1])
        np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)

    for kfold_idx, (train_idx, test_idx) in enumerate(KFold(num_ind, n_folds=self.n_folds, random_state=self.random_state, shuffle=True)):
        t0 = time.time()
        logging.info("running fold: %i" % kfold_idx)

        y_train = y.take(train_idx, axis=0)
        y_test = y.take(test_idx, axis=0)
        G0_train = G0.take(train_idx, axis=0)
        G0_test = G0.take(test_idx, axis=0)
        G_bg_train = G_bg.take(train_idx, axis=0)
        G_bg_test = G_bg.take(test_idx, axis=0)
        cov_train = cov.take(train_idx, axis=0)
        cov_test = cov.take(test_idx, axis=0)

        # write protect data
        y_train.flags.writeable = False
        y_test.flags.writeable = False
        G0_train.flags.writeable = False
        G0_test.flags.writeable = False
        G_bg_train.flags.writeable = False
        G_bg_test.flags.writeable = False
        cov_train.flags.writeable = False
        cov_test.flags.writeable = False

        # precompute background kernel
        K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx, axis=1)
        K_bg_train.flags.writeable = False

        if self.measure != "mse":
            K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx, axis=1)
            K_bg_test.flags.writeable = False

        # rank features
        if self.order_by_lmm:
            logging.info("using linear mixed model to rank features")
            t0 = time.time()
            gwas = FastGwas(G_bg_train, G0_train, y_train, delta=None, train_pcs=None, mixing=0.0, cov=cov_train)
            gwas.run_gwas()
            _pval = gwas.p_values
            logging.info("time taken: %s" % (str(time.time() - t0)))
        else:
            logging.info("using linear regression to rank features")
            _F, _pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0_train, y_train, blocksize=10000, C=cov_train)

        feat_idx = np.argsort(_pval)

        for k_idx, max_k in enumerate(self.grid_k):

            feat_idx_subset = feat_idx[0:max_k]
            G_fs_train = G0_train.take(feat_idx_subset, axis=1)
            G_fs_test = G0_test.take(feat_idx_subset, axis=1)

            # normalize to sum(diag)=N
            norm_factor = 1. / np.sqrt((G_fs_train**2).sum() / float(G_fs_train.shape[0]))

            G_fs_train *= norm_factor
            G_fs_test *= norm_factor

            G_fs_train.flags.writeable = False
            G_fs_test.flags.writeable = False

            # asserts
            if self.debug:
                norm_factor_check = 1.0 / np.sqrt(max_k)
                np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
                np.testing.assert_almost_equal(sum(np.diag(G_fs_train.dot(G_fs_train.T))), G_fs_train.shape[0])

            logging.info("k: %i" % (max_k))

            # use LMM
            from fastlmm.inference.lmm_cov import LMM as fastLMM

            if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                lmm = fastLMM(X=cov_train, Y=y_train[:, np.newaxis], G=G_bg_train)
            else:
                lmm = fastLMM(X=cov_train, Y=y_train[:, np.newaxis], K=K_bg_train)

            W = G_fs_train.copy()
            UGup, UUGup = lmm.rotate(W)

            i_up = np.zeros((G_fs_train.shape[1]), dtype=np.bool)
            i_G1 = np.ones((G_fs_train.shape[1]), dtype=np.bool)

            t0 = time.time()
            res = lmm.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup)
            logging.info("time taken for k=%i: %s" % (max_k, str(time.time() - t0)))

            # recover a2 from alternate parameterization
            a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
            h2 = res["h2"] + res["h2_1"]
            delta = (1 - h2) / h2
            #res_cov = res

            # do final prediction using lmm.py
            from fastlmm.inference import LMM
            lmm = LMM(forcefullrank=False)
            lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
            lmm.setX(cov_train)
            lmm.sety(y_train)

            # we take an additional step to estimate betas on covariates (not given from new model)
            res = lmm.nLLeval(delta=delta, REML=True)

            # predict on test set
            lmm.setTestData(Xstar=cov_test, G0star=G_bg_test, G1star=G_fs_test)
            out = lmm.predictMean(beta=res["beta"], delta=delta)
            mse = mean_squared_error(y_test, out)
            logging.info("mse: %f" % (mse))

            self.mse[kfold_idx, k_idx] = mse
            self.mixes[kfold_idx, k_idx] = a2
            self.deltas[kfold_idx, k_idx] = delta

            if self.measure != "mse":
                K_test_test = a2 * G_fs_test.dot(G_fs_test.T) + (1.0 - a2) * K_bg_test
                ll = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test, robust=True)

                if self.debug:
                    ll2 = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=None, robust=True)
                    np.testing.assert_almost_equal(ll, ll2, decimal=4)

                logging.info("ll: %f" % (ll))
                self.ll[kfold_idx, k_idx] = ll

        logging.info("time taken for fold: %s" % str(time.time() - t0))

    best_k, best_mix, best_delta = self.select_best_k()

    logging.info("best_k: %i, best_mix: %f, best_delta: %f" % (best_k, best_mix, best_delta))

    # final scan
    if self.order_by_lmm:
        logging.info("final scan using LMM")
        gwas = FastGwas(G_bg, G0, y, delta=None, train_pcs=None, mixing=0.0, cov=cov)
        gwas.run_gwas()
        _pval = gwas.p_values
        feat_idx = np.argsort(_pval)[0:best_k]
    else:
        logging.info("final scan using LR")
        _F, _pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)

    logging.info("number of snps selected: %i" % (best_k))

    return best_k, feat_idx, best_mix, best_delta
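# A small worked example (illustrative, not from the original source) of the
# reparameterization used above when recovering the two-kernel mixing weight:
# findH2_2K reports h2 (background kernel) and h2_1 (selected SNPs); from these
# a2 = h2_1 / (h2 + h2_1), the combined h2 is their sum, and delta = (1 - h2) / h2.
def _two_kernel_reparam_example(h2_bg=0.3, h2_fg=0.1):
    a2 = h2_fg / float(h2_bg + h2_fg)   # 0.25: weight of the foreground kernel
    h2 = h2_bg + h2_fg                  # 0.4: total genetic variance fraction
    delta = (1 - h2) / h2               # 1.5: noise-to-genetic variance ratio
    return a2, h2, delta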