def run_lmm(use_kinship, peer_cov, Xc, Y, cov, K):
    """Fit an LMM of phenotype ``Y`` on genotype ``Xc`` and return the fit.

    Args:
        use_kinship: ``'y'`` to use the kinship matrix ``K``; any other value
            replaces it with an identity matrix sized by ``Xc.shape[0]``.
        peer_cov: ``'n'`` when no covariates were given to PEER, in which case
            ``cov`` is passed to the model; any other value leaves the
            covariates out because PEER already accounted for them.
        Xc: genotype matrix.
        Y: phenotype matrix.
        cov: covariate matrix (only used when ``peer_cov == 'n'``).
        K: kinship matrix (only used when ``use_kinship == 'y'``).

    Returns:
        The object returned by ``QTL.test_lmm``.
    """
    kinship_requested = use_kinship == 'y'
    covs_in_model = peer_cov == 'n'

    if kinship_requested and covs_in_model:
        sys.stderr.write('\nrunning lmm = QTL.test_lmm(Xc,Y,covs=cov,K=K)...')
        return QTL.test_lmm(Xc, Y, covs=cov, K=K)

    if kinship_requested:
        # Covariates were already consumed by PEER: keep them out of the model.
        sys.stderr.write('\nrunning lmm = QTL.test_lmm(Xc,Y,K=K)...')
        return QTL.test_lmm(Xc, Y, K=K)

    if covs_in_model:
        # No kinship: an identity matrix stands in for K.
        sys.stderr.write(
            '\nrunning lmm= QTL.test_lmm(Xc,Y,covs=cov,K=SP.eye(Xc.shape[0]))...')
        return QTL.test_lmm(Xc, Y, covs=cov, K=SP.eye(Xc.shape[0]))

    sys.stderr.write(
        '\nrunning lmm = QTL.test_lmm(Xc,Y,K=SP.eye(Xc.shape[0]))...')
    return QTL.test_lmm(Xc, Y, K=SP.eye(Xc.shape[0]))
def run_lmm(ts, RNA, COV, i, transK, out_csv):
    """Test column ``i`` of ``RNA`` against ``ts`` with and without ``transK``.

    Two ``qtl.test_lmm`` fits are run on the same phenotype column: one with
    ``K=transK`` (mixed model) and one without ``K`` (linear regression).
    Their p-values are written side by side (columns ``lmm``, ``lm``) to
    ``out_csv`` as tab-separated text in ``%.6e`` format.

    Args:
        ts: SNP matrix; one output row is produced per column of ``ts``.
        RNA: phenotype matrix; column ``i`` is tested.
        COV: covariates passed to both fits.
        i: index of the phenotype column to test.
        transK: covariance matrix for the mixed model.
        out_csv: output path handed to ``np.savetxt``.
    """
    log = logging.getLogger()
    pheno_column = RNA[:, i]

    def pvalue_frame(fit, label):
        # One row per SNP, a single column named after the model.
        return pd.DataFrame(data=fit.getPv().T,
                            index=range(0, ts.shape[1]),
                            columns=[label])

    # Mixed model
    log.debug('Start lmm')
    mixed_fit = qtl.test_lmm(snps=ts, pheno=pheno_column, K=transK,
                             covs=COV, verbose=True)
    log.debug('Done lmm')
    mixed_pv = pvalue_frame(mixed_fit, 'lmm')

    # Linear regression model (same call, no K)
    log.debug('Start lm')
    linear_fit = qtl.test_lmm(snps=ts, pheno=pheno_column, covs=COV,
                              verbose=True)
    log.debug('Done lm')
    linear_pv = pvalue_frame(linear_fit, 'lm')

    # Export
    log.debug('export')
    combined = pd.concat([mixed_pv, linear_pv], axis=1)
    np.savetxt(out_csv, combined, delimiter='\t', fmt='%.6e')
def run_lmm(ts, RNA, COV, i, transK, out_csv):
    """Write mixed-model and linear-model p-values for phenotype column ``i``.

    Runs ``qtl.test_lmm`` twice on ``RNA[:, i]`` against the SNP matrix
    ``ts`` with covariates ``COV``: once with ``K=transK`` and once without
    ``K``. The two p-value vectors are saved to ``out_csv`` as tab-separated
    columns (``lmm`` first, ``lm`` second) in ``%.6e`` format.
    """
    pheno_column = RNA[:, i]

    # Mixed model
    mixed_fit = qtl.test_lmm(snps=ts, pheno=pheno_column, K=transK, covs=COV)
    mixed_pv = pd.DataFrame(
        data=mixed_fit.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lmm'],
    )

    # Linear regression model
    linear_fit = qtl.test_lmm(snps=ts, pheno=pheno_column, covs=COV)
    linear_pv = pd.DataFrame(
        data=linear_fit.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lm'],
    )

    # Export
    np.savetxt(out_csv,
               pd.concat([mixed_pv, linear_pv], axis=1),
               delimiter='\t',
               fmt='%.6e')
def test_lmm_lr(G, y, Z, Kbg, Covs=None):
    """Low-rank LMM scan.

    A variance decomposition of ``y`` is fitted with two random effects
    (``Kbg`` and the linear kernel of ``Z``) plus noise; the two fitted
    variance components weight the kernels into one covariance that is then
    used for the SNP scan.

    Args:
        G: genotypes.
        y: phenotype.
        Z: features of the low-rank matrix.
        Kbg: background covariance matrix.
        Covs: fixed-effect covariates (optional).

    Returns:
        Tuple ``(pv, beta, var_snps, var_covs, var_genes)``; the last three
        have one entry per element of ``beta``.
    """
    has_covs = Covs is not None

    decomposition = varianceDecomposition.VarianceDecomposition(y)
    if has_covs:
        decomposition.addFixedEffect(Covs)
    decomposition.addRandomEffect(Kbg)
    low_rank_kernel = utils.computeLinearKernel(Z)
    decomposition.addRandomEffect(low_rank_kernel)
    decomposition.addRandomEffect(is_noise=True)
    decomposition.optimize()

    components = decomposition.getVarianceComps()[0]
    background_var = components[0]
    low_rank_var = components[1]
    combined_kernel = background_var * Kbg + low_rank_var * low_rank_kernel

    scan = qtl.test_lmm(G, y, covs=Covs, K=combined_kernel)
    pv = scan.getPv()[0]
    beta = scan.getBetaSNP()[0]

    n_tests = len(beta)
    var_snps = beta**2 * np.var(G, axis=0)
    var_genes = np.zeros(n_tests) + low_rank_var
    var_covs = np.zeros(n_tests)
    if has_covs:
        # Same scalar for every test: variance of the fitted fixed effect.
        var_covs += np.dot(Covs, decomposition.getWeights()).var()

    return pv, beta, var_snps, var_covs, var_genes
def test_lmm_lr(G, y, Z, Kbg, Covs=None):
    """Scan ``G`` against ``y`` with a background plus low-rank covariance.

    Args:
        G: genotypes.
        y: phenotype.
        Z: features of the low-rank matrix.
        Kbg: background covariance matrix.
        Covs: fixed-effect covariates (optional).

    Returns:
        ``(pv, beta, var_snps, var_covs, var_genes)``.
    """
    use_covs = Covs is not None

    # Step 1: estimate the variance components.
    vc_model = varianceDecomposition.VarianceDecomposition(y)
    if use_covs:
        vc_model.addFixedEffect(Covs)
    vc_model.addRandomEffect(Kbg)
    K_low_rank = utils.computeLinearKernel(Z)
    vc_model.addRandomEffect(K_low_rank)
    vc_model.addRandomEffect(is_noise=True)
    vc_model.optimize()
    fitted = vc_model.getVarianceComps()[0]
    var_bg = fitted[0]
    var_lr = fitted[1]

    # Step 2: association scan under the combined covariance.
    K_combined = var_bg * Kbg + var_lr * K_low_rank
    assoc = qtl.test_lmm(G, y, covs=Covs, K=K_combined)
    pv = assoc.getPv()[0]
    beta = assoc.getBetaSNP()[0]

    # Step 3: variance summaries, one entry per test.
    count = len(beta)
    var_snps = beta**2 * np.var(G, axis=0)
    var_genes = np.zeros(count) + var_lr
    var_covs = np.zeros(count)
    if use_covs:
        var_covs += np.dot(Covs, vc_model.getWeights()).var()

    return pv, beta, var_snps, var_covs, var_genes
def initial_scan(self, startSnpIdx=0, nSnps=np.inf, memory_efficient=False):
    """Run the initial scan using a linear mixed model.

    Args:
        startSnpIdx: index of the first SNP (default: 0).
        nSnps: number of SNPs to use (default: infinite, i.e. all rows the
            genotype reader reports); capped at the rows remaining after
            ``startSnpIdx``.
        memory_efficient: if true, phenotypes are processed one at a time,
            which takes longer but uses less memory (default: False).

    Returns:
        ``(beta, pv)``. ``pv`` is also wrapped in a ``reader.MatrixReader``
        and stored on ``self.assoc0_reader``.
    """
    total_snps = self.genoreader.get_nrows()
    n_traits = self.phenoreader.get_nrows()
    # Result unused; the call is kept so reader interaction is unchanged.
    self.genoreader.get_ncols()

    if not np.isfinite(nSnps):
        nSnps = total_snps
    nSnps = min(nSnps, total_snps - startSnpIdx)
    snp_block = self.genoreader.loadSnpBlock(startSnpIdx, nSnps)

    if not memory_efficient:
        # All phenotypes in a single call.
        all_traits = self.phenoreader.getMatrix()
        fit = qtl.test_lmm(snps=snp_block.T, pheno=all_traits.T,
                           K=self.K, covs=self.Covs)
        pv = fit.getPv().T
        beta = fit.getBetaSNP().T
    else:
        # One phenotype per call, filling the result column by column.
        pv = np.zeros((nSnps, n_traits))
        beta = np.zeros((nSnps, n_traits))
        for trait in range(n_traits):
            single_trait = self.phenoreader.getRows([trait]).T
            fit = qtl.test_lmm(snps=snp_block.T, pheno=single_trait,
                               K=self.K, covs=self.Covs)
            pv[:, trait] = fit.getPv()[0]
            beta[:, trait] = fit.getBetaSNP()[0]

    self.assoc0_reader = reader.MatrixReader(pv)
    return beta, pv
def run_lmm_chunk(ts, RNA, COV, RNA_start, RNA_end, RNA_columns, transK, out_csv):
    """Run the mixed model on a slice of phenotype columns and save p-values.

    Columns ``RNA_start:RNA_end`` of ``RNA`` are tested against the SNP
    matrix ``ts`` with covariance ``transK`` and covariates ``COV``. The
    resulting p-values (one row per column of ``ts``, one column per entry
    of ``RNA_columns``) are written to ``out_csv`` as tab-separated text
    with a header row and no index column.
    """
    log = logging.getLogger()
    pheno_block = RNA[:, RNA_start:RNA_end]

    # Mixed model
    log.debug('Start lmm')
    fit = qtl.test_lmm(snps=ts, pheno=pheno_block, K=transK, covs=COV,
                       verbose=True)
    log.debug('Done lmm')

    pvalue_table = pd.DataFrame(
        data=fit.getPv().T,
        index=range(0, ts.shape[1]),
        columns=RNA_columns,
    )

    log.debug('Export')
    pvalue_table.to_csv(out_csv, sep='\t', header=True, index=False)
def initial_scan(self, startSnpIdx=0, nSnps=np.inf, memory_efficient=False):
    """Run the initial scan using a linear mixed model.

    Args:
        startSnpIdx: index of the first SNP (default: 0).
        nSnps: number of SNPs to use (default: infinite, meaning every row
            reported by the genotype reader); never more than the rows left
            after ``startSnpIdx``.
        memory_efficient: when true, phenotypes are fitted sequentially,
            trading runtime for memory (default: False).

    Returns:
        ``(beta, pv)``; ``pv`` is additionally stored, wrapped in a
        ``reader.MatrixReader``, as ``self.assoc0_reader``.
    """
    n_geno_rows = self.genoreader.get_nrows()
    n_pheno_rows = self.phenoreader.get_nrows()
    # Return value is not needed; the call itself is preserved.
    self.genoreader.get_ncols()

    if not np.isfinite(nSnps):
        nSnps = n_geno_rows
    nSnps = min(nSnps, n_geno_rows - startSnpIdx)
    genotypes = self.genoreader.loadSnpBlock(startSnpIdx, nSnps)

    if memory_efficient:
        pv = np.zeros((nSnps, n_pheno_rows))
        beta = np.zeros((nSnps, n_pheno_rows))
        for column in range(n_pheno_rows):
            phenotype = self.phenoreader.getRows([column]).T
            model = qtl.test_lmm(snps=genotypes.T, pheno=phenotype,
                                 K=self.K, covs=self.Covs)
            pv[:, column] = model.getPv()[0]
            beta[:, column] = model.getBetaSNP()[0]
    else:
        phenotypes = self.phenoreader.getMatrix()
        model = qtl.test_lmm(snps=genotypes.T, pheno=phenotypes.T,
                             K=self.K, covs=self.Covs)
        pv = model.getPv().T
        beta = model.getBetaSNP().T

    self.assoc0_reader = reader.MatrixReader(pv)
    return beta, pv
def fitLMM(self, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False):
    """Test every selected gene against all genes with a linear mixed model.

    Args:
        K: random effect, or list of random effects, to be considered in the
            analysis; if None, no random effect is considered.
        tech_noise: accepted for interface compatibility; not used here.
        idx: index or indices of the genes to be considered in the analysis.
        i0: gene index from which the analysis starts.
        i1: gene index at which the analysis stops.
        verbose: if True, print progress.

    Returns:
        pv: matrix of p-values (one row per selected gene, ``self.G`` columns).
        beta: matrix of correlations, same shape as ``pv``.
        info: dictionary annotating the rows of ``pv`` and ``beta``:
            gene_idx_row: index of the genes in rows.
            conv: boolean vector marking genes for which the variance
                decomposition has converged.
            gene_row: gene IDs of the rows (empty strings when
                ``self.geneID`` is None).
    """
    assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'

    if idx is None:
        if i0 is None or i1 is None:
            i0 = 0
            i1 = self.G
        idx = SP.arange(i0, i1)
    elif not isinstance(idx, SP.ndarray):
        # Flatten so a scalar and a list of indices both give a 1-D array
        # (wrapping a list in another list produced a 2-D array).
        idx = SP.array(idx).reshape(-1)

    if K is not None and not isinstance(K, list):
        K = [K]

    lmm_params = {
        'covs': SP.ones([self.N, 1]),
        'NumIntervalsDeltaAlt': 100,
        'NumIntervalsDelta0': 100,
        'searchDelta': True
    }

    Ystd = self.Y - self.Y.mean(0)
    Ystd /= self.Y.std(0)

    n_rows = idx.shape[0]
    beta = SP.zeros((n_rows, self.G))
    pv = SP.zeros((n_rows, self.G))

    # Variance components rescaled so each row sums to one.
    var = self.var / self.var.sum(1)[:, SP.newaxis]

    for count, ids in enumerate(idx):
        if verbose:
            print('.. fitting gene %d' % ids)

        # Covariance for this gene: none, the single K, or the
        # variance-weighted mix of several K (only if the fit converged).
        if K is None:
            _K = None
        elif len(K) > 1:
            if self.var_info['conv'][count]:
                _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                _K /= _K.diagonal().mean()
            else:
                _K = None
        else:
            _K = K[0]

        lm = QTL.test_lmm(Ystd, Ystd[:, ids:ids + 1], K=_K, verbose=False,
                          **lmm_params)
        pv[count, :] = lm.getPv()[0, :]
        beta[count, :] = lm.getBetaSNP()[0, :]

    if self.geneID is not None:
        # Built from the values so the string width fits the longest ID; a
        # zeros(..., dtype=str) buffer holds only one character per entry.
        geneID = SP.array([self.geneID[ids] for ids in idx], dtype=str)
    else:
        geneID = SP.zeros(n_rows, dtype=str)

    info = {
        'conv': self.var_info['conv'],
        'gene_idx_row': idx,
        'gene_row': geneID,
    }
    return pv, beta, info
snp_pos = info_df['pos'].values snp_idx = np.logical_and((snp_pos > min_pos), (snp_pos < max_pos)) if not snp_idx.any(): continue X_gene = X[:, snp_idx] if permute: X_gene = np.random.permutation(X_gene) info_gene = info_df.iloc[snp_idx] for item in info_gene.columns: out_dict[item] = info_gene[item].values assoc_gene = gene.repeat(snp_idx.sum()) out_dict['assoc_gene'] = assoc_gene # Run the LMM analysis print " .. single trait analysis" if fit_design: lmm = QTL.test_lmm(X_gene, Y_gene, K=K, covs=design) else: lmm = QTL.test_lmm(X_gene, Y_gene, K=K) pv = lmm.getPv() pv[0][np.isnan(pv[0])] = 1.0 # set any NaN p-values to 1 out_dict['pv'] = pv[0] out_dict['qv'] = FDR.qvalues(pv)[0] out_dict['beta'] = lmm.getBetaSNP()[0] lambda_val = getLambda(pv) lambda_val = lambda_val.repeat(len(out_dict['pv'])) out_dict['lambda_val'] = lambda_val out_df = pd.DataFrame(out_dict, index=out_dict['gdid']) ## convert full stops in gene name to underscore gene = gene.replace(".", "_") ## append results for chunk to gene's results df to HDF5 file print " ....appending results..."
print ' .. Importing data' try: Xc, info = data.getGenotypes(gene, return_info=True) except: print 'Error: no SNPs found in cis' continue Y = data.getPhenotypes(gene, peer=opt.peer, gauss=True) o = gene_group.create_group('snp_info') smartDumpDictHdf5(info, o) if opt.perm: if opt.seed is not None: sp.random.seed(opt.seed) idxs = sp.random.permutation(Xc.shape[0]) Xc = Xc[idxs, :] if 1: print " .. single trait analysis" lmm = QTL.test_lmm(Xc, Y, K=K) pv = lmm.getPv() RV = {} RV['pv'] = pv RV['qv'] = FDR.qvalues(pv) RV['beta'] = lmm.getBetaSNP() RV['lambda'] = getLambda(pv) o = gene_group.create_group('st') smartDumpDictHdf5(RV, o) fout.close()
def main():
    """Run a genome-wide LMM scan for a slice of phenotypes and plot results.

    Seven positional command-line arguments are read from ``sys.argv[1:]``,
    in this order:

        geno_file    genotype HDF5 file
        pheno_file   phenotype HDF5 file
        norm_mode    phenotype normalization: 'None', 'RIN' or 'boxcox'
        panama_file  HDF5 file whose 'Ktot' dataset is used as covariance K
        RNA_start    first phenotype index (inclusive)
        RNA_end      last phenotype index (exclusive)
        out_dir      output directory

    Under ``out_dir`` the function writes a log file, one p-value text file
    per phenotype and diagnostic figures (phenotype histogram, Manhattan
    plot, phenotype vs. lead SNP, p-value histogram, Q-Q plot).

    Exits with status 1 when ``norm_mode`` is not recognized.
    """
    geno_file, pheno_file, norm_mode, panama_file, RNA_start, RNA_end, out_dir = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Loading sample relatedness from %s', panama_file)
    # Read the matrix into memory and release the file handle right away.
    with h5py.File(panama_file, 'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # The data object allows querying specific genotype or phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position, offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)

    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, maxlog = preprocess.boxcox(phenotypes.values)
    else:
        # Stop here: without normalized values nothing below can run.
        logger.error('Normalization mode %s is not recognized. Exit', norm_mode)
        sys.exit(1)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # NOTE: only the LMM is run in this script; the plain LM comparison and
    # its plots are disabled.
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'), pheno=phenotype_vals,
                       K=Ktot, covs=None, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots, one per phenotype.
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Phenotype against the genotype of the SNP with the smallest p-value.
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lmm.pvalues[ip].argmin()
        lead_snp = snps[s_idx, imax]
        # Small horizontal jitter keeps overlapping genotype classes readable.
        plt.plot(lead_snp + 0.05 * np.random.randn(lead_snp.shape[0]),
                 pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): newer matplotlib releases replaced `normed` with
        # `density`; confirm against the matplotlib version in use.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
# Standardize every SNP column, then build the raw kinship matrix.
ts = (ts - ts.mean(axis=0)) / ts.std(axis=0)
transk = np.dot(ts, ts.T)

# Scale the kinship matrix (after Bjarni's scale_k()).
n_kinship_rows = len(transk)
centering = sp.eye(n_kinship_rows) - (1.0 / n_kinship_rows) * sp.ones(transk.shape)
c = sp.sum(centering * sp.array(transk))
scalar = (n_kinship_rows - 1) / c
transK = scalar * transk

# Keep only the first 500 rows of each input.
RNA = RNA[:500, :]
ts = ts[:500, :]
transK = transK[:500, :500]

for i in range(0, RNA.shape[1]):
    y = RNA[:, i]

    # Mixed model
    lmm = qtl.test_lmm(snps=ts, pheno=y, K=transK)
    pvalues_lmm = pd.DataFrame(
        data=lmm.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lmm'],
    )

    # Linear regression model
    lm = qtl.test_lmm(snps=ts, pheno=y)
    pvalues_lm = pd.DataFrame(
        data=lm.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lm'],
    )

    # Export: one comma-separated file per phenotype column.
    pval = pd.concat([pvalues_lmm, pvalues_lm], axis=1)
    np.savetxt(Out + '/' + str(i) + '.csv', pval, delimiter=',', fmt='%.3e')
def fitLMM(self, expr=None, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False, recalc=True, standardize=True):
    """Test selected genes against ``expr`` with a linear mixed model.

    Args:
        expr: correlations are calculated between the gene expression data
            (``self.Y``) and the measures provided in ``expr``. If None, the
            standardized selected columns of ``self.Y`` are used.
        K: random effect, or list of random effects, to be considered in the
            analysis; if None, no random effect is considered.
        tech_noise: accepted for interface compatibility; not used here.
        idx: indices of the genes to be considered in the analysis.
        i0: gene index from which the analysis starts.
        i1: gene index at which the analysis stops.
        verbose: if True, print progress.
        recalc: if True, re-do the variance decomposition.
        standardize: if True, also standardize ``expr``.

    Returns:
        pv: matrix of p-values.
        beta: matrix of correlations.
        info: dictionary annotating the rows of ``pv`` and ``beta``:
            gene_idx_row: index of the genes in rows.
            conv: boolean vector marking genes for which the variance
                decomposition has converged (only when the decomposition
                was recomputed here for several random effects).
            gene_row: gene IDs of the rows.
    """
    # `is None` throughout: comparing a numpy array with `== None` is
    # elementwise and cannot be used as a condition.
    if idx is None:
        if i0 is None or i1 is None:
            i0 = 0
            i1 = self.G
        idx = SP.arange(i0, i1)
    elif not isinstance(idx, SP.ndarray):
        idx = SP.array(idx)
    # Only makes sense if the gene is expressed in at least one cell.
    idx = SP.intersect1d(idx, SP.where(self.Y.std(0) > 0)[0])

    var = None
    var_info = None
    if K is not None:
        if not isinstance(K, list):
            K = [K]
        if recalc and (len(K) > 1 or self.var is None):
            print('performing variance decomposition first...')
            var_raw, var_info = self.varianceDecomposition(K=K, idx=idx, cache=False)
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]
        elif not recalc and len(K) > 1:
            assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'
            warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decompostion was performed earlier.')
            var_raw = self.var
            var_info = self.var_info
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]

    lmm_params = {
        'covs': SP.ones([self.N, 1]),
        'NumIntervalsDeltaAlt': 100,
        'NumIntervalsDelta0': 100,
        'searchDelta': True,
    }

    Yidx = self.Y[:, idx]
    Ystd = Yidx - Yidx.mean(0)
    Ystd /= Yidx.std(0)  # delta optimization might be more efficient

    if expr is None:
        expr = Ystd
    elif standardize:
        expr = (expr - expr.mean(0)) / expr.std(0)

    _G1 = idx.shape[0]
    _G2 = expr.shape[1]
    beta = SP.zeros((_G1, _G2))
    pv = SP.zeros((_G1, _G2))

    for count in range(_G1):
        if verbose:
            print('.. fitting gene %d' % count)

        # Covariance for this gene: none, the single K, or the
        # variance-weighted mix of several K (only if the fit converged).
        if K is None:
            _K = None
        elif len(K) > 1:
            if var_info['conv'][count]:
                _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                _K /= _K.diagonal().mean()
            else:
                _K = None
        else:
            _K = K[0]

        lm = QTL.test_lmm(expr, Ystd[:, count:count + 1], K=_K, **lmm_params)
        pv[count, :] = lm.getPv()[0, :]
        beta[count, :] = lm.getBetaSNP()[0, :]

    if self.geneID is not None:
        geneID = SP.array(self.geneID)[idx]
    else:
        geneID = SP.zeros(_G1, dtype=str)

    if recalc and K is not None and len(K) > 1:
        info = {'conv': var_info['conv'], 'gene_idx_row': idx}
    else:
        info = {'gene_idx_row': idx}
    info['gene_row'] = geneID

    return pv, beta, info
snps = data_subsample.getGenotypes(impute_missing=True)
phenotypes, sample_idx = data_subsample.getPhenotypes(
    phenotype_query=phenotype_query, intersection=True)
assert sample_idx.all()

sample_relatedness = data_subsample.getCovariance()
position = data_subsample.getPos()

# Set parameters for the analysis.
N, P = phenotypes.shape
covs = None  # covariates
searchDelta = False  # specify if delta should be optimized for each SNP
test = "lrt"  # specify type of statistical test

# Running the analysis.
# When covs are not set (None), LIMIX considers an intercept
# (covs=SP.ones((N,1))).
lmm = QTL.test_lmm(snps=snps, pheno=phenotypes.values, K=sample_relatedness,
                   covs=covs, test=test)

# P-values as a DataFrame (SNPs in rows, phenotypes in columns), joined with
# the SNP positions for output writing.
pvalues = lmm.getPv()
pvalues = pd.DataFrame(data=pvalues.T, index=data_subsample.geno_ID,
                       columns=phenotypes.columns)
pvalues = pd.concat([position, pvalues], join="outer", axis=1)

# Effect sizes in the same layout. The positions are joined with `betas`
# itself; joining them with `pvalues` stored p-values under "betas".
betas = lmm.getBetaSNP()
betas = pd.DataFrame(data=betas.T, index=data_subsample.geno_ID,
                     columns=phenotypes.columns)
betas = pd.concat([position, betas], join="outer", axis=1)

# Create result DataFrame.
result["pvalues"] = pvalues
result["betas"] = betas
def fitLMM(self, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False):
    """Test every selected gene against all genes with a linear mixed model.

    Args:
        K: random effect, or list of random effects, to be considered in the
            analysis; if None, no random effect is considered.
        tech_noise: accepted for interface compatibility; not used here.
        idx: index or indices of the genes to be considered in the analysis.
        i0: gene index from which the analysis starts.
        i1: gene index at which the analysis stops.
        verbose: if True, print progress.

    Returns:
        pv: matrix of p-values (one row per selected gene, ``self.G`` columns).
        beta: matrix of correlations, same shape as ``pv``.
        info: dictionary annotating the rows of ``pv`` and ``beta``:
            gene_idx_row: index of the genes in rows.
            conv: boolean vector marking genes for which the variance
                decomposition has converged.
            gene_row: gene IDs of the rows (empty strings when
                ``self.geneID`` is None).
    """
    # `is None` / `is not None` throughout: comparing a numpy array with
    # `!= None` is elementwise and cannot be used as a condition.
    assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'

    if idx is None:
        if i0 is None or i1 is None:
            i0 = 0
            i1 = self.G
        idx = SP.arange(i0, i1)
    elif not isinstance(idx, SP.ndarray):
        # Flatten so a scalar and a list of indices both give a 1-D array
        # (wrapping a list in another list produced a 2-D array).
        idx = SP.array(idx).reshape(-1)

    if K is not None and not isinstance(K, list):
        K = [K]

    lmm_params = {
        'covs': SP.ones([self.N, 1]),
        'NumIntervalsDeltaAlt': 100,
        'NumIntervalsDelta0': 100,
        'searchDelta': True,
    }

    Ystd = self.Y - self.Y.mean(0)
    Ystd /= self.Y.std(0)

    n_rows = idx.shape[0]
    beta = SP.zeros((n_rows, self.G))
    pv = SP.zeros((n_rows, self.G))

    # Variance components rescaled so each row sums to one.
    var = self.var / self.var.sum(1)[:, SP.newaxis]

    for count, ids in enumerate(idx):
        if verbose:
            print('.. fitting gene %d' % ids)

        # Covariance for this gene: none, the single K, or the
        # variance-weighted mix of several K (only if the fit converged).
        if K is None:
            _K = None
        elif len(K) > 1:
            if self.var_info['conv'][count]:
                _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                _K /= _K.diagonal().mean()
            else:
                _K = None
        else:
            _K = K[0]

        lm = QTL.test_lmm(Ystd, Ystd[:, ids:ids + 1], K=_K, verbose=False,
                          **lmm_params)
        pv[count, :] = lm.getPv()[0, :]
        beta[count, :] = lm.getBetaSNP()[0, :]

    if self.geneID is not None:
        # Built from the values so the string width fits the longest ID; a
        # zeros(..., dtype=str) buffer holds only one character per entry.
        geneID = SP.array([self.geneID[ids] for ids in idx], dtype=str)
    else:
        geneID = SP.zeros(n_rows, dtype=str)

    info = {
        'conv': self.var_info['conv'],
        'gene_idx_row': idx,
        'gene_row': geneID,
    }
    return pv, beta, info
sample_relatedness = data_subsample.getCovariance()
position = data_subsample.getPos()

# Analysis settings.
N, P = phenotypes.shape
covs = None  # no explicit covariates
searchDelta = False  # whether delta should be optimized for each SNP
test = "lrt"  # type of statistical test

# Run the scan. With covs left as None, LIMIX considers an intercept
# (covs=SP.ones((N,1))).
lmm = QTL.test_lmm(
    snps=snps,
    pheno=phenotypes.values,
    K=sample_relatedness,
    covs=covs,
    test=test,
)

# P-values: SNPs in rows, phenotypes in columns, joined with SNP positions.
pvalues = pd.concat(
    [
        position,
        pd.DataFrame(
            data=lmm.getPv().T,
            index=data_subsample.geno_ID,
            columns=phenotypes.columns,
        ),
    ],
    join="outer",
    axis=1,
)

# Effect sizes in the same SNP-by-phenotype layout.
betas = pd.DataFrame(
    data=lmm.getBetaSNP().T,
    index=data_subsample.geno_ID,
    columns=phenotypes.columns,
)
probe_group.create_dataset('start',data=SP.array([start])) probe_group.create_dataset('end',data=SP.array([end])) # one line per donor Xu = np.array(X, dtype='float') Yu = np.array(Y, dtype='float') #Yu -= Yu.mean(0); Yu /= Yu.std(0) if center: Xu -= Xu.mean(0); Xu /= Xu.std(0) uKcis = SP.dot(Xu,Xu.T) uKtrans = uKpop-uKcis uKcis /= uKcis.diagonal().mean() uKtrans /= uKtrans.diagonal().mean() #4.3 perform experiment and store results in out_gene out_gene = {} #print "cis scan" lm=QTL.test_lmm(snps=Xu,pheno=Yu,K=uKtrans,covs=uCov,verbose=True) pv=lm.getPv() RV = {} RV['pv'] = pv RV['qv'] = FDR.qvalues(pv)[0] RV['lambd'] = getLambda(pv) RV['beta'] = lm.getBetaSNP() RV['posLead'] = SP.array([getRealPos(info['pos'][pv[0,:].argmin()],start,end,strand)]) RV['aDirPosLead'] = abs(SP.array([info['pos'][pv[0,:].argmin()]-0.5*(start+end)])) out_group = probe_group.create_group('lmm') dumpDictHdf5(RV,out_group) #print 'ok' except: continue f.close()
vcperm.addFixedEffect() vcperm.addRandomEffect(K=Kallperm) vcperm.addRandomEffect(is_noise=True) vcperm.optimize() permlm0 = vcnull.getLML() - vcperm.getLML() perm_file.write( "\t".join(map(str, [permlm0, permlm1])) + "\n") ## get trans PCs S_R, U_R = sp.linalg.eigh(Kc) F1 = U_R[:, ::-1][:, :10] # add an intercept term F1 = sp.concatenate([F1, sp.ones((F1.shape[0], 1))], 1) test = "lrt" #specify type of statistical test lmm0 = qtl.test_lmm(snps=Msnps, pheno=Y, K=Kallstd, covs=F1, test=test) pvalues = lmm0.getPv( ) # 1xS vector of p-values (S=X.shape[1]) betas = lmm0.getBetaSNP( ) # 1xS vector of effect sizes (S=X.shape[1]) ses = lmm0.beta_ste # 1xS vector of effect sizes standard errors (S=X.shape[1] RV = Mpos RV["pvaluesCisPCs"] = pvalues.T RV["betasCisPCs"] = betas.T RV["sesCisPCs"] = ses.T RV["gene"] = gene test = "lrt" #specify type of statistical test lmm2 = qtl.test_lmm(snps=Msnps,
def main():
    """Run genome-wide LM and LMM scans for a slice of phenotypes and plot.

    Six positional command-line arguments are read from ``sys.argv[1:]``,
    in this order:

        geno_file   genotype HDF5 file
        pheno_file  phenotype HDF5 file
        cov_file    tab-separated covariate table (header row, index column)
        RNA_start   first phenotype index (inclusive)
        RNA_end     last phenotype index (exclusive)
        out_dir     output directory

    Under ``out_dir`` the function writes a log file, per-phenotype p-value
    text files for both models and diagnostic figures (sample relatedness,
    phenotype histogram, Manhattan plots, phenotype vs. lead SNP, p-value
    histograms, Q-Q plots, LM vs. LMM p-value scatter).
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The data object allows querying specific genotype or phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position, offset=100000)

    logger.info('Calculating sample relatedness')
    # Normalize the relatedness matrix so its mean diagonal is one.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized
                          / sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    # Dedicated figure, closed after saving, so it does not stay open for
    # the rest of the run.
    fig = plt.figure()
    plt.imshow(sample_relatedness, aspect='auto')
    fig.savefig(os.path.join(sample_relatedness_dir, 'sample_relatedness_norm.png'))
    plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Label-based row selection in dataset sample order. `.loc` / `.values`
    # replace `.ix` / `.as_matrix()`, which current pandas no longer has.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps[sample_idx].astype('int'),
                     pheno=phenotype_vals.values, covs=cov, verbose=True)
    # P-values as DataFrames (SNPs in rows, phenotypes in columns).
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals.values, K=sample_relatedness,
                       covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots: LM on top, LMM below.
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Phenotype against the genotype of the SNP with the smallest LM p-value.
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lm.pvalues[ip].argmin()
        lead_snp = snps[s_idx, imax]
        # Small horizontal jitter keeps overlapping genotype classes readable.
        plt.plot(lead_snp + 0.05 * np.random.randn(lead_snp.shape[0]),
                 pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms: LM left, LMM right.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): newer matplotlib releases replaced `normed` with
        # `density`; confirm against the matplotlib version in use.
        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots: LM left, LMM right.
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Scatter of -log10 p-values, LM against LMM, with the identity line.
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plt.plot(-sp.log10(pvalues_lm[p_ID]), -sp.log10(pvalues_lmm[p_ID]), '.')
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Run a genome-wide LMM association scan for a slice of phenotypes.

    Command-line arguments (``sys.argv[1:]``, all eight required, in order):
        geno_file: genotype file handed to ``gr.genotype_reader_tables``.
        pheno_file: phenotype file handed to ``phr.pheno_reader_tables``.
        norm_mode: 'None', 'RIN' or 'boxcox'. Any other value is logged
            and the raw phenotype values are used.
        K_file: tab-separated sample-relatedness matrix (no header row,
            IDs in the first column), or the literal string 'None'.
        cov_file: tab-separated covariate table (header row, IDs in the
            first column), or the literal string 'None'.
        RNA_start, RNA_end: integer bounds of the slice of
            ``dataset.phenotype_ID`` to test.
        out_dir: output directory; 'logs', 'graphics' and 'results'
            sub-directories are created below it.

    Only the LMM is fitted; no plain linear-model (LM) comparison is
    produced. Per phenotype the function writes the LMM p-values to
    ``results/lmm_pval/<ID>.txt`` and saves a phenotype histogram, a
    Manhattan plot, a SNP-vs-phenotype scatter for the most significant
    SNP, a p-value histogram and a Q-Q plot below ``graphics/``.

    Raises:
        ValueError: if the number of command-line arguments is not 8.
    """
    args = sys.argv[1:]
    if len(args) != 8:
        raise ValueError(
            'Expected 8 arguments (geno_file pheno_file norm_mode K_file '
            'cov_file RNA_start RNA_end out_dir), got %d' % len(args))
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = args

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log file name is built from the raw argument strings, i.e. before
    # RNA_start/RNA_end are converted to int below.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG,
        console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The data object allows querying specific genotype or phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    # --- Sample relatedness (kinship) -----------------------------------
    logger.info('Sample relatedness %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
    else:
        logger.info('Loading sample relatedness from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None, index_col=0)
        # The file has no header: reuse the row IDs (formatted as integer
        # strings) as column labels so the matrix can be sliced by sample ID.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        pl.imshow(sample_relatedness, aspect='auto')
        plt.savefig(os.path.join(sample_relatedness_dir,
                                 'sample_relatedness.png'))
        # Close the implicit current figure so it is not kept alive for the
        # remainder of the run.
        plt.close()

    # --- Phenotypes -----------------------------------------------------
    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader,
        phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _maxlog = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized.  Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # --- Covariates -----------------------------------------------------
    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Rows are selected by label using the sample IDs as transformed by
        # add_xvec.
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    # --- Association test -----------------------------------------------
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # Convert p-values to a DataFrame (SNPs x phenotypes) for output writing.
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # --- Genome-wide Manhattan plots ------------------------------------
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # --- SNP vs. phenotype ----------------------------------------------
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader,
            phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        # Pick the SNP with the smallest LMM p-value for this phenotype.
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Small horizontal jitter so overlapping genotype values stay visible.
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # --- P-value histograms ---------------------------------------------
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was removed in Matplotlib 3.1; switch to
        # `density=True` once the minimum supported Matplotlib is >= 2.1.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        # Reference line: expected density under uniformly distributed
        # p-values.
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # --- Quantile-quantile plots ----------------------------------------
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def fitLMM(self, expr=None, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False, recalc=True, standardize=True):
    """Fit one linear mixed model per selected gene.

    Each selected gene (a column of ``self.Y``) is used as the phenotype
    and tested against every column of ``expr``.

    Args:
        expr:        matrix whose columns are tested against each gene.
                     If None, the standardized expression of the selected
                     genes is used.
        K:           random effect (or list of random effects) to be
                     considered in the analysis. If None, no random effect
                     is considered.
        tech_noise:  unused by this method; kept for interface
                     compatibility.
        idx:         indices of the genes to be considered in the analysis
        i0:          gene index from which the analysis starts
        i1:          gene index at which the analysis stops
                     (i0/i1 are only used when idx is None; if either is
                     missing, all ``self.G`` genes are used)
        verbose:     if True, print progress
        recalc:      if True, re-do variance decomposition
        standardize: if True, also standardize ``expr``

    Returns:
        pv:   matrix of p-values (selected genes x columns of expr)
        beta: matrix of effect sizes as returned by ``getBetaSNP``
        info: dictionary annotating pv and beta rows, containing
            gene_idx_row: index of the genes in rows
            conv:         boolean vector marking genes for which variance
                          decomposition has converged (only when it was
                          recomputed for more than one random effect)
            gene_row:     gene IDs of the rows (empty strings when
                          ``self.geneID`` is None)
    """
    # NOTE: identity checks (`is None`) are used throughout because the
    # arguments may be NumPy arrays, for which `== None` is elementwise and
    # cannot be used as an `if` condition.
    if idx is None:
        if i0 is None or i1 is None:
            i0 = 0
            i1 = self.G
        idx = SP.arange(i0, i1)
    elif not isinstance(idx, SP.ndarray):
        idx = SP.array(idx)
    # Only makes sense if the gene is expressed in at least one cell, i.e.
    # has non-zero variance.
    idx = SP.intersect1d(idx, SP.where(self.Y.std(0) > 0)[0])

    if K is not None:
        if not isinstance(K, list):
            K = [K]
        if recalc and (len(K) > 1 or self.var is None):
            print('performing variance decomposition first...')
            var_raw, var_info = self.varianceDecomposition(K=K, idx=idx, cache=False)
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]
        elif not recalc and len(K) > 1:
            assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'
            warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decompostion was performed earlier.')
            var_raw = self.var
            var_info = self.var_info
            var = var_raw / var_raw.sum(1)[:, SP.newaxis]

    lmm_params = {'covs': SP.ones([self.N, 1]),
                  'NumIntervalsDeltaAlt': 100,
                  'NumIntervalsDelta0': 100,
                  'searchDelta': True}

    # Standardize the selected genes (delta optimization might be more
    # efficient on standardized data).
    Yidx = self.Y[:, idx]
    Ystd = (Yidx - Yidx.mean(0)) / Yidx.std(0)

    if expr is None:
        expr = Ystd
    elif standardize:
        expr = (expr - expr.mean(0)) / expr.std(0)

    n_genes = idx.shape[0]
    n_expr = expr.shape[1]
    beta = SP.zeros((n_genes, n_expr))
    pv = SP.zeros((n_genes, n_expr))

    for g in range(n_genes):
        if verbose:
            print('.. fitting gene %d' % g)
        # Build the covariance for this gene.
        if K is None:
            _K = None
        elif len(K) > 1:
            if var_info['conv'][g]:
                # Combine the random effects weighted by this gene's
                # variance components, rescaled to unit mean diagonal.
                _K = SP.sum([var[g, i] * K_i for i, K_i in enumerate(K)], 0)
                _K /= _K.diagonal().mean()
            else:
                # Variance decomposition did not converge for this gene.
                _K = None
        else:
            _K = K[0]
        # Extract a single gene as the phenotype.
        lm = QTL.test_lmm(expr, Ystd[:, g:g + 1], K=_K, verbose=False, **lmm_params)
        pv[g, :] = lm.getPv()[0, :]
        beta[g, :] = lm.getBetaSNP()[0, :]

    if self.geneID is not None:
        geneID = SP.array(self.geneID)[idx]
    else:
        geneID = SP.zeros(n_genes, dtype=str)

    if recalc and K is not None and len(K) > 1:
        info = {'conv': var_info['conv'], 'gene_idx_row': idx}
    else:
        info = {'gene_idx_row': idx}
    info['gene_row'] = geneID

    return pv, beta, info