def main():
    """Train PANAMA models on a fixed genotype/phenotype pair.

    Input files and the output directory are hard-coded.  The function

    1. loads genotypes and phenotypes into a ``data.QTLData`` object,
    2. saves a bar plot of the first 50 values returned by
       ``panama.PC_varExplained`` (used to pick the PANAMA rank), and
    3. trains one PANAMA model for each rank in (10, 15, 20) and passes it
       to ``draw_and_save_panama`` together with the graphics and results
       file prefixes.
    """
    genotype_path = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1001tx_filter1.hdf5'
    phenotype_path = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01_1001g_vst_cv0p05T.hdf5'
    output_root = '/gale/netapp/home/shhuang/projects/1001_genomes/calc_k_panama_2016-02-03'

    # Output locations: figures go below <output_root>/graphics, everything
    # else directly into <output_root>.
    figure_dir = make_sure_path_exists(os.path.join(output_root, 'graphics'))
    figure_stem = os.path.join(figure_dir, 'calc_k_panama_2016-02-03-')
    result_stem = os.path.join(output_root, 'calc_k_panama_2016-02-03-')
    log_path = os.path.join(output_root, 'calc_k_panama_2016-02-03.log')
    logger = LoggerFactory.get_logger(log_path,
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', genotype_path)
    genotypes = gr.genotype_reader_tables(genotype_path)
    logger.info('Loading phenotype from %s', phenotype_path)
    traits = phr.pheno_reader_tables(phenotype_path)
    traits.sample_ID = strip_xvec(traits.sample_ID)

    # The QTLData object is the query interface for genotype/phenotype data.
    logger.info('Creating QTL dataset')
    qtl_data = data.QTLData(geno_reader=genotypes, pheno_reader=traits)

    expression, _ = qtl_data.getPhenotypes(intersection=True)
    kinship = qtl_data.getCovariance()

    # Plot the variance explained by the leading principal components to
    # judge how many ranks the PANAMA matrix should use.
    explained = panama.PC_varExplained(expression.values)
    figure = plt.figure(figsize=[5, 4])
    axes = pl.subplot(1, 1, 1)
    pl.bar(sp.arange(50) + 0.5, explained[:50], width=1)
    pl.xlim(0, 50)
    tick_positions = sp.linspace(0, 50, 11)
    tick_positions[0] = 1  # label the first tick as component 1, not 0
    axes.set_xticks(tick_positions)
    figure.savefig(figure_stem + 'cum_var.png')
    plt.close(figure)

    for rank in (10, 15, 20):
        model = panama.PANAMA(Y=expression.values, Kpop=kinship)
        logger.info('Training r=%d', rank)
        model.train(rank=rank)
        rank_tag = '_K%d' % rank
        draw_and_save_panama(model, figure_stem + rank_tag,
                             result_stem + rank_tag)
def main():
    """Export cumulative SNP positions and chromosome bounds as text files.

    Positional command-line arguments (``sys.argv[1:]``):

        geno_file   genotype file opened with ``gr.genotype_reader_tables``
        pheno_file  phenotype file opened with ``phr.pheno_reader_tables``
        out_dir     directory that receives ``get_geno_pos.log``,
                    ``position.txt`` and ``chromBounds.txt``
    """
    cli_args = sys.argv[1:]
    geno_file, pheno_file, out_dir = cli_args

    log_path = os.path.join(out_dir, 'get_geno_pos.log')
    logger = LoggerFactory.get_logger(log_path)
    LoggerFactory.log_command(logger, cli_args)

    logger.info('Loading genotype from %s', geno_file)
    genotypes = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    traits = phr.pheno_reader_tables(pheno_file)
    traits.sample_ID = strip_xvec(traits.sample_ID)

    # The QTLData object is the query interface for genotype/phenotype data.
    logger.info('Creating QTL dataset')
    qtl_data = data.QTLData(geno_reader=genotypes, pheno_reader=traits)

    # Only positions are needed here; the genotype matrix is not loaded.
    cum_position, chrom_bounds = data_util.estCumPos(
        position=qtl_data.getPos(), offset=100000)

    logger.info('Writing output to directory %s', out_dir)
    position_path = os.path.join(out_dir, 'position.txt')
    bounds_path = os.path.join(out_dir, 'chromBounds.txt')
    cum_position.astype(int).to_csv(position_path, header=True, index=False,
                                    sep='\t')
    # NOTE(review): np.savetxt is called with its default number format, so
    # the integer bounds are written in floating-point notation.
    np.savetxt(bounds_path, chrom_bounds.astype(int), delimiter=",")
def main():
    """Run LMM association tests for a slice of phenotypes and plot results.

    Positional command-line arguments (``sys.argv[1:]``):

        geno_file    genotype file opened with ``gr.genotype_reader_tables``
        pheno_file   phenotype file opened with ``phr.pheno_reader_tables``
        norm_mode    phenotype normalization: 'None', 'RIN' or 'boxcox'
        panama_file  HDF5 file whose 'Ktot' dataset is used as the LMM
                     relatedness matrix
        RNA_start    start index into ``dataset.phenotype_ID`` (inclusive)
        RNA_end      end index into ``dataset.phenotype_ID`` (exclusive)
        out_dir      output directory; 'logs', 'graphics' and 'results'
                     sub-directories are created inside it

    For every selected phenotype the LMM p-values are written to
    ``results/lmm_pval/<phenotype>.txt`` and a phenotype histogram, a
    Manhattan plot, a SNP-vs-phenotype scatter plot, a p-value histogram and
    a Q-Q plot are saved below ``graphics/``.  LM testing is disabled in this
    variant; only LMM results are produced.

    Exits with status 1 when ``norm_mode`` is not recognized.
    """
    (geno_file, pheno_file, norm_mode, panama_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Loading sample relatedness from %s', panama_file)
    # Copy the matrix into memory ([:]) so the HDF5 handle can be released
    # immediately instead of staying open for the rest of the run.
    with h5py.File(panama_file, 'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # The QTLData object is the query interface for genotype/phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)

    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, maxlog = preprocess.boxcox(phenotypes.values)
    else:
        # Without normalized values nothing below can run, so really stop
        # here instead of failing later with a NameError.
        logger.error('Normalization mode %s is not recognized. Exit',
                     norm_mode)
        sys.exit(1)

    N = snps.shape[0]            # number of individuals
    S = snps.shape[1]            # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=Ktot, covs=None, verbose=True)
    # One column of p-values per phenotype, one row per SNP.
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plot, one figure per phenotype.
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Phenotype against genotype at the most significant SNP.
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # Index of the SNP with the smallest LMM p-value for this phenotype.
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Small horizontal jitter keeps points with equal genotype apart.
        plt.plot(top_snp + 0.05 * np.random.randn(top_snp.shape[0]),
                 pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms; the red line marks the uniform density.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots.
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
# ------------------------------------------------------------------ #
#  Prepare data
# ------------------------------------------------------------------ #

# Genotype reader.
geno_reader = gr.genotype_reader_tables(
    '/home/hugot/projects/20150501_accessions/genotypes/snp250k/pygwas_genotypes_limix.hdf5'
)

# Phenotype reader.
pheno_reader = phr.pheno_reader_tables(
    '/home/hugot/projects/20150501_accessions/phenotypes/limix/accession_phenotypes_silique_early.hdf5'
)

# limix QTL dataset tying the two readers together.
dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

# SNP matrix and phenotype table (first element of the getPhenotypes result).
snps = dataset.getGenotypes()
phenotypes = dataset.getPhenotypes(intersection=True)[0]

# SNP positions with cumulative coordinates, plus chromosome boundaries.
pos, chromBounds = data_util.estCumPos(position=dataset.getPos(), offset=0)

# Keep only the two columns used for the multi-trait LMM
# (the original note refers to this as the TSS trait).
phenotypes_tss = phenotypes[['totalbr_mean_ln', 'totalbr_mean_hn']]

# Sample relatedness matrix, centered and normalized.
sample_relatedness = dataset.getCovariance(normalize=True, center=True)
def main():
    """Run LMM association tests with optional relatedness and covariates.

    Positional command-line arguments (``sys.argv[1:]``):

        geno_file   genotype file opened with ``gr.genotype_reader_tables``
        pheno_file  phenotype file opened with ``phr.pheno_reader_tables``
        norm_mode   phenotype normalization: 'None', 'RIN' or 'boxcox';
                    any other value falls back to no normalization
        K_file      tab-separated, header-less relatedness table whose first
                    column holds the sample IDs, or the literal 'None'
        cov_file    tab-separated covariate table with a header row and the
                    row labels in the first column, or the literal 'None'
        RNA_start   start index into ``dataset.phenotype_ID`` (inclusive)
        RNA_end     end index into ``dataset.phenotype_ID`` (exclusive)
        out_dir     output directory; 'logs', 'graphics' and 'results'
                    sub-directories are created inside it

    For every selected phenotype the LMM p-values are written to
    ``results/lmm_pval/<phenotype>.txt`` and a phenotype histogram, a
    Manhattan plot, a SNP-vs-phenotype scatter plot, a p-value histogram and
    a Q-Q plot are saved below ``graphics/``.  LM testing is disabled in this
    variant; only LMM results are produced.
    """
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log file name uses the index arguments exactly as they were typed.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The QTLData object is the query interface for genotype/phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    logger.info('Loading sample relatedness from %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
    else:
        logger.info('Start loading covariance from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None, index_col=0)
        # The file has no header: reuse the row IDs (as strings) as column
        # labels so the matrix can be indexed by sample ID on both axes.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values
        # Heat-map of the matrix; only drawn when a matrix was loaded.
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        fig = plt.figure()
        plt.imshow(sample_relatedness, aspect='auto')
        fig.savefig(os.path.join(sample_relatedness_dir,
                                 'sample_relatedness.png'))
        plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader, phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, maxlog = preprocess.boxcox(phenotypes.values)
    else:
        logger.warning('Normalization mode %s is not recognized. Use None',
                       norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]            # number of individuals
    S = snps.shape[1]            # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Rows are looked up by the sample IDs after add_xvec() has been
        # applied to them (the inverse of the strip_xvec() call above,
        # presumably re-adding an ID prefix -- verify against add_xvec).
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # One column of p-values per phenotype, one row per SNP.
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # Genome-wide Manhattan plot, one figure per phenotype.
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Phenotype against genotype at the most significant SNP.
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader, phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        # Index of the SNP with the smallest LMM p-value for this phenotype.
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Small horizontal jitter keeps points with equal genotype apart.
        plt.plot(top_snp + 0.05 * np.random.randn(top_snp.shape[0]),
                 pheno_vals.values,
                 '.',
                 alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms; the red line marks the uniform density.
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots.
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Run LM and LMM association tests for a slice of phenotypes.

    Positional command-line arguments (``sys.argv[1:]``):

        geno_file   genotype file opened with ``gr.genotype_reader_tables``
        pheno_file  phenotype file opened with ``phr.pheno_reader_tables``
        cov_file    tab-separated covariate table with a header row and the
                    row labels in the first column
        RNA_start   start index into ``dataset.phenotype_ID`` (inclusive)
        RNA_end     end index into ``dataset.phenotype_ID`` (exclusive)
        out_dir     output directory; 'logs', 'graphics' and 'results'
                    sub-directories are created inside it

    The relatedness matrix is computed from the genotypes and scaled by the
    mean of its diagonal.  For every selected phenotype the LM and LMM
    p-values are written to ``results/lm_pval`` and ``results/lmm_pval`` and
    the following figures are saved below ``graphics/``: phenotype histogram,
    Manhattan plots, SNP-vs-phenotype scatter, p-value histograms, Q-Q plots
    and an LM-vs-LMM p-value scatter.
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log file name uses the index arguments exactly as they were typed.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The QTLData object is the query interface for genotype/phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Calculating sample relatedness')
    # Scale the raw matrix so that its diagonal has mean 1.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    fig = plt.figure()
    plt.imshow(sample_relatedness, aspect='auto')
    fig.savefig(os.path.join(sample_relatedness_dir,
                             'sample_relatedness_norm.png'))
    plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]            # number of individuals
    S = snps.shape[1]            # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Rows are looked up by the sample IDs after add_xvec() has been applied
    # to them (the inverse of the strip_xvec() call above, presumably
    # re-adding an ID prefix -- verify against add_xvec).
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps[sample_idx].astype('int'),
                     pheno=phenotype_vals.values,
                     covs=cov, verbose=True)
    # One column of p-values per phenotype, one row per SNP.
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals.values,
                       K=sample_relatedness, covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots: LM on top, LMM below.
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Phenotype against genotype at the most significant LM SNP.
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # Index of the SNP with the smallest LM p-value for this phenotype.
        imax = lm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Small horizontal jitter keeps points with equal genotype apart.
        plt.plot(top_snp + 0.05 * np.random.randn(top_snp.shape[0]),
                 pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms (LM left, LMM right); the red line marks the
    # uniform density.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots (LM left, LMM right).
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # -log10(p) of LM against LMM with the identity line for reference.
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plt.plot(-np.log10(pvalues_lm[p_ID].values),
                 -np.log10(pvalues_lmm[p_ID].values), '.')
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Plot seed size and AT4G26530 expression against genotype.

    Input files and the output directory are hard-coded.  For every SNP
    index that ``geno_reader.getGenoIndex`` returns for chromosome 4,
    positions 13393142-13393144, two figures are written below
    ``<out_dir>/graphics``:

    * ``...point_chr<chrom>_<pos>.png`` -- jittered scatter plots of seed
      size and of AT4G26530 expression (labelled "FBA5 expression") against
      the SNP genotype;
    * ``...bxp_chr<chrom>_<pos>.png`` -- box plots of the same two
      quantities for genotype values 0 and 2.

    Seed size and expression come from different phenotype files, so each
    one is paired with the genotypes through its own ``QTLData`` sample
    index.
    """
    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1kgen_filter3.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/data/1001_genomes/seed_size/accx_size.hdf5'
    expr_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-02-06/ath1001_tx_norm_2016-02-06-UQ_gNorm_normCounts_k4_vst2T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/draw_seedsize_plots_2016-11-30'

    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'draw_seedsize_plots_2016-11-30-')
    logger = LoggerFactory.get_logger(
        os.path.join(out_dir, 'draw_seedsize_plots_2016-11-30.log'),
        file_level=logging.DEBUG, console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading expression from %s', expr_file)
    expr_reader = phr.pheno_reader_tables(expr_file)
    expr_reader.sample_ID = strip_xvec(expr_reader.sample_ID)

    # One QTLData object per phenotype source; each provides the sample
    # indices that align that source with the genotypes.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    exprset = data.QTLData(geno_reader=geno_reader, pheno_reader=expr_reader)

    # Read every sample from both phenotype sources (all-True selection).
    pheno_sample_select = np.ones(pheno_reader.sample_ID.shape[0], dtype=bool)
    phenotypes, _ = pheno_reader.getPhenotypes(sample_idx=pheno_sample_select)
    expr_sample_select = np.ones(expr_reader.sample_ID.shape[0], dtype=bool)
    expr, _ = expr_reader.getPhenotypes(sample_idx=expr_sample_select)

    snps = geno_reader.getGenotypes()
    position = geno_reader.getPos()
    position, _ = data_util.estCumPos(position=position, offset=0)

    gid_start, gid_end = geno_reader.getGenoIndex(chrom=4,
                                                  pos_start=(4, 13393142),
                                                  pos_end=(4, 13393144))
    gid_range = np.arange(gid_start, gid_end + 1)

    # Sample alignment does not depend on the SNP, so look it up once.
    gs_idx = dataset.sample_idx["geno"].values
    ps_idx = dataset.sample_idx["pheno"].values
    egs_idx = exprset.sample_idx["geno"].values
    eps_idx = exprset.sample_idx["pheno"].values
    phenotypes_sub = phenotypes.values[ps_idx]
    fba5_sub = expr['AT4G26530'].values[eps_idx]

    for snp_index in gid_range:
        snp_column = [snp_index]
        logger.info('SNP index: %s', snp_index)
        snps_sub = snps[np.ix_(gs_idx, snp_column)][:, 0]
        esnps_sub = snps[np.ix_(egs_idx, snp_column)][:, 0]

        position_sub = position.iloc[snp_column]
        logger.info('SNP position:\n%s', position_sub)
        # Pull the scalars out explicitly; formatting a one-element Series
        # with %d relies on an implicit conversion that pandas deprecates.
        chrom = position_sub['chrom'].iloc[0]
        pos = position_sub['pos'].iloc[0]

        # Scatter plots; small horizontal jitter separates equal genotypes.
        point_file = graphics_prefix + 'point_chr%d_%d.png' % (chrom, pos)
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        plt.plot(snps_sub + 0.05 * np.random.randn(snps_sub.shape[0]),
                 phenotypes_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        plt.plot(esnps_sub + 0.05 * np.random.randn(esnps_sub.shape[0]),
                 fba5_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(point_file)
        plt.close(fig)

        # Box plots comparing genotype value 0 (first box) with 2 (second).
        bxp_file = graphics_prefix + 'bxp_chr%d_%d.png' % (chrom, pos)
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        phenotypes_box = [
            phenotypes_sub[snps_sub == 0], phenotypes_sub[snps_sub == 2]
        ]
        plt.boxplot(phenotypes_box)
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        fba5_box = [fba5_sub[esnps_sub == 0], fba5_sub[esnps_sub == 2]]
        plt.boxplot(fba5_box)
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(bxp_file)
        plt.close(fig)