def main():
    """Compute a sample covariance matrix from a genotype file and save it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in       -- genotype file passed to ``gr.genotype_reader_tables``
        norm_cov      -- the string ``'1'`` requests ``normalize=True``; any
                         other value requests ``normalize=False``
        cov_out_hdf5  -- output HDF5 path; ``<cov_out_hdf5>.log`` is the log file
        cov_out_csv   -- output text path
    """
    geno_in, norm_cov, cov_out_hdf5, cov_out_csv = sys.argv[1:]

    logger = LoggerFactory.get_logger(cov_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    ## Import genotype data
    logger.info('Loading genotype from %s', geno_in)
    geno_reader = gr.genotype_reader_tables(geno_in)

    norm = (norm_cov == '1')
    if norm:
        logger.info('Normalizing')
    else:
        logger.info('NOT normalizing')
    sample_relatedness = geno_reader.getCovariance(normalize=norm)

    logger.info('Saving covariance to HDF5 file %s', cov_out_hdf5)
    # The context manager closes (and flushes) the file even if the dump raises.
    with h5py.File(cov_out_hdf5, 'w') as o:
        util_functions.smartDumpDictHdf5({'Cov': sample_relatedness}, o)

    logger.info('Saving covariance to CSV file %s', cov_out_csv)
    save_cov_in_text_format(cov_out_csv, sample_relatedness,
                            geno_reader.sample_ID)

    logger.info('Done!')
def main():
    """Compute an IBS kinship matrix from a genotype file and save it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in           -- genotype file passed to
                             ``genotype.load_hdf5_genotype_data``
        scale_kinship     -- the string ``'1'`` applies ``kinship.scale_k``;
                             any other value leaves the matrix unscaled
        kinship_out_hdf5  -- output HDF5 path; ``<kinship_out_hdf5>.log`` is
                             the log file
        kinship_out_csv   -- output text path
    """
    geno_in, scale_kinship, kinship_out_hdf5, kinship_out_csv = sys.argv[1:]

    logger = LoggerFactory.get_logger(kinship_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    logger.info('Finished reading SNP from %s', geno_in)

    logger.info('Start calculating kinship')
    K = geno.get_ibs_kinship_matrix()
    if scale_kinship == '1':
        logger.info('Scaling')
        K = kinship.scale_k(K)
    else:
        logger.info('NOT scaling')

    logger.info('Saving kinship to HDF5 file %s', kinship_out_hdf5)
    kinship.save_kinship_to_file(kinship_out_hdf5, K, geno.accessions,
                                 geno.num_snps)

    logger.info('Saving kinship to CSV file %s', kinship_out_csv)
    save_kinship_in_text_format(kinship_out_csv, K, geno.accessions)

    logger.info('Done!')
def main():
    """Convert DMR and/or methylation-site tsv files into one HDF5 file.

    Options:
        -O/--outfile  output HDF5 path (default ``example_out``);
                      ``<outfile>.log`` is the log file
        -R/--dmr      DMR tsv file to convert (optional)
        -S/--dms      methylation-site tsv file to convert (optional)
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-O", "--outfile", action="store", dest='outfile',
                      type=str,
                      help='The output hdf5 file wiht the resulting data',
                      default="example_out")
    parser.add_option("-R", "--dmr", action="store", dest='dmr',
                      help="Read DMR tsv file (filename)", default=None)
    parser.add_option("-S", "--dms", action="store", dest='dms',
                      help="Read methylation site tsv file (filename)",
                      default=None)
    options, _ = parser.parse_args()

    logger = LoggerFactory.get_logger(options.outfile + '.log')
    LoggerFactory.log_command(logger, sys.argv)

    # Mode 'a' (read/write, create if missing) is spelled out because the
    # implicit default became read-only in h5py 3.0; the `with` block makes
    # sure the file is flushed and closed even if a conversion fails.
    with h5py.File(options.outfile, 'a') as hdf:
        if options.dmr is not None:
            logger.info('Converting DMR tsv %s and write to %s',
                        options.dmr, options.outfile)
            convert_dmr_tsv(hdf, options.dmr, chrom=None, start=None,
                            end=None, sample_subset=None)

        if options.dms is not None:
            logger.info('Converting DMS tsv %s and write to %s',
                        options.dms, options.outfile)
            convert_dms_tsv(hdf, options.dms, chrom=None, start=None,
                            end=None, sample_subset=None)

    logger.info('Done!')
def main():
    """Subset a genotype file to listed accessions and filter SNPs by MAF.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in   -- genotype file passed to
                     ``genotype.load_hdf5_genotype_data``
        acc_in    -- CSV file listing the accessions to keep
        maf_lb    -- lower MAF bound (parsed as float)
        maf_ub    -- upper MAF bound (parsed as float)
        geno_out  -- output HDF5 path; ``<geno_out>.log`` is the log file
    """
    geno_in, acc_in, maf_lb, maf_ub, geno_out = sys.argv[1:]

    logger = LoggerFactory.get_logger(geno_out + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])
    maf_lb, maf_ub = float(maf_lb), float(maf_ub)

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    SNP_acc = list(geno.accessions)
    logger.info('Finished reading SNP from %s', geno_in)

    ## accession subset
    # csv.reader yields one list per row, so the rows are flattened into a
    # set of cell values; comparing an accession against whole rows can
    # never match.  Text mode keeps csv.reader usable on Python 3.
    # NOTE(review): assumes every non-empty cell is an accession ID -- confirm
    # against the actual layout of the accession file.
    with open(acc_in) as f:
        file_acc = set(cell.strip() for row in csv.reader(f) for cell in row
                       if cell.strip())
    logger.info('Finished reading accession subset from %s', acc_in)

    ## get common accessions in the same order for genotype and accession subset
    # Map each accession to its first position in the genotype file (what
    # list.index() returned) without a linear scan per accession.
    first_ix = {}
    for i, acc in enumerate(SNP_acc):
        first_ix.setdefault(acc, i)
    keep_ix = [first_ix[acc] for acc in SNP_acc if acc in file_acc]
    logger.info('Number of accessions: genotype file %d, subset file %d, '
                'common %d', len(SNP_acc), len(file_acc), len(keep_ix))

    ## filtering
    logger.info(
        'Start subsetting accessions and filtering SNPs by MAF >%f and <=%f',
        maf_lb, maf_ub)
    geno.filter_accessions_ix(keep_ix)
    (num_snps, num_removed) = filter_maf_snps(geno, maf_lb, maf_ub)
    logger.info('Removed %d from %d SNPs', num_removed, num_snps)
    logger.info('Number of SNPs remaining %d', geno.num_snps)

    logger.info('Start writing filtered genotype file to %s', geno_out)
    geno.save_as_hdf5(geno_out)
    logger.info('Finished')

    logger.info('Done!')
def main():
    """Write SNP positions and chromosome bounds of a QTL dataset to text.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file   -- genotype file passed to ``gr.genotype_reader_tables``
        pheno_file  -- phenotype file passed to ``phr.pheno_reader_tables``
        out_dir     -- directory receiving ``get_geno_pos.log``,
                       ``position.txt`` and ``chromBounds.txt``
    """
    geno_file, pheno_file, out_dir = sys.argv[1:]

    log_path = os.path.join(out_dir, 'get_geno_pos.log')
    logger = LoggerFactory.get_logger(log_path)
    LoggerFactory.log_command(logger, sys.argv[1:])

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The QTLData object ties the genotype and phenotype readers together.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

    raw_position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=raw_position,
                                                offset=100000)

    logger.info('Writing output to directory %s', out_dir)
    # Both tables are cast to int before anything is written.
    position_int = position.astype(int)
    bounds_int = chromBounds.astype(int)

    position_path = os.path.join(out_dir, 'position.txt')
    bounds_path = os.path.join(out_dir, 'chromBounds.txt')
    position_int.to_csv(position_path, header=True, index=False, sep='\t')
    np.savetxt(bounds_path, bounds_int, delimiter=",")
def main(geno_file=None, pheno_file=None, out_dir=None, ranks=(10, 15, 20)):
    """Train PANAMA models at several ranks and save plots and results.

    Called without arguments this reproduces the original hard-coded run.

    Args:
        geno_file: genotype file passed to ``gr.genotype_reader_tables``;
            ``None`` selects the original hard-coded path.
        pheno_file: phenotype file passed to ``phr.pheno_reader_tables``;
            ``None`` selects the original hard-coded path.
        out_dir: output directory (log, results, ``graphics`` sub-directory);
            ``None`` selects the original hard-coded path.
        ranks: iterable of PANAMA ranks to train, one model per rank.
    """
    if geno_file is None:
        geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1001tx_filter1.hdf5'
    if pheno_file is None:
        pheno_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01_1001g_vst_cv0p05T.hdf5'
    if out_dir is None:
        out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/calc_k_panama_2016-02-03'

    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'calc_k_panama_2016-02-03-')
    results_prefix = os.path.join(out_dir, 'calc_k_panama_2016-02-03-')
    logger = LoggerFactory.get_logger(
        os.path.join(out_dir, 'calc_k_panama_2016-02-03.log'),
        file_level=logging.DEBUG, console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

    # import data
    phenotypes, _ = dataset.getPhenotypes(intersection=True)
    sample_relatedness = dataset.getCovariance()

    # determine the number of ranks to consider in the PANAMA matrix
    # by looking at the variance explained by PCs
    cum_var = panama.PC_varExplained(phenotypes.values)
    # Plot at most the first 50 components; clamping keeps the x and height
    # arrays the same length when fewer than 50 are available.
    n_bars = min(50, len(cum_var))
    out_file = graphics_prefix + 'cum_var.png'
    fig = plt.figure(figsize=[5, 4])
    subplt = pl.subplot(1, 1, 1)
    pl.bar(sp.arange(n_bars) + 0.5, cum_var[:n_bars], width=1)
    pl.xlim(0, 50)
    ticks = sp.linspace(0, 50, 11)
    ticks[0] = 1  # label the first tick as component 1 rather than 0
    subplt.set_xticks(ticks)
    fig.savefig(out_file)
    plt.close(fig)

    for r in ranks:
        p = panama.PANAMA(Y=phenotypes.values, Kpop=sample_relatedness)
        logger.info('Training r=%d', r)
        p.train(rank=r)
        draw_and_save_panama(p, graphics_prefix + '_K%d' % r,
                             results_prefix + '_K%d' % r)
def main():
    """Run an LMM association scan using a precomputed kinship and plot it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file    -- genotype file passed to ``gr.genotype_reader_tables``
        pheno_file   -- phenotype file passed to ``phr.pheno_reader_tables``
        norm_mode    -- phenotype normalization: ``'None'``, ``'RIN'`` or
                        ``'boxcox'``; anything else aborts the run
        panama_file  -- HDF5 file whose ``'Ktot'`` dataset is used as kinship
        RNA_start    -- start index (inclusive) into ``dataset.phenotype_ID``
        RNA_end      -- end index (exclusive) into ``dataset.phenotype_ID``
        out_dir      -- output root; ``logs``, ``graphics`` and ``results``
                        sub-directories are created below it

    For every selected phenotype the function writes LMM p-values to
    ``results/lmm_pval/<id>.txt`` and saves a phenotype histogram, Manhattan
    plot, SNP-vs-phenotype plot, p-value histogram and Q-Q plot under
    ``graphics``.
    """
    (geno_file, pheno_file, norm_mode, panama_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Loading sample relatedness from %s', panama_file)
    # [:] reads the dataset into memory, so the file can be closed right away.
    with h5py.File(panama_file, 'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()  # SNPS
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)

    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        # Stop here: continuing would fail later with an undefined
        # phenotype_vals instead of a clear message.
        logger.error('Normalization mode %s is not recognized. Exit',
                     norm_mode)
        sys.exit(1)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals, K=Ktot, covs=None, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plot, one file per phenotype
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # SNP with the smallest LMM p-value for this phenotype
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter separates points that share a genotype value
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was dropped from recent matplotlib releases
        # in favour of `density`; confirm the pinned matplotlib version.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")  # reference line at density 1
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Write a copy of a genome FASTA with methylated cytosines re-coded.

    Positional arguments:
        genome      -- input FASTA file
        allc_h5     -- HDF5 store with one ``allc_<chrom>`` table per
                       chromosome; rows with ``mcall==1`` are used
        genome_mod  -- output FASTA path; ``<genome_mod>.log`` is the log file
    Options:
        --zero      -- treat allc positions as 0-based (default is 1-based)

    A methylated position on the ``+`` strand must hold ``C``/``c`` and is
    replaced by ``m``; on the ``-`` strand it must hold ``G``/``g`` and is
    replaced by ``1``.  Mismatches and out-of-range positions are logged and
    left unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Modified genome FASTA file by 5mC')
    parser.add_argument('genome')
    parser.add_argument('allc_h5')
    parser.add_argument('genome_mod')
    parser.add_argument('--zero', help='whether the allc table is 0-index',
                        action='store_true')
    args = parser.parse_args()
    genome, allc_h5, genome_mod = args.genome, args.allc_h5, args.genome_mod
    offset = 0 if args.zero else 1  # allc table positions are 1-based

    logger = LoggerFactory.get_logger(genome_mod + '.log')
    LoggerFactory.log_command(logger, sys.argv)

    logger.info('Getting unmodified genome FASTA file from %s', genome)
    coord_pat = re.compile('^(chr)?(.+)$')
    fa_list = list(SeqIO.parse(genome, "fasta"))
    fa_dict = dict((r.id, r.seq) for r in fa_list)

    # Map the chromosome name without any 'chr' prefix to the FASTA record
    # id: 'chr1' -> key '1', and '1' -> key '1'.
    fa_key_by_chrom = dict()
    for k in fa_dict:
        chrom = coord_pat.match(k).group(2)
        fa_key_by_chrom[chrom] = k
    logger.info('Chromosomes: %s', list(fa_key_by_chrom))

    logger.info('Modifying FASTA by all_c %s', allc_h5)
    fa_dict_out = dict()
    for chrom, fa_key in fa_key_by_chrom.items():
        logger.info('Reading allc table for chromosome %s from %s',
                    chrom, allc_h5)
        hdf = read_hdf(allc_h5, 'allc_' + str(chrom), where=['mcall==1'],
                       columns=['pos', 'strand', 'context'])
        mc_pos, mc_strand, mc_type = hdf['pos'], hdf['strand'], hdf['context']
        logger.info('Number of methylated C: %d', len(mc_pos))

        # NOTE(review): Seq.tomutable() is not available in recent Biopython
        # releases -- confirm the pinned version before upgrading.
        seq = fa_dict[fa_key].tomutable()
        seq_len = len(seq)
        for p, s, t in zip(mc_pos, mc_strand, mc_type):
            p0 = p - offset
            # Valid indices are 0 .. seq_len-1.  A negative index would
            # silently wrap to the end of the sequence, and p0 == seq_len
            # would raise IndexError, so both are skipped.
            if p0 < 0 or p0 >= seq_len:
                logger.info('Index out of bound for %s: mc_pos %d, len %d',
                            fa_key, p0, seq_len)
                continue
            base = seq[p0]
            if s == '+':
                if base in ('C', 'c'):
                    seq[p0] = 'm'
                else:
                    logger.warning(
                        'Sequence %s: expected to see C/c (%s:%s) at %d, but saw %s',
                        fa_key, t, s, p0, base)
            elif s == '-':
                if base in ('G', 'g'):
                    seq[p0] = '1'
                else:
                    logger.warning(
                        'Sequence %s: expected to see G/g (%s:%s) at %d, but saw %s',
                        fa_key, t, s, p0, base)
            else:
                logger.warning('Sequence %s: unrecognized strand %s at %d',
                               fa_key, s, p0)
        fa_dict_out[fa_key] = seq

    # Re-attach the modified sequences in the original record order.
    for r in fa_list:
        r.seq = fa_dict_out[r.id]
    with open(genome_mod, 'w') as ofa:
        SeqIO.write(fa_list, ofa, 'fasta')

    logger.info('Done!')
def main():
    """Run an LMM association scan with optional kinship and covariates.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file   -- genotype file passed to ``gr.genotype_reader_tables``
        pheno_file  -- phenotype file passed to ``phr.pheno_reader_tables``
        norm_mode   -- phenotype normalization: ``'None'``, ``'RIN'`` or
                       ``'boxcox'``; any other value falls back to no
                       normalization
        K_file      -- tab-separated sample relatedness matrix without
                       header, or the string ``'None'`` for no kinship
        cov_file    -- tab-separated covariate table with header, or the
                       string ``'None'`` for no covariates
        RNA_start   -- start index (inclusive) into ``dataset.phenotype_ID``
        RNA_end     -- end index (exclusive) into ``dataset.phenotype_ID``
        out_dir     -- output root; ``logs``, ``graphics`` and ``results``
                       sub-directories are created below it

    For every selected phenotype the function writes LMM p-values to
    ``results/lmm_pval/<id>.txt`` and saves a phenotype histogram, Manhattan
    plot, SNP-vs-phenotype plot, p-value histogram and Q-Q plot under
    ``graphics``.
    """
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()  # SNPS
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    logger.info('Loading sample relatedness from %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
    else:
        logger.info('Start loading covariance from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None,
                           index_col=0)  # accessions x accessions
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        # .values replaces as_matrix(), which newer pandas no longer provides
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values
        # The heat map is only drawn when a matrix was actually loaded.
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        fig = plt.figure()
        pl.imshow(sample_relatedness, aspect='auto')
        fig.savefig(os.path.join(sample_relatedness_dir,
                                 'sample_relatedness.png'))
        plt.close(fig)  # do not leave the figure open behind later plots

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader, phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized. Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Rows are selected by label; .loc/.values replace .ix/.as_matrix(),
        # which newer pandas no longer provides.
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals, K=sample_relatedness,
                       covs=cov, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plot, one file per phenotype
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader, phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        # SNP with the smallest LMM p-value for this phenotype
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter separates points that share a genotype value
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was dropped from recent matplotlib releases
        # in favour of `density`; confirm the pinned matplotlib version.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")  # reference line at density 1
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Run LM and LMM association scans with covariates and plot both.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file   -- genotype file passed to ``gr.genotype_reader_tables``
        pheno_file  -- phenotype file passed to ``phr.pheno_reader_tables``
        cov_file    -- tab-separated covariate table with header
        RNA_start   -- start index (inclusive) into ``dataset.phenotype_ID``
        RNA_end     -- end index (exclusive) into ``dataset.phenotype_ID``
        out_dir     -- output root; ``logs``, ``graphics`` and ``results``
                       sub-directories are created below it

    Sample relatedness is computed from the genotypes and divided by the mean
    of its diagonal.  For every selected phenotype the function writes LM and
    LMM p-values under ``results`` and saves a phenotype histogram, Manhattan
    plots, a SNP-vs-phenotype plot, p-value histograms, Q-Q plots and an
    LM-vs-LMM scatter plot under ``graphics``.
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()  # SNPS
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Calculating sample relatedness')
    # un-normalized matrix, then scaled by the mean of its diagonal
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    fig = plt.figure()
    pl.imshow(sample_relatedness, aspect='auto')
    fig.savefig(os.path.join(sample_relatedness_dir,
                             'sample_relatedness_norm.png'))
    plt.close(fig)  # do not leave the figure open behind later plots

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Rows are selected by label; .loc/.values replace .ix/.as_matrix(),
    # which newer pandas no longer provides.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps[sample_idx].astype('int'),
                     pheno=phenotype_vals.values, covs=cov, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals.values, K=sample_relatedness,
                       covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots: LM on top, LMM below
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # SNP with the smallest LM p-value for this phenotype
        imax = lm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter separates points that share a genotype value
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms: LM left, LMM right
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was dropped from recent matplotlib releases
        # in favour of `density`; confirm the pinned matplotlib version.
        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")  # reference line at density 1
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots: LM left, LMM right
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P value scatter plot
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        # np.log10 replaces the deprecated scipy alias sp.log10; p-values are
        # non-negative, so the results are the same.
        plt.plot(-np.log10(pvalues_lm[p_ID]), -np.log10(pvalues_lmm[p_ID]),
                 '.')
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')  # identity line
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')
    logger.info('Done!')
def main():
    """Load genotype, expression, kinship and covariates, then test one gene chunk.

    Seven positional command-line arguments are read from ``sys.argv[1:]``:

    * ``geno_hdf5``    - genotype HDF5 file given to
      ``genotype.load_hdf5_genotype_data``.
    * ``kinship_hdf5`` - kinship file given to ``kinship.load_kinship_from_file``.
    * ``RNA_csv``      - tab-separated expression table, genes in rows and
      accessions in columns.
    * ``COV_file``     - tab-separated covariate table whose rows are looked up
      by accession label.
    * ``RNA_start``, ``RNA_end`` - integer bounds of the gene slice to test
      (used as ``RNA_genes[RNA_start:RNA_end]``).
    * ``out_file``     - output path handed to ``run_lmm_chunk``; the logger is
      created with ``out_file + '.log'``.

    Genotype accession names are prefixed with ``'X'`` before being compared
    with the expression/covariate labels.

    Raises:
        ValueError: if ``sys.argv`` does not carry exactly seven arguments
            (tuple unpacking), if the start/end values are not integers, or if
            an accession shared by genotype and expression is absent from the
            kinship file.
    """
    (geno_hdf5, kinship_hdf5, RNA_csv, COV_file,
     RNA_start, RNA_end, out_file) = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger = LoggerFactory.get_logger(out_file + '.log',
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])

    def first_index(labels):
        # Map each label to the position of its first occurrence, which is
        # what list.index() would return, but with O(1) lookups afterwards.
        lookup = {}
        for pos, label in enumerate(labels):
            lookup.setdefault(label, pos)
        return lookup

    ## Import genotype data
    logger.info('Start reading SNP from %s', geno_hdf5)
    geno = genotype.load_hdf5_genotype_data(geno_hdf5)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', geno_hdf5)

    ## Import phenotype data
    logger.info('Start reading RNA from %s', RNA_csv)
    RNA_df = pd.read_csv(RNA_csv, sep='\t', header=0,
                         index_col=0)  # genes x accessions
    RNA_accx = list(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_csv)

    ## get common accessions in the same order for genotype, phenotype and
    ## phenotype covariates (genotype order is the reference order)
    logger.info('Consolidate accessions from genotype and RNA file')
    rna_accx_set = set(RNA_accx)
    accx_common = [accx for accx in SNP_accx if accx in rna_accx_set]
    RNA = RNA_df.loc[:, accx_common].values.T  # accession x genes

    snp_pos = first_index(SNP_accx)
    # Every common accession comes from SNP_accx, so the lookup cannot miss.
    geno.filter_accessions_ix([snp_pos[accx] for accx in accx_common])
    logger.info(
        'Number of accessions: genotype file %d, RNA file %d, common %d',
        len(SNP_accx), len(RNA_accx), len(accx_common))

    logger.info('Start building SNP matrix in memory')
    # Materialise the chunks first: np.vstack expects a sequence of arrays.
    snps = np.vstack(list(geno.get_snps_iterator(is_chunked=True)))
    snps = snps.T.astype(int)
    logger.info('Finished')

    logger.info('Start loading kinship matrix from %s', kinship_hdf5)
    load_k = kinship.load_kinship_from_file(kinship_hdf5, scaled=False)
    K0 = load_k['k'].astype(float)
    K_accx = ['X' + acc for acc in load_k['accessions']]
    k_pos = first_index(K_accx)
    missing = [accx for accx in accx_common if accx not in k_pos]
    if missing:
        # Fail with a readable message instead of an obscure indexing error
        # caused by None entries in the index list.
        raise ValueError(
            'Kinship file %s lacks %d accession(s) shared by the genotype '
            'and RNA files, e.g. %s' % (kinship_hdf5, len(missing),
                                        ', '.join(missing[:5])))
    k_ix = [k_pos[accx] for accx in accx_common]
    # Reorder/subset rows and columns of the kinship matrix to accx_common.
    K = K0[np.ix_(k_ix, k_ix)]
    logger.info('Finished')

    logger.info('Start loading covariance from %s', COV_file)
    COV_df = pd.read_csv(COV_file, sep='\t', header=0, index_col=0)
    # Rows are selected by accession label.
    # NOTE(review): recent pandas raises KeyError from .loc when a label is
    # missing, whereas the removed .ix indexer produced NaN rows - confirm
    # the covariate file always covers every common accession.
    COV = COV_df.loc[accx_common].values
    logger.info('Finished')

    logger.info('Start association testing: RNA start %d, RNA end %d',
                RNA_start, RNA_end)
    run_lmm_chunk(snps, RNA, COV, RNA_start, RNA_end,
                  list(RNA_genes[RNA_start:RNA_end]), K, out_file)
def main():
    """Plot seed size and AT4G26530 expression against genotype for a few SNPs.

    All input and output locations are hard-coded below. For every genotype
    index between the two indices returned by
    ``geno_reader.getGenoIndex(chrom=4, pos_start=(4, 13393142),
    pos_end=(4, 13393144))`` (both ends included) two PNG files are written
    under the ``graphics`` sub-directory of ``out_dir``:

    * ``...point_chr<chrom>_<pos>.png`` - genotype (with Gaussian jitter)
      versus seed size and versus AT4G26530 ("FBA5") expression.
    * ``...bxp_chr<chrom>_<pos>.png``   - box plots of the same two traits for
      samples with genotype value 0 and genotype value 2.

    The function takes no arguments and returns ``None``.
    """
    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1kgen_filter3.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/data/1001_genomes/seed_size/accx_size.hdf5'
    expr_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-02-06/ath1001_tx_norm_2016-02-06-UQ_gNorm_normCounts_k4_vst2T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/draw_seedsize_plots_2016-11-30'
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'draw_seedsize_plots_2016-11-30-')

    logger = LoggerFactory.get_logger(
        os.path.join(out_dir, 'draw_seedsize_plots_2016-11-30.log'),
        file_level=logging.DEBUG, console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)

    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Loading expression from %s', expr_file)
    expr_reader = phr.pheno_reader_tables(expr_file)
    expr_reader.sample_ID = strip_xvec(expr_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    exprset = data.QTLData(geno_reader=geno_reader, pheno_reader=expr_reader)

    # import data: request every sample from both phenotype readers
    pheno_sample_select = np.ones(pheno_reader.sample_ID.shape[0], dtype=bool)
    phenotypes, _ = pheno_reader.getPhenotypes(sample_idx=pheno_sample_select)
    expr_sample_select = np.ones(expr_reader.sample_ID.shape[0], dtype=bool)
    expr, _ = expr_reader.getPhenotypes(sample_idx=expr_sample_select)

    snps = geno_reader.getGenotypes()
    position = geno_reader.getPos()
    position, _ = data_util.estCumPos(position=position, offset=0)

    gid_start, gid_end = geno_reader.getGenoIndex(chrom=4,
                                                  pos_start=(4, 13393142),
                                                  pos_end=(4, 13393144))
    gid_range = np.arange(gid_start, gid_end + 1)

    # Sample alignment and trait vectors do not depend on the SNP, so they are
    # computed once instead of on every loop iteration.
    gs_idx = dataset.sample_idx["geno"].values
    ps_idx = dataset.sample_idx["pheno"].values
    egs_idx = exprset.sample_idx["geno"].values
    eps_idx = exprset.sample_idx["pheno"].values
    phenotypes_sub = phenotypes.values[ps_idx]
    fba5_sub = expr['AT4G26530'].values[eps_idx]

    for ig, gid in enumerate(gid_range):
        # One-element slice so np.ix_ keeps a 2-D result to take column 0 of.
        g_col = gid_range[ig:(ig + 1)]
        print(g_col)

        snps_sub = snps[np.ix_(gs_idx, g_col)][:, 0]
        esnps_sub = snps[np.ix_(egs_idx, g_col)][:, 0]

        position_sub = position.iloc[[gid]]
        print(position_sub)
        # Pull the scalars out explicitly rather than relying on implicit
        # int() conversion of a one-element Series inside '%d'.
        site_tag = 'chr%d_%d.png' % (int(position_sub['chrom'].iloc[0]),
                                     int(position_sub['pos'].iloc[0]))

        # Jittered genotype-versus-trait scatter plots.
        point_file = graphics_prefix + 'point_' + site_tag
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        plt.plot(snps_sub + 0.05 * np.random.randn(snps_sub.shape[0]),
                 phenotypes_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        plt.plot(esnps_sub + 0.05 * np.random.randn(esnps_sub.shape[0]),
                 fba5_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(point_file)
        plt.close(fig)

        # Box plots comparing samples with genotype value 0 against value 2.
        bxp_file = graphics_prefix + 'bxp_' + site_tag
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        phenotypes_box = [phenotypes_sub[snps_sub == 0],
                          phenotypes_sub[snps_sub == 2]]
        plt.boxplot(phenotypes_box)
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        fba5_box = [fba5_sub[esnps_sub == 0], fba5_sub[esnps_sub == 2]]
        plt.boxplot(fba5_box)
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(bxp_file)
        plt.close(fig)