def plot(caseHDF5Name, controlHDF5Name, position):
    """Overlay the case and control Beta densities of theta at one position.

    Both models are read with ``rvd3.load_model``.  For each of them the two
    Beta shape parameters are taken from ``q['delta'][0][position, 0]`` and
    ``q['delta'][0][position, 1]``; the density between its 0.1% and 99.9%
    quantiles is drawn together with a histogram of 1000 random draws.  The
    figure is displayed with ``plt.show()``; nothing is returned.

    Args:
        caseHDF5Name: path of the case model, passed to ``rvd3.load_model``.
        controlHDF5Name: path of the control model.
        position: row index into the ``delta`` parameter table.
    """

    def shape_params(q):
        # Row `position` of the first `delta` entry holds the (a, b) pair.
        table = q['delta'][0]
        return table[position, 0], table[position, 1]

    def draw(axes, shape_a, shape_b, depth, line_style, label_template):
        # The legend label carries the median of the model's depth array.
        median_depth = int(np.median(depth))
        # ppf (percent point function) is the inverse CDF.
        grid = np.linspace(beta.ppf(0.001, shape_a, shape_b),
                           beta.ppf(0.999, shape_a, shape_b), 100)
        axes.plot(grid, beta.pdf(grid, shape_a, shape_b), line_style,
                  lw=4, alpha=0.8, label=label_template % median_depth)
        # Histogram of random draws from the same distribution.
        # NOTE: `normed` was removed from hist() in Matplotlib 3.1
        # (`density` replaces it); it is kept so behaviour is unchanged.
        draws = beta.rvs(shape_a, shape_b, size=1000)
        axes.hist(draws, normed=True, histtype='stepfilled', alpha=0.2)
        axes.legend(loc='best', frameon=False)

    print("Plot theta of position %d" % (position))

    # ---- case ----
    _, case_depth, _, case_q, _, _ = rvd3.load_model(caseHDF5Name)
    case_a, case_b = shape_params(case_q)
    fig, ax = plt.subplots()
    draw(ax, case_a, case_b, case_depth, 'b-', "Case, Depth=%d")

    # ---- control ----
    _, control_depth, _, control_q, _, _ = rvd3.load_model(controlHDF5Name)
    control_a, control_b = shape_params(control_q)
    draw(ax, control_a, control_b, control_depth, 'g-', 'Control, Depth=%d')

    ax.set_title(
        '$\\beta$ variational distribution of $\\theta$ at position %d' %
        (position))
    ax.set_xlabel('$\\theta$', fontsize=20)
    ax.set_ylabel('PDF', fontsize=18)
    plt.show()
def main():
    """Plot coverage and M across positions for the control and each dilution.

    The figure has one row per model (control first, then one per dilution)
    and two columns: the depth array plotted against position on the left
    and the row-wise mean of ``phi['M']`` on a log axis on the right, with a
    dashed red line at ``phi['M0']``.  The result is written to
    ``M_dsample=10.png``.
    """
    dilutions = (0.1, 0.3, 1.0, 10.0, 100.0)
    folder = '2015-09-28_Run_rvd3_synthetic_data_set/hdf5/10'
    n_rows = len(dilutions) + 1  # control row + one row per dilution

    fig = plt.figure(figsize=(12, 20))

    def draw_row(row, title, depth, phi, loc_labels, with_xlabel):
        """Fill the two panels of one figure row (row 0 is the control)."""
        left = fig.add_subplot(n_rows, 2, 2 * row + 1)
        # Location labels are split on ':' and the second field is the
        # integer position.
        # TODO: use index of the depth array rather than the parsed labels
        coords = [int(label.split(':')[1]) for label in loc_labels]
        left.plot(coords, depth.T)
        left.set_title(title)
        if with_xlabel:
            left.set_xlabel('Position')
        left.set_ylabel('Coverage')
        left.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))

        right = fig.add_subplot(n_rows, 2, 2 * row + 2)
        right.semilogy(coords, np.mean(phi['M'], axis=1))
        right.set_title(title)
        if with_xlabel:
            right.set_xlabel('Position')
        right.set_ylabel('M')
        # Dashed horizontal reference at M0 spanning the plotted positions.
        right.semilogy([coords[0], coords[-1]], [phi['M0'], phi['M0']],
                       color='r', ls='--')

    control_path = "../%s/Control.hdf5" % folder
    _, control_depth, control_phi, _, control_loc, _ = rvd3.load_model(
        control_path)
    draw_row(0, 'Control', control_depth, control_phi, control_loc, False)

    last = len(dilutions) - 1
    for idx, d in enumerate(dilutions):
        logging.debug("Processing dilution: %0.1f%%" % d)
        case_name = "Case%s.hdf5" % str(d).replace(".", "_")
        case_path = "../%(folder)s/%(file)s" % {
            'folder': folder,
            'file': case_name
        }
        _, case_depth, case_phi, _, case_loc, _ = rvd3.load_model(case_path)
        # Only the bottom row carries the shared x-axis label.
        draw_row(idx + 1, "Dilution %0.1f%%" % d, case_depth, case_phi,
                 case_loc, idx == last)

    plt.savefig('M_dsample=10.png')
def bayestest(caseHDF5Name, controlHDF5Name, position):
    """Print a normal-approximation test of case vs. control at one position.

    For both models the row ``q['gam'][position, :]`` is treated as a pair of
    Beta shape parameters.  The difference of the two Beta means, each
    shifted by its model's ``phi['mu0']``, is approximated by a normal
    distribution whose variance is the sum of the two Beta variances.  The
    function prints the z statistic for the threshold ``tau = 0`` and then
    the first element of ``norm.cdf(z)``.

    Args:
        caseHDF5Name: path of the case model, passed to ``rvd3.load_model``.
        controlHDF5Name: path of the control model.
        position: row index into the ``gam`` tables.

    Returns:
        None; the two values are only printed.
    """
    tau = 0

    _, _, casephi, caseq, _, _ = rvd3.load_model(caseHDF5Name)
    casegam = caseq['gam']
    _, _, controlphi, controlq, _, _ = rvd3.load_model(controlHDF5Name)
    controlgam = controlq['gam']

    def beta_mean(p):
        # Mean of Beta(p[0], p[1]).
        return p[0] * 1.0 / np.sum(p)

    def beta_var(p):
        # Variance of Beta(p[0], p[1]).
        s = np.sum(p)
        return p[0] * p[1] / (s**2 * (s + 1))

    mu = (beta_mean(casegam[position, :]) - casephi['mu0']) - (
        beta_mean(controlgam[position, :]) - controlphi['mu0'])
    variance = (beta_var(casegam[position, :]) +
                beta_var(controlgam[position, :]))
    # A z statistic is standardised by the standard deviation.  The previous
    # code divided by the summed variance (held in a variable named `sigma`),
    # which inflates |z| whenever the variance is below 1.
    sigma = np.sqrt(variance)
    z = (tau - mu) / sigma
    print(z)
    # atleast_1d keeps the `[0]` lookup valid when mu0 is a plain scalar.
    p = np.atleast_1d(ss.norm.cdf(z))
    print(p[0])
def read(filename, pos):
    """Return the mu0-shifted Beta mean at ``pos`` and its error-bar size, in percent.

    The model is read with ``rvd3.load_model``; ``q['gam'][pos, :]`` is
    treated as a pair of Beta shape parameters and ``phi['mu0']`` is
    subtracted from the Beta mean.

    Args:
        filename: path of the model file, passed to ``rvd3.load_model``.
        pos: row index into the ``gam`` table.

    Returns:
        A pair ``(100 * mu, 100 * err)`` where ``err`` is the half-width
        ``mu * alpha / 2`` wrapped in a numpy array.
    """

    def beta_mean(p):
        # Mean of Beta(p[0], p[1]).
        return p[0] * 1.0 / np.sum(p)

    _, _, casephi, caseq, _, _ = rvd3.load_model(filename)
    casegam = caseq['gam']
    caseMu = beta_mean(casegam[pos, :]) - casephi['mu0']

    # Lower and upper bounds printed below.
    # NOTE(review): this is a +/-(alpha/2) fraction of the estimate itself,
    # not a posterior quantile interval -- confirm that this is intended.
    alpha = 0.05
    cred = caseMu * alpha / 2
    conf_l = caseMu - cred
    conf_u = caseMu + cred

    # Error-bar value.  The second positional parameter of np.array is
    # `dtype`, so the former `np.array(cred, cred)` never built a two-element
    # array; where it ran at all it produced `cred` itself.  That symmetric
    # half-width is kept so the returned shape does not change for callers.
    err = np.array(cred)

    # Single formatted string: same output under the Python 2 print
    # statement and the Python 3 print function.
    print("%s %s %s" % (100 * caseMu, conf_l, conf_u))
    return 100 * caseMu, 100 * err
def main():
    """Write per-method classification statistics to ``statistics_no_chi2.xls``.

    For every calling method, dilution and depth, the called positions are
    read from the method's VCF file, turned into a 0/1 vector over 400
    positions and compared against the reference vector (ones at positions
    85, 105, ..., 345) with ``characteristics``.  Four sheets are filled:
    ``TPR_TNR``, ``Multi-measures`` (all nine measures), ``FDR`` and ``MCC``.
    The median of the case model's depth array is written next to each
    dilution/depth row.
    """
    book = xlwt.Workbook(encoding="utf-8")

    def new_sheet(title, header_row):
        # Each sheet starts with the two row-label column headers.
        sheet = book.add_sheet(title)
        sheet.write(header_row, 0, "VAF")
        sheet.write(header_row, 1, "Median Depth")
        return sheet

    sheet1 = new_sheet("TPR_TNR", 0)
    # Row 0 of this sheet is reserved for the method names.
    sheet2 = new_sheet("Multi-measures", 1)
    sheet3 = new_sheet("FDR", 0)
    sheet4 = new_sheet("MCC", 0)

    # method = {'RVD2(T*)(R=6)':'./../2013-12-20_experiment_set_gibbs_Qsd_mu_1_mu_over_10_minus_mu0/six_replicates_synthetic_optT/vcf/MCC',
    #           'RVD2(T*)(R=1)':'./../2013-12-20_experiment_set_gibbs_Qsd_mu_1_mu_over_10_minus_mu0/one_replicate_synthetic_optT/vcf/MCC',
    #           'RVD2(T=0)(R=6)':'./../2013-12-20_experiment_set_gibbs_Qsd_mu_1_mu_over_10_minus_mu0/six_replicates_synthetic_T0/vcf',
    #           'RVD2(T=0)(R=1)':'./../2013-12-20_experiment_set_gibbs_Qsd_mu_1_mu_over_10_minus_mu0/one_replicate_synthetic_T0/vcf',
    #           'VarScan2 somatic':'./../2013-09-23_SNP_calling_using_varscan2_somatic/vcf',
    #           'SAMtools':'./../2013-09-10_SNP_calling_using_samtools/vcf',
    #           'GATK':'./../2013-09-13_SNP_calling_using_GATK/vcf',
    #           'MuTect':'./../2013-10-02_SNP_calling_using_MuTect/work',
    #           'Strelka':'./../2013-10-01_SNP_calling_using_strelka/vcf',
    #           'VarScan2 mpileup':'./../2013-09-20_SNP_calling_using_varscan2/vcf'}
    # method = {'RVD2_MCMC(T=0,R=6)':'./../2013-12-20_experiment_set_gibbs_Qsd_mu_1_mu_over_10_minus_mu0/six_replicates_synthetic_T0/vcf',
    #           'MuTect':'./../2013-10-02_SNP_calling_using_MuTect/work',
    #           'RVD2_Var(T=0,R=6)':'./vcf'
    #           }
    method = {'RVD3(T=0,R=6)': './vcf'}

    DilutionList = (0.1, 0.3, 1.0, 10.0, 100.0)
    DepthList = (10000, 1000, 100, 10)
    character = ('Sensitiviy', 'Specificity', 'FPR', 'FNR', 'PPV', 'NPV',
                 'FDR', 'ACC', 'MCC')
    n_positions = 400

    # The reference classification does not depend on method, dilution or
    # depth, so it is built once.
    RefClass = np.zeros(n_positions)
    pos = np.arange(85, 346, 20)
    RefClass[pos - 1] = 1

    # .items() / range() / print(...) with one argument behave the same on
    # Python 2 and Python 3.
    for i, (k, v) in enumerate(method.items(), 1):
        print('Method %(number)d: %(method)s' % {'number': i, 'method': k})
        method_col = i + 1
        measure_col0 = 9 * (i - 1) + 2  # first of this method's 9 columns

        sheet1.write(0, method_col, k)
        sheet2.write(0, 9 * (i - 1) + 6, k)
        sheet3.write(0, method_col, k)
        sheet4.write(0, method_col, k)
        for j in range(9):
            sheet2.write(1, measure_col0 + j, character[j])

        for d_idx, d in enumerate(DilutionList):
            block_row = d_idx * len(DepthList)
            d_tag = str(d).replace('.', '_')
            # Row labels are written only while processing the first method.
            if i == 1:
                vaf_label = "%0.1f%%" % d
                sheet1.write(block_row + 1, 0, vaf_label)
                sheet2.write(block_row + 2, 0, vaf_label)
                sheet3.write(block_row + 1, 0, vaf_label)
                sheet4.write(block_row + 1, 0, vaf_label)

            for r_idx, r in enumerate(DepthList):
                row = block_row + r_idx + 1  # sheet2 rows are one lower

                # Median coverage of the case model.
                hdf5Dir = './hdf5/%s' % str(r)
                caseFile = "%(dir)s/%(file)s" % {
                    'dir': hdf5Dir,
                    'file': 'Case%s.hdf5' % d_tag
                }
                (_, caseN, _, _, _, _) = rvd3.load_model(caseFile)
                cov = int(np.median(caseN))
                if i == 1:
                    cov_label = "%s" % str(cov)
                    sheet1.write(row, 1, cov_label)
                    sheet2.write(row + 1, 1, cov_label)
                    sheet3.write(row, 1, cov_label)
                    sheet4.write(row, 1, cov_label)

                # Called positions from the VCF file; the handle is closed
                # as soon as the records have been consumed.
                vcfFile = os.path.join(v, "%s" % r, "vcf%s.vcf" % d_tag)
                logging.debug(vcfFile)
                with open(vcfFile, 'r') as vcf_handle:
                    callpos = np.array(
                        [record.POS for record in vcf.Reader(vcf_handle)])

                # Predicted classification; POS is shifted by one to index
                # the vector.
                PredictClass = np.zeros(n_positions)
                if len(callpos) != 0:
                    PredictClass[callpos - 1] = 1

                ncharacter = tuple(characteristics(RefClass, PredictClass))
                TPR, TNR, FPR, FNR, PPV, NPV, FDR, ACC, MCC = ncharacter

                sheet1.write(row, method_col, "%(TPR)0.2f/%(TNR)0.2f" % {
                    'TPR': TPR,
                    'TNR': TNR
                })
                for j in range(9):
                    sheet2.write(row + 1, measure_col0 + j,
                                 '%0.2f' % ncharacter[j])
                if not np.isnan(FDR):
                    sheet3.write(row, method_col, "%0.2f" % FDR)
                # The MCC cell is guarded by MCC itself; the previous code
                # tested FDR here, so an undefined MCC could be written as
                # 'nan' and a valid MCC could be skipped.
                if not np.isnan(MCC):
                    sheet4.write(row, method_col, "%0.2f" % MCC)

    book.save('statistics_no_chi2.xls')
def plot(case_10000, case_1000, case_100, case_10, control_10000,
         control_1000, control_100, control_10, position):
    """Draw case/control Beta densities of mu at one position for four depths.

    A 2x2 grid is produced, one panel per case/control pair in the order
    10000, 1000, 100, 10.  In every panel the shape parameters come from
    ``get_a_b(position, q['gam'], phi['mu0'])``; the density between its
    0.1% and 99.9% quantiles is drawn with a histogram of 1000 random draws.
    Each panel is titled with the median of the case model's depth array,
    and the ``mu0`` values of both models are printed.  The figure is saved
    as ``mu_<position + 1>_VAF=1.0.png``.

    Args:
        case_10000, case_1000, case_100, case_10: case model paths.
        control_10000, control_1000, control_100, control_10: control model
            paths, paired with the case path of the same suffix.
        position: index handed to ``get_a_b``; shown as ``position + 1``.
    """
    print("Plot mu of position %d" % (position + 1))
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2)
    xstep = 0.002
    xmax = 0.0152
    ymax = 1200
    size = 15

    def draw_curve(ax, model_file, line_style, label):
        """Plot one model's density and sample histogram; return (N, phi)."""
        _, depth, phi, q, _, _ = rvd3.load_model(model_file)
        a, b = get_a_b(position, q['gam'], phi['mu0'])
        x = np.linspace(beta.ppf(0.001, a, b), beta.ppf(0.999, a, b), 100)
        ax.plot(x, beta.pdf(x, a, b), line_style, lw=5, alpha=0.8,
                label=label)
        # Histogram of random draws from the same distribution.
        # NOTE: `normed` was removed from hist() in Matplotlib 3.1
        # (`density` replaces it); left as-is so older installs keep working.
        draws = beta.rvs(a, b, size=1000)
        ax.hist(draws, normed=True, histtype='stepfilled', alpha=0.2)
        ax.legend(loc='best', frameon=False)
        return depth, phi

    # (axes, case file, control file, panel is on the bottom row)
    panels = (
        (ax1, case_10000, control_10000, False),
        (ax2, case_1000, control_1000, False),
        (ax3, case_100, control_100, True),
        (ax4, case_10, control_10, True),
    )
    for ax, case_file, control_file, bottom_row in panels:
        caseN, casephi = draw_curve(ax, case_file, 'b-', "Case")
        cov_case = int(np.median(caseN))
        _, controlphi = draw_curve(ax, control_file, 'g-', 'Control')

        ax.set_title('Depth=%d' % cov_case, fontsize=size)
        ax.set_xticks(np.arange(0, xmax, xstep))
        # Only the bottom panels carry the x-axis label.
        if bottom_row:
            ax.set_xlabel('$\\mu$', fontsize=size)
        ax.set_ylim(0, ymax)
        print('mu0^control:', controlphi['mu0'])
        print('mu0^case', casephi['mu0'], '\n')

    # Backslashes are doubled so the literals no longer rely on the invalid
    # escape sequence '\m'; the resulting text is unchanged.
    plt.suptitle(
        '$\\beta$ variational distribution of $\\mu$ at position %d when VAF=1.0%% '
        % (position + 1),
        fontsize=size)
    # Manually leave room for the suptitle.
    plt.subplots_adjust(top=0.9)
    fig.set_size_inches(12, 8)
    plt.savefig('mu_%d_VAF=1.0.png' % (position + 1))
def plot(f2HDF5Name, f1HDF5Name, f3HDF5Name, f4HDF5Name, position):
    """Overlay the Beta densities of mu at one position for four models.

    Each model is read with ``rvd3.load_model``; the shape parameters are
    ``q['gam'][position, 0]`` and ``q['gam'][position, 1]`` and the legend
    entry shows the model's ``phi['M0']``.  Curves are drawn in the order
    f2, f1, f3, f4, each with its own line style and label precision.  The
    figure is displayed with ``plt.show()``.

    Args:
        f2HDF5Name, f1HDF5Name, f3HDF5Name, f4HDF5Name: model file paths.
        position: row index into each model's ``gam`` table.
    """
    # (model file, line style, legend label format), in drawing order.
    curves = (
        (f2HDF5Name, 'g--', "M0=%.4f"),
        (f1HDF5Name, 'b-', 'M0=%.3f'),
        (f3HDF5Name, 'm-.', 'M0=%.2f'),
        (f4HDF5Name, 'yo:', 'M0=%.1f'),
    )

    fig, ax = plt.subplots()
    for model_file, line_style, label_format in curves:
        _, _, phi, q, _, _ = rvd3.load_model(model_file)
        gam = q['gam']
        a = gam[position, 0]
        b = gam[position, 1]
        # ppf is the inverse CDF: plot between the 0.1% and 99.9% quantiles.
        x = np.linspace(beta.ppf(0.001, a, b), beta.ppf(0.999, a, b), 100)
        ax.plot(x, beta.pdf(x, a, b), line_style, lw=8, alpha=0.8,
                label=label_format % phi['M0'])

    legend = ax.legend(loc='upper left')
    for label in legend.get_texts():
        label.set_fontsize(38)

    # Backslashes are doubled so the literal no longer relies on the invalid
    # escapes '\h' and '\m'; the resulting text is unchanged.
    # NOTE(review): the subscript and the x-range below are fixed and do not
    # follow `position` -- confirm they match the position being plotted.
    ax.set_xlabel('$\\hat{\\mu}_{1,014,740}$', fontsize=38)
    ax.set_xlim([0.95, 1])
    plt.setp(ax.get_xticklabels(), fontsize=35)
    plt.setp(ax.get_yticklabels(), fontsize=35)
    plt.show()
def main():
    """Compare MCMC samples of mu with the variational Beta fit per position.

    Uses the module-level names ``control_mcmc``, ``case_mcmc``,
    ``control_var``, ``case_var`` and ``position``.  For every entry of
    ``position`` a figure is produced containing histograms of the MCMC
    ``mu`` samples (case and control) and, for the variational models, the
    Beta density given by ``q['gam'][pos, 0:2]`` with a histogram of 2000
    random draws.  Each figure is saved as
    ``position_<pos + 1>_<median depth>_mcmc_vs_var.png``.
    """
    # ---- mu of MCMC (rvd2) ----
    with h5py.File(control_mcmc, 'r') as f:
        muControl = f['mu'][...]
    with h5py.File(case_mcmc, 'r') as f:
        muCase = f['mu'][...]

    idx = list(position)
    muControl1 = muControl[idx]
    muCase1 = muCase[idx]

    # ---- variational models (rvd3) ----
    # These do not change between positions, so they are loaded once instead
    # of once per figure.
    _, caseN, _, caseq, _, _ = rvd3.load_model(case_var)
    _, _, _, controlq, _, _ = rvd3.load_model(control_var)
    casegam = caseq['gam']
    controlgam = controlq['gam']
    cov_case = int(np.median(caseN))

    num_bins = 25

    def overlay_variational(gam, pos, colour, label):
        # Density between the 0.1% and 99.9% quantiles plus sample histogram.
        a = gam[pos, 0]
        b = gam[pos, 1]
        x = np.linspace(beta.ppf(0.001, a, b), beta.ppf(0.999, a, b), 100)
        plt.plot(x, beta.pdf(x, a, b), colour + '--', lw=4, alpha=1.0,
                 label=label)
        draws = beta.rvs(a, b, size=2000)
        # NOTE: `normed` was removed from hist() in Matplotlib 3.1
        # (`density` replaces it); left as-is so older installs keep working.
        plt.hist(draws, num_bins, normed=True, histtype='stepfilled',
                 alpha=0.2, facecolor=colour)

    # enumerate replaces the Python 2-only xrange loop.
    for i, pos in enumerate(position):
        fig = plt.figure(figsize=(12, 8))

        # normed=True: the histogram integrates to 1.
        plt.hist(muCase1[i, :].T, num_bins, normed=True, facecolor='r',
                 alpha=0.5, label='Case (MCMC)')
        plt.hist(muControl1[i, :].T, num_bins, normed=True, facecolor='k',
                 alpha=0.5, label='Control (MCMC)')

        # The shape parameters are taken for this figure's position only.
        # The previous code indexed with the whole `position` list, which
        # mixes every position into each figure as soon as more than one
        # position is requested.
        overlay_variational(casegam, pos, 'r', "Case (Variational)")
        overlay_variational(controlgam, pos, 'k', 'Control (Variational)')

        plt.xlim(0, 0.012)
        plt.legend(loc='best', frameon=False)
        # Backslashes are doubled so the literals no longer rely on the
        # invalid escapes '\h' and '\m'; the resulting text is unchanged.
        plt.xlabel('$\\hat{\\mu} = \\mu-\\mu_0$', fontsize=20)
        plt.title('$\\hat{\\mu}$ at position %s when median depth is %d' %
                  ((pos + 1), cov_case),
                  fontsize=18)
        plt.xticks(rotation=25)
        # The layout must be adjusted before saving to affect the file; it
        # used to be called after savefig.
        plt.tight_layout()
        plt.savefig('position_%s_%d_mcmc_vs_var.png' % ((pos + 1), cov_case))
def ROCpoints(controlFile, caseFile, d, N, P, chi2):
    """Compute ROC points (FPR, TPR) over 300 posterior-difference thresholds.

    For every location, 4000 values are drawn from the Beta distribution
    given by the model's ``q['gam']`` row.  Case and control are restricted
    to their common locations, ``sample_post_diff`` is applied to the
    mu0-shifted draws, and ``bayes_test`` is evaluated on the intervals
    ``(t, inf)`` for 300 thresholds ``t`` spanning the range of the sampled
    differences.  A location is called when the result exceeds ``P`` and,
    if ``chi2`` is true, when the Fisher-combined chi-square p-value over
    replicates is also below ``0.05 / J``.  True positives are the locations
    whose parsed position is in 85, 105, ..., 345.

    Args:
        controlFile: control model path, passed to ``rvd3.load_model``.
        caseFile: case model path.
        d: not used by the active code (it was only referenced by the VCF
            export that used to be commented out at the end of this
            function).
        N: forwarded to ``sample_post_diff`` as its third argument.
        P: threshold applied to the ``bayes_test`` results.
        chi2: when true, additionally require the chi-square test.

    Returns:
        ``(fpr, tpr, cov)``: two float arrays with one entry per threshold
        and the median of the case depth array.  If no location, or every
        location, is a true-positive position, the affected rate is nan/inf
        (numpy division) instead of an exception.
    """
    n_draws = 4000  # Beta draws per location

    # Load the models.
    controlR, controlN, controlphi, controlq, controlLoc, _ = rvd3.load_model(controlFile)
    controlgam = controlq['gam']
    caseR, caseN, casephi, caseq, caseLoc, _ = rvd3.load_model(caseFile)
    casegam = caseq['gam']

    # Draw random samples from the Beta distributions.  Each matrix is sized
    # by its own location list; the previous code sized both by the control
    # list, which fails as soon as the two lists differ in length.  Control
    # and case draws stay interleaved per row, so the random stream is
    # unchanged when the lengths agree.
    n_control = len(controlLoc)
    n_case = len(caseLoc)
    controlMu = np.zeros(shape=(n_control, n_draws))
    caseMu = np.zeros(shape=(n_case, n_draws))
    for j in range(max(n_control, n_case)):
        if j < n_control:
            controlMu[j] = np.random.beta(controlgam[j, 0], controlgam[j, 1], n_draws)
        if j < n_case:
            caseMu[j] = np.random.beta(casegam[j, 0], casegam[j, 1], n_draws)

    # Keep only the locations present in both case and control.
    caseLocIdx = [i for i, name in enumerate(caseLoc) if name in controlLoc]
    controlLocIdx = [i for i, name in enumerate(controlLoc) if name in caseLoc]
    caseMu = caseMu[caseLocIdx, :]
    controlMu = controlMu[controlLocIdx, :]
    loc = caseLoc[caseLocIdx]
    J = len(loc)

    # True-positive positions.
    # NOTE(review): the position is parsed by dropping the first 8 characters
    # of the location label -- confirm the label prefix always has that length.
    pos = np.arange(85, 346, 20)
    posidx = [i for i in range(J) if int(loc[i][8:]) in pos]

    # Sample from the posterior Z = muCase - muControl.
    (Z, caseMuS, controlMuS) = sample_post_diff(caseMu - casephi['mu0'],
                                                controlMu - controlphi['mu0'],
                                                N)

    # Cumulative posterior probability for the regions (threshold, inf).
    T = np.linspace(np.min(Z), np.max(Z), num=300)
    pList = [bayes_test(Z, [(t, np.inf)]) for t in T]

    # Mutation classification: one row per threshold, one column per location.
    clsList = (np.array(pList) > P).astype(int)
    clsList = clsList.reshape((clsList.shape[0], clsList.shape[1]))

    # Chi-square goodness-of-fit test per location, combined over replicates.
    if chi2:
        nRep = caseR.shape[0]
        chi2Prep = np.zeros((J, nRep))
        chi2P = np.zeros(J)
        for j in range(J):
            # caseR is not restricted to the common locations, so row j of
            # the subset is mapped back to its original index.
            src = caseLocIdx[j]
            chi2Prep[j, :] = np.array(
                [rvd3.chi2test(caseR[i, src, :]) for i in range(nRep)])
            if np.any(np.isnan(chi2Prep[j, :])):
                chi2P[j] = 1
            else:
                # Combine p-values using Fisher's method; eps avoids log(0).
                fisher = -2 * np.sum(np.log(chi2Prep[j, :] + np.finfo(float).eps))
                chi2P[j] = 1 - ss.chi2.cdf(fisher, 2 * nRep)
        clsList2 = (chi2P < 0.05 / J).astype(int)
        clsList2 = np.tile(clsList2, (clsList.shape[0], 1))
        # A call requires both tests to agree.
        clsList = ((clsList + clsList2) == 2).astype(int)

    posidx_arr = np.array(posidx, dtype=int)
    called_total = clsList.sum(axis=1)
    called_true = clsList[:, posidx_arr].sum(axis=1)

    # False positive rate: calls outside the true positions over the number
    # of non-true positions.
    fpr = (called_total - called_true).astype(float) / (clsList.shape[1] - len(posidx))
    # True positive rate: calls at the true positions over their number.
    tpr = called_true.astype(float) / len(posidx)

    cov = np.median(caseN)

    return fpr, tpr, cov