def plot_length_distribution(
    pop, coverage, d_lengths,
    ):

    '''Write one INDEL-length file per key of d_lengths and plot each one.

    For every key, the value d_lengths[key] is written unchanged to a file
    named 'lengths_<pop><coverage>_<key>' in the current directory, and the
    file name is then handed to gnuplot.histogram2 with the key as title.

    pop, coverage -- interpolated with %s into the output file name
    d_lengths     -- dict; each value is passed straight to file.write(),
                     so it has to be a string

    NOTE(review): a second plot_length_distribution, taking an additional
    ts_filter_level argument, is defined directly after this one in the
    file and replaces this definition when the module is loaded.
    '''

    for key, text in d_lengths.items():

        fn = 'lengths_%s%s_%s' % (pop, coverage, key)

        ## context manager: the handle is closed even if write() raises
        with open(fn, 'w') as fd:
            fd.write(text)

        gnuplot.histogram2(
            fn,
            x_step=1,
            x_max=12,
            xlabel='INDEL length',
            ylabel='INDEL count',
            color='blue',
            title=key,
            )

    return
def plot_length_distribution(pop,coverage,d_lengths,ts_filter_level=None,):

    '''Write one INDEL-length file per key of d_lengths and plot each one.

    For every key, the value d_lengths[key] is written unchanged to a file
    in the current directory and the file name is handed to
    gnuplot.histogram2 with the key as title.

    pop, coverage   -- interpolated with %s into the output file name
    d_lengths       -- dict; each value is passed straight to file.write(),
                       so it has to be a string
    ts_filter_level -- number formatted with %4.1f into the file name
                       ('lengths_<pop><coverage>_<level>_<key>').
                       When omitted (None) the level is left out and the
                       name is 'lengths_<pop><coverage>_<key>', i.e. the
                       name produced by the three-argument definition that
                       precedes this one in the file. This definition
                       shadows that one, so without the default every
                       three-argument call would fail with a TypeError.
    '''

    for key, text in d_lengths.items():

        if ts_filter_level is None:
            fn = 'lengths_%s%s_%s' %(pop,coverage,key)
        else:
            fn = 'lengths_%s%s_%4.1f_%s' %(pop,coverage,ts_filter_level,key)

        ## context manager: the handle is closed even if write() raises
        with open(fn,'w') as fd:
            fd.write(text)

        gnuplot.histogram2(
            fn,
            x_step=1,
            x_max=12,
            xlabel='INDEL length',
            ylabel='INDEL count',
            color = 'blue',
            title = key,
            )

    return
def frq_discordant():

    '''Plot per-chip histograms for discordant and concordant SNPs.

    For each chip ('quad', 'octo') the number of lines of the Baganda
    .SNPQC.fam file is taken as the sample count. Then, for the SNPs listed
    in discordant.SNPs (fgrep) and for all other lines (fgrep -v), one
    column expression is extracted with awk from the .SNPQC.frq and
    .SNPQC.lmiss files into '<suffix>.<chip>.<prefix>', and that file is
    plotted with gnuplot.histogram2.

    NOTE(review): the x axis label is 'MAF after sample QC' for the lmiss
    histogram as well as for the frq histogram - confirm that is intended.
    '''

    chips = ('quad', 'octo')
    ## (fgrep flag, output prefix)
    selections = (
        ('', 'discordant'),
        ('-v', 'concordant'),
        )
    ## (file suffix, awk column expression, histogram x_min, histogram x_max)
    metrics = (
        ('frq', '$5', 0, 0.5),
        ('lmiss', '1-$5', 0.90, 1.00),
        )

    for chip in chips:

        count_samples_cmd = (
            'cat pops/Baganda_%s/Baganda_%s.SNPQC.fam | wc -l' % (chip, chip)
            )
        n_samples = int(os.popen(count_samples_cmd).read())

        for flag, prefix in selections:
            for suffix, column, x_min, x_max in metrics:

                fn_out = '%s.%s.%s' % (suffix, chip, prefix)

                ## select the lines, keep one column, write it to fn_out
                select_part = (
                    'fgrep %s -w -f discordant.SNPs pops/Baganda_%s/Baganda_%s.SNPQC.%s'
                    % (flag, chip, chip, suffix)
                    )
                extract_part = (
                    " | awk '{print %s}' > %s.%s.%s"
                    % (column, suffix, chip, prefix)
                    )
                execmd(select_part + extract_part)

                count_lines_cmd = 'cat %s.%s.%s | wc -l' % (suffix, chip, prefix)
                n_lines = int(os.popen(count_lines_cmd).read())

                ## the inverted match keeps every non-matching line;
                ## one line is discounted - presumably the header row (confirm)
                if flag == '-v':
                    n_SNPs = n_lines - 1
                else:
                    n_SNPs = n_lines

                plot_title = 'Baganda %s (n_{samples}=%i, n_{SNPs}=%i)' % (
                    chip, n_samples, n_SNPs,
                    )
                gnuplot.histogram2(
                    fn_out,
                    x_step=0.01,
                    x_min=x_min,
                    x_max=x_max,
                    xlabel='MAF after sample QC',
                    title=plot_title,
                    )

    return
def count_and_plot():

    '''Plot DP, call rate and allele frequency distributions of VCF records.

    Iterates over a hard-coded list of VCF groups. Every group is a list of
    file paths whose records are pooled: for each non-header line that
    parse_line() does not flag for skipping, the INFO DP value, a call rate
    (100 minus the number of './.' occurrences in the line) and a minor
    allele frequency derived from INFO AF are collected. Per group, two or
    three histograms (DP, CR, AF) and two contour plots (AF vs DP, AF vs CR)
    are produced through the gnuplot module.

    Takes no arguments and returns None. Elapsed seconds are printed each
    time the CHROM value changes.

    NOTE(review): block nesting in this function was reconstructed from
    statement order - confirm against the original layout, in particular
    which statements belong to the "fp_in != 'mp15_vqsr.vcf'" branches.
    '''

    chromosome = '1'
    import time
    t1 = time.time()

    for l_fp_in in [
        ['out_GATK/join/CombineVariants.vcf'],
        ['out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf'],
        ['SelectVariants_discordance1.vcf'],
        ['SelectVariants_discordance2.vcf'],
        ## NOTE: under Python 2 the comprehension variable is not local to
        ## the comprehension, so building this list rebinds the function
        ## level name `chromosome` (to 'Y') before the loop body first runs
        ['%s.vqsr.filt.vcf' %(chromosome) for chromosome in range(1,23)+['X','Y',]],
        ['SelectVariants_concordance.vcf'],
        ]:

        ## restart the timer for every group of files
        import time
        t1 = time.time()

        ##
        ## prepare scatter lists
        ##
        l_gnuplot_MAF = []
        l_gnuplot_DP = []
        l_gnuplot_CR = []

        ##
        ## prepare contour dic
        ##
        ## d_contour[AF bin][DP bin] = count;
        ## AF bins 0.00..1.00 in steps of 0.01, DP bins 0..1500 in steps of 10
        d_contour = {}
        for AF in xrange(100+1):
            d_contour[AF*0.01] = {}
            for DP in xrange(150+1):
                d_contour[AF*0.01][DP*10.] = 0
        ## d_contour_CR[AF bin][call rate] = count; call rate 0..100
        d_contour_CR = {}
        for AF in xrange(100+1):
            d_contour_CR[round(AF*0.01,2)] = {}
            for CR in xrange(100+1):
                d_contour_CR[round(AF*0.01,2)][CR] = 0

        for fp_in in l_fp_in:

            print fp_in
            ## NOTE(review): this handle is never closed explicitly
            fd = open(fp_in,'r')
            print fp_in
            for line in fd:
                ## skip VCF header lines
                if line[0] == '#':
                    continue
##                if line.count('./.')+line.count('0/0')+line.count('0/1')+line.count('1/1') != 100:
##                    print line
##                    stop
                CHROM, d_INFO, bool_continue = parse_line(line)
                if bool_continue == True:
                    continue

                ## call rate = 100 minus the number of missing genotypes;
                ## assumes 100 genotype columns per line - TODO confirm
                CR = 100-line.count('./.')
                DP = int(d_INFO['DP'])
                try:
                    AF = float(d_INFO['AF'])
                except:
                    ## a missing AF key raises KeyError again right here;
                    ## only an AF value that float() rejects reaches 'N/A'
                    d_INFO['AF']
                    AF = 'N/A'
                ## NOTE(review): with AF == 'N/A' the folding below operates
                ## on a string and raises TypeError, so the `stop` guard
                ## further down is not reached for that case - confirm
                if AF < 0.5:
                    MAF = AF
                else:
                    MAF = 1-AF

                ##
                ## append to list
                ##
                l_gnuplot_DP += [DP]
                l_gnuplot_CR += [CR]
                if fp_in != 'mp15_vqsr.vcf':
                    if AF == 'N/A':
                        ## undefined name: deliberately aborts with NameError
                        stop
                    l_gnuplot_MAF += [MAF]
                    ## depths of 1500 and above fall outside the contour grid
                    if DP < 1500:
                        d_contour[0.01*round(MAF/0.01,0)][10.*round(DP/10.,0)] += 1

                    ##
                    ## append to dic
                    ##
                    d_contour_CR[round(MAF,2)][CR] += 1

                ## report the time spent whenever the chromosome changes
                if chromosome != CHROM:
                    t2 = time.time()
                    print fp_in, '%-2s' %(chromosome), '%2is' %(int(t2-t1))
                    chromosome = CHROM
                    t1 = t2
##                    if CHROM == '2':
##                        break

##                if POS[-1] == '0' and POS[-2] == '0' and POS[-3] == '0' and POS[-4] == '0':
##                    print '%2s %9s %6s %4s' %(CHROM, POS, AF, DP,), fp_in
##                    break

##            if POS[-1] == '0' and POS[-2] == '0' and POS[-3] == '0' and POS[-4] == '0':
##                print '%2s %9s %6s %4s' %(CHROM, POS, AF, DP,), fp_in

        ## title and file suffix are derived from the last path of the group
        title = fp_in.replace('_','').replace('out_GATK/','').replace('.vcf','')
        suffix = fp_in.replace('out_GATK','').replace('/','').replace('.vcf','')

        gnuplot.histogram2(
            'DP_%s' %(suffix),
            l_data=l_gnuplot_DP,
            x_step=10,x_min=0,x_max=1000,tic_step=100,
            xlabel='DP from VCF',
            ylabel='SNP count',
            title= title,
            )

        gnuplot.histogram2(
            'CR_%s' %(suffix),
            l_data=l_gnuplot_CR,
            x_step=1,x_min=0,x_max=100,tic_step=10,
            xlabel='SNP Call Rate',
            ylabel='SNP count',
            title= title,
            )

        if fp_in != 'mp15_vqsr.vcf':

            gnuplot.histogram2(
                'AF_%s' %(suffix),
                l_data=l_gnuplot_MAF,
                x_min=0,x_max=.5,tic_step=0.05,x_step=0.01,
                xlabel='AF from VCF',
                ylabel='SNP count',
                title = title,
                )

            ## one "x y z" line per grid cell, one blank line after each AF
            ## row; only AF bins 0.00..0.50 are written since MAF <= 0.5
            lines = []
            for AF in xrange(50+1):
                for DP in xrange(150+1):
                    lines += ['%s %s %s\n' %(AF*0.01,DP*10.,d_contour[AF*0.01][DP*10.],)]
                lines += ['\n']
            gnuplot.contour_plot(
                'AFvDP_%s' %(suffix), lines,
                title = title,
                xlabel = 'AF from VCF',
                ylabel = 'DP from VCF',
                zlabel = 'count',
                )

            lines = []
            for AF in xrange(50+1):
                for CR in xrange(100+1):
                    lines += ['%s %s %s\n' %(AF*0.01,CR,d_contour_CR[round(AF*0.01,2)][CR],)]
                lines += ['\n']
            gnuplot.contour_plot(
                'AFvCR_%s' %(suffix), lines,
                title = title,
                xlabel = 'AF from VCF',
                ylabel = 'Call Rate from VCF',
                zlabel = 'count',
                )

##        t2 = time.time()
##        print t2-t1
##        stop

    return
def count_unique_and_intersect_vqsr(
    fp_vcf_individual,fp_vcf_combined,fp_map,
    suffix,
    bool_combined1 = False,
    bool_combined2 = True,
    bool_ignore_FILTER1 = False,
    ):

    '''Count markers unique to / shared between two VCF sources and a map file.

    this function assumes that markers in the VCFs are sorted

    The three inputs are walked in lockstep, one record at a time, so that
    no input has to be held in memory. Seven counters are maintained
    (per-source totals, the three pairwise intersections and the triple
    intersection) and a minor allele frequency, computed from a hard-coded
    .tped file read in parallel with the map file, is recorded for every
    map marker and for every intersection that involves the map.

    fp_vcf_individual   -- path of source 1; '$CHROMOSOME' in the path is
                           replaced by the chromosome name
    fp_vcf_combined     -- path of source 2; same placeholder handling
    fp_map              -- path of source 3; whitespace separated, column 1
                           is used as chromosome and column 4 as position
    suffix              -- passed on to gnuplot.venn3
    bool_combined1      -- True: read source 1 with loop_single_vcf,
                           False: read it with loop_multiple_vcf
    bool_combined2      -- same switch for source 2
    bool_ignore_FILTER1 -- forwarded to loop_multiple_vcf for source 1 as
                           bool_ignore_FILTER

    Side effects: prints progress and the final counts, writes AF3.txt,
    AF13.txt, AF23.txt and AF123.txt, and calls gnuplot.histogram2 (four
    times) and gnuplot.venn3. Returns None.

    NOTE(review): several bare names (stop, stop2, stop3tmp_wegethereornot)
    are used as deliberate aborts - reaching one raises NameError.
    NOTE(review): none of the four input handles is closed explicitly.
    NOTE(review): block nesting was reconstructed from statement order -
    confirm against the original layout.
    '''

    ## set file paths
    fp2_template = fp2 = fp_vcf_combined
    fp1_template = fp1 = fp_vcf_individual
    fp3 = fp_map

    print fp1
    print fp2
    print fp3

    ## set list of chromosomes (genotype array only contains autosomal SNPs)
    ## the position in this list also defines the chromosome sort order
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]

    ## set initial chromosome
    chromosome1 = l_chromosomes[0]
    chromosome2 = l_chromosomes[0]
##    chromosome1 = chromosome2 = '22'
    ## the untouched templates are kept for loop_multiple_vcf
    fp1 = fp1.replace('$CHROMOSOME',chromosome1,)
    fp2 = fp2.replace('$CHROMOSOME',chromosome2,)

    ## set booleans before loop
    ## bool_readN: fetch the next record of source N on the next pass
    ## bool_EOFN: source N is exhausted
    bool_read1 = True
    bool_read2 = True
    bool_read3 = True
    bool_EOF1 = False
    bool_EOF2 = False
    bool_EOF3 = False

    ## set counters before loop
    count_intersect12 = 0
    count_intersect13 = 0
    count_intersect23 = 0
    count1 = 0
    count2 = 0
    count3 = 0
    count_intersect123 = 0
    l_AF13 = []
    l_AF23 = []
    l_AF3 = []
    l_AF123 = []

    ## open files before loop
    fd1 = open(fp1,'r')
    fd2 = open(fp2,'r')
    fd3 = open(fp3,'r')
    ## genotype file read line by line together with the map file (fd3)
    fd3b = open('../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.tped','r')

    i = 0
    while True:

        i += 1
        ## progress report every 10000 passes
        if i % 10000 == 0:
            print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3, '|', bool_read1, bool_read2, bool_read3, '|', bool_EOF1, bool_EOF2, bool_EOF3, i/10000

        ##
        ## refill every cursor that was flagged on the previous pass
        ##
        if bool_read1 == True:
            if bool_combined1 == True:
                CHROM1, POS1, count1, bool_EOF1 = loop_single_vcf(fd1,count1,)
            else:
                ## the helper also returns the file handle and chromosome;
                ## presumably it moves on to the next per-chromosome file
                ## when one is exhausted - verify in loop_multiple_vcf
                (
                    CHROM1, POS1, count1, fd1, bool_EOF1, chromosome1,
                    ) = loop_multiple_vcf(
                        fd1,count1,fp1_template,chromosome1,l_chromosomes,
                        bool_ignore_FILTER=bool_ignore_FILTER1,
                        )
            bool_read1 = False
        if bool_read2 == True:
            if bool_combined2 == True:
                CHROM2, POS2, count2, bool_EOF2 = loop_single_vcf(fd2,count2,)
            else:
                (
                    CHROM2, POS2, count2, fd2, bool_EOF2, chromosome2,
                    ) = loop_multiple_vcf(
                        fd2,count2,fp2_template,chromosome2,l_chromosomes,
                        )
            bool_read2 = False
        if bool_read3 == True:
            line3 = fd3.readline()
            if line3 == '':
                bool_EOF3 = True
                CHROM3 = None
                POS3 = None
            else:
                l3 = line3.split()
                CHROM3 = l3[0]
                POS3 = int(l3[3])
                count3 += 1
                ## tmp AF
                ## fraction of allele fields (column 5 onwards) equal to the
                ## first one, folded to <= 0.5; 184 is presumably the number
                ## of allele columns per line - TODO confirm
                line3b = fd3b.readline()
                l = line3b.split()[4:]
                AF = l.count(l[0])/184.
                if AF > 0.5:
                    AF = 1-AF
                l_AF3 += [AF]
            bool_read3 = False

        ## done once all three sources are exhausted
        if bool_EOF1 == True and bool_EOF2 == True and bool_EOF3 == True:
            break

        ## doing nested if statements is the fastest method of comparison
        ## looping over lines simultaneously to avoid reading all markers into memory

        ##
        ## triple intersection
        ##
        ## NOTE(review): only positions are compared here; the variant that
        ## also compared CHROM1 == CHROM2 == CHROM3 is commented out
##        if POS1 == POS2 == POS3 and CHROM1 == CHROM2 == CHROM3:
        if POS1 == POS2 == POS3:
            if POS1 % 1000 == 0:
                print CHROM1, '%6i' %(POS1/1000), '|',
                print '%8i' %(count_intersect12), '%8i' %(count_intersect13), '%8i' %(count_intersect23), '|',
                print '%8i' %(count_intersect123), '|',
                print '%8i' %(count1), '%8i' %(count2), '%8i' %(count3)
            count_intersect123 += 1
            ## AF still holds the value computed for the current map marker
            l_AF123 += [AF]
            bool_read1 = True
            bool_read2 = True
            bool_read3 = True

        ##
        ## double intersection
        ##
        else:
            ## it is faster to do nesting of logical statements
            ## when comparing long integers

            ## sources 1 and 2 agree; decide whether source 3 is ahead
            ## (count the pair) or behind (advance source 3 first)
            if POS1 == POS2 != None:
                if bool_EOF3 == True:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3 and POS1 < POS3:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3:
                    bool_read3 = True
                else:
                    if l_chromosomes.index(CHROM3) < l_chromosomes.index(CHROM1):
                        bool_read3 = True
                    else:
                        count_intersect12 += 1
                        bool_read1 = True
                        bool_read2 = True
            ## sources 1 and 3 agree
            elif POS1 == POS3 != None:
                if bool_EOF2 == True:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS1 < POS2:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read2 = True
                else:
                    ## sources 1 and 2 on different chromosomes: not handled
                    print CHROM1, CHROM2
                    stop2
            ## sources 2 and 3 agree
            elif POS2 == POS3 != None:
                if bool_EOF1 == True:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS2 < POS1:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read1 = True
                else:
                    if l_chromosomes.index(CHROM1) < l_chromosomes.index(CHROM2):
                        bool_read1 = True
                    else:
                        ## NOTE(review): unlike the branches above this one
                        ## does not append to l_AF23, and it then aborts
                        count_intersect23 += 1
                        bool_read2 = True
                        bool_read3 = True
                        stop3tmp_wegethereornot

            ##
            ## no intersection
            ##
            else:

                ## different chromosomes
                ## advance every non-exhausted source that sits on the
                ## earliest chromosome (by position in l_chromosomes)
                if (
                    (CHROM1 != CHROM2)
                    or
                    (bool_EOF3 == False and CHROM2 != CHROM3)
                    ):
                    l_indexes= []
                    if bool_EOF1 == False:
                        index1 = l_chromosomes.index(CHROM1)
                        l_indexes += [index1]
                    if bool_EOF2 == False:
                        index2 = l_chromosomes.index(CHROM2)
                        l_indexes += [index2]
                    if bool_EOF3 == False:
                        index3 = l_chromosomes.index(CHROM3)
                        l_indexes += [index3]
                    min_index = min(l_indexes)
                    if bool_EOF1 == False and index1 == min_index:
                        bool_read1 = True
                    if bool_EOF2 == False and index2 == min_index:
                        bool_read2 = True
                    if bool_EOF3 == False and index3 == min_index:
                        bool_read3 = True
                ## same chromosome
                ## advance the source with the smallest position
                else:
                    ## either read 1 or 3
                    if CHROM1 == CHROM2 and POS1 < POS2:
                        if bool_EOF3 == True or POS1 < POS3:
                            bool_read1 = True
                        else:
                            bool_read3 = True
                    elif bool_EOF3 == True:
                        bool_read2 = True
                    ## either read 2 or 3
                    elif CHROM2 == CHROM3:
                        if bool_EOF3 == True or POS2 < POS3:
                            bool_read2 = True
                        else:
                            bool_read3 = True
                    else:
                        print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3
                        stop

    ##
    ## report: totals, intersections, then the counts unique to each source
    ##
    print count1
    print count2
    print count3
    print count_intersect12
    print count_intersect13
    print count_intersect23
    print count_intersect123
    print
    print count1-count_intersect12-count_intersect13-count_intersect123
    print count2-count_intersect12-count_intersect23-count_intersect123
    print count3-count_intersect13-count_intersect23-count_intersect123
    print
    print fp1
    print fp2
    print fp3

    ## mean MAF per set
    ## NOTE(review): raises ZeroDivisionError if a list is empty
    print 'AF3', sum(l_AF3)/len(l_AF3)
    print 'AF13', sum(l_AF13)/len(l_AF13)
    print 'AF23', sum(l_AF23)/len(l_AF23)
    print 'AF123', sum(l_AF123)/len(l_AF123)

    gnuplot.histogram2(
        'AF3',title='MAF distribution - 2.5M chip array',l_data=l_AF3,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF13',title='MAF distribution - 2.5M chip array and HGI SNPs',l_data=l_AF13,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF23',title='MAF distribution - 2.5M chip array and GATK SNPs',l_data=l_AF23,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF123',title='MAF distribution - 2.5M chip array and HGI and GATK SNPs',l_data=l_AF123,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)

    ## dump the MAF lists, one value per line
    l_AF3 = [str(f) for f in l_AF3]
    l_AF13 = [str(f) for f in l_AF13]
    l_AF23 = [str(f) for f in l_AF23]
    l_AF123 = [str(f) for f in l_AF123]
    fd = open('AF3.txt','w')
    fd.write('\n'.join(l_AF3))
    fd.close()
    fd = open('AF13.txt','w')
    fd.write('\n'.join(l_AF13))
    fd.close()
    fd = open('AF23.txt','w')
    fd.write('\n'.join(l_AF23))
    fd.close()
    fd = open('AF123.txt','w')
    fd.write('\n'.join(l_AF123))
    fd.close()

    ## i1-i3: unique to source 1/2/3, i4-i6: pairwise only, i7: all three
    gnuplot.venn3(
        i1 = count1-count_intersect12-count_intersect13-count_intersect123,
        i2 = count2-count_intersect12-count_intersect23-count_intersect123,
        i3 = count3-count_intersect13-count_intersect23-count_intersect123,
        i4 = count_intersect12,
        i5 = count_intersect13,
        i6 = count_intersect23,
        i7 = count_intersect123,
        text1 = '%s' %(fp1),
        text2 = '%s' %(fp2),
        text3 = '%s' %(fp3),
        suffix = suffix,
        )

    return
def main():

    '''Run the comparison selected by the last command line argument.

    sys.argv[-1] '0' to '5' each select one pair of VCF sources that is
    compared against the omni2.5-8 map file with
    count_unique_and_intersect_vqsr. Any other value falls through to
    count_unique_and_intersect_impute2().

    Changes relative to the previous version of this function:
    - A leftover scratch block at the top plotted histograms from l_AF3,
      l_AF13, l_AF23 and l_AF123. Those names were never assigned here
      (the code loading them from AF*.txt was commented out), so main()
      raised NameError before any mode could run. The block and the bare
      `stop` that followed it were removed.
    - The fall-through path ended in a bare `stop` (NameError) placed in
      front of count_and_plot(), which therefore never ran. It now returns
      explicitly; count_and_plot() stays disabled.

    Returns None.
    '''

##    bsub -J"count$count" -o count$count.out -e count$count.err python ~/github/ms23/analysis/count_passed_variants.py $count
##    bsub -M500000 -R'select[mem>500] rusage[mem=500]' -J"count$count" -o count$count.out -e count$count.err python ~/github/ms23/analysis/count_passed_variants.py $count

    ## the map file is the third input of every comparison
    fp3 = '../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.map'

    ## mode: (fp1, fp2, suffix,
    ##        bool_combined1, bool_combined2, bool_ignore_FILTER1)
    d_comparisons = {
        ## 5) compare mp15 steps
        '5': (
            'out_mp15/beagle/03.merged.vcf',
            'out_mp15/impute2/$CHROMOSOME',
            'mp15_BEAGLE_vs_IMPUTE2',
            True, False, False,
            ),
        ## 4) compare mp15 steps
        '4': (
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'out_mp15/beagle/03.merged.vcf',
            'mp15_VQSR_vs_BEAGLE',
            False, True, False,
            ),
        ## 3) compare mp15 steps
        '3': (
            'out_mp15/pre-vqsr/$CHROMOSOME.vqsr.vcf',
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'mp15_pre-VQSR_vs_post-VQSR',
            False, False, True,
            ),
        ## 2) compare tc9 steps
        '2': (
            'out_GATK/join/CombineVariants.vcf',
            'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf',
            'tc9_pre-VQSR_vs_post-VQSR',
            True, True, False,
            ),
        ## 1) compare post-VQSR
        '1': (
            'out_mp15/vqsr/$CHROMOSOME.vqsr.filt.vcf',
            'out_GATK/join/ApplyRecalibration.recalibrated.filtered.vcf',
            'post-VQSR_tc9_vs_mp15',
            False, True, False,
            ),
        ## 0) compare pre-VQSR
        '0': (
            'out_mp15/pre-vqsr/$CHROMOSOME.vqsr.vcf',
            'out_GATK/join/CombineVariants.vcf',
            'pre-VQSR_tc9_vs_mp15',
            False, True, True,
            ),
        }

    mode = sys.argv[-1]
    if mode in d_comparisons:
        (
            fp1, fp2, suffix,
            bool_combined1, bool_combined2, bool_ignore_FILTER1,
            ) = d_comparisons[mode]
        count_unique_and_intersect_vqsr(
            fp1,fp2,fp3,
            suffix,
            bool_combined1 = bool_combined1,
            bool_combined2 = bool_combined2,
            bool_ignore_FILTER1 = bool_ignore_FILTER1,
            )
        return

##    t1 = time.time()
##    singlevcf_vs_multiplevcfs()
##    t2 = time.time()
##    print 'time', t2-t1

    ## no recognised mode given
    count_unique_and_intersect_impute2()

    ## count_and_plot() was never reached (see docstring); left disabled
##    count_and_plot()

    return