## Scratch section: appends a local directory to sys.path, imports gnuplot,
## names the .bim/.strand input files, calls gnuplot.venn3 with placeholder
## region values (i1..i7 = 1) and counts the lines of each input file by
## running `cat <file> | wc -l` through os.popen.
## NOTE(review): `sum4 = )` has no right-hand value and the final
## `for fn1,fn2 in [` list is never closed in the visible text, so this
## section is incomplete as shown and cannot be parsed as-is.
## NOTE(review): strand437 and strand436 are assigned the same file name
## ('HumanOmni2.5M-b37-v2.strand') - confirm whether that is intended.
sys.path.append('/nfs/users/nfs_t/tc9/github/tc9/misc') import gnuplot bim4 = 'omni2.5-4_20120904_agv_gtu.bim' bim8 = 'omni2.5-8_agv_20120910_gtu.bim' strand8 = 'HumanOmni2.5-8v1_A-b37.strand' strand437 = 'HumanOmni2.5M-b37-v2.strand' strand436 = 'HumanOmni2.5M-b37-v2.strand' gnuplot.venn3( i1=1,i2=1,i3=1,i4=1,i5=1,i6=1,i7=1, text1='%s %i' %(strand436,2449906,), text2='%s %i' %(strand437,2449626,), text3='%s %i' %(strand8,2379514,), sum1 = 280, sum2 = 0, sum3 = 2376802, sum4 = ) d_count = {} for fn in (bim4,bim8,strand436,strand437,strand8): cmd = 'cat %s | wc -l' %(fn) i = int(os.popen(cmd).read()) d_count[fn] = i ## show that strand files are subsets of bim files for fn1,fn2 in [ [strand436,bim4,], [strand437,bim4,],
def _mean_or_nan(l_values):

    '''return the arithmetic mean of l_values, or NaN if l_values is empty'''

    if not l_values:
        return float('nan')
    return sum(l_values)/len(l_values)


def count_unique_and_intersect_vqsr(
    fp_vcf_individual,fp_vcf_combined,fp_map,
    suffix,
    bool_combined1 = False,
    bool_combined2 = True,
    bool_ignore_FILTER1 = False,
    fp_tped = '../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.tped',
    ):

    '''Count markers unique to / shared between two VCF streams and a map file.

    this function assumes that markers in the VCFs are sorted

    The three inputs are walked in lock-step (one current marker per input),
    so no input is read into memory as a whole. Positions are compared and
    the input(s) holding the smallest marker are advanced.

    Parameters:
    fp_vcf_individual -- path of input 1; the text '$CHROMOSOME' is replaced
        by the chromosome name. The unreplaced path is passed on to
        loop_multiple_vcf as a template.
    fp_vcf_combined -- path of input 2; handled like fp_vcf_individual.
    fp_map -- path of input 3, read line by line; the first
        whitespace-separated field is used as the chromosome and the fourth
        as the integer position.
    suffix -- passed through to gnuplot.venn3.
    bool_combined1 -- if true input 1 is read with loop_single_vcf,
        otherwise with loop_multiple_vcf.
    bool_combined2 -- same switch for input 2.
    bool_ignore_FILTER1 -- forwarded to loop_multiple_vcf for input 1 only.
    fp_tped -- path of a file read one line per fp_map line; fields from the
        fifth onward are used to derive a frequency for the current map
        marker (presumably allele calls of a PLINK tped - TODO confirm).

    Side effects: prints progress and summary counts, writes AF3.txt,
    AF13.txt, AF23.txt and AF123.txt to the working directory and calls
    gnuplot.histogram2 and gnuplot.venn3.

    Returns None.

    Raises RuntimeError when the merge reaches a state the original code
    aborted on (it evaluated an undefined name there).
    '''

    ## set file paths
    fp2_template = fp2 = fp_vcf_combined
    fp1_template = fp1 = fp_vcf_individual
    fp3 = fp_map

    print(fp1)
    print(fp2)
    print(fp3)

    ## set list of chromosomes (genotype array only contains autosomal SNPs)
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]

    ## set initial chromosome
    chromosome1 = l_chromosomes[0]
    chromosome2 = l_chromosomes[0]

    fp1 = fp1.replace('$CHROMOSOME',chromosome1,)
    fp2 = fp2.replace('$CHROMOSOME',chromosome2,)

    ## a True read flag means "fetch the next marker of that input"
    bool_read1 = True
    bool_read2 = True
    bool_read3 = True
    bool_EOF1 = False
    bool_EOF2 = False
    bool_EOF3 = False

    ## set counters before loop
    count_intersect12 = 0
    count_intersect13 = 0
    count_intersect23 = 0
    count1 = 0
    count2 = 0
    count3 = 0
    count_intersect123 = 0
    l_AF13 = []
    l_AF23 = []
    l_AF3 = []
    l_AF123 = []

    fd1 = fd2 = fd3 = fd3b = None
    try:

        ## open files before loop
        fd1 = open(fp1,'r')
        fd2 = open(fp2,'r')
        fd3 = open(fp3,'r')
        fd3b = open(fp_tped,'r')

        i = 0
        while True:

            i += 1
            if i % 10000 == 0:
                ## progress line; built as one string so that it prints the
                ## same under the print statement and the print function
                print(' '.join(str(v) for v in (
                    CHROM1, CHROM2, CHROM3, POS1, POS2, POS3, '|',
                    bool_read1, bool_read2, bool_read3, '|',
                    bool_EOF1, bool_EOF2, bool_EOF3, i//10000,
                    )))

            if bool_read1:
                if bool_combined1:
                    CHROM1, POS1, count1, bool_EOF1 = loop_single_vcf(
                        fd1,count1,)
                else:
                    ## fd1 and chromosome1 are rebound from the return value
                    (
                        CHROM1, POS1, count1, fd1, bool_EOF1, chromosome1,
                        ) = loop_multiple_vcf(
                            fd1,count1,fp1_template,chromosome1,l_chromosomes,
                            bool_ignore_FILTER=bool_ignore_FILTER1,
                            )
                bool_read1 = False

            if bool_read2:
                if bool_combined2:
                    CHROM2, POS2, count2, bool_EOF2 = loop_single_vcf(
                        fd2,count2,)
                else:
                    (
                        CHROM2, POS2, count2, fd2, bool_EOF2, chromosome2,
                        ) = loop_multiple_vcf(
                            fd2,count2,fp2_template,chromosome2,l_chromosomes,
                            )
                bool_read2 = False

            if bool_read3:
                line3 = fd3.readline()
                if line3 == '':
                    bool_EOF3 = True
                    CHROM3 = None
                    POS3 = None
                else:
                    l3 = line3.split()
                    CHROM3 = l3[0]
                    POS3 = int(l3[3])
                    count3 += 1
                    ## tmp AF
                    ## frequency of the first listed value, folded to <= 0.5
                    ## NOTE(review): 184 is hard-coded; presumably the number
                    ## of values per line (2 x 92 samples) - TODO confirm
                    line3b = fd3b.readline()
                    l_calls = line3b.split()[4:]
                    AF = l_calls.count(l_calls[0])/184.
                    if AF > 0.5:
                        AF = 1-AF
                    l_AF3.append(AF)
                bool_read3 = False

            if bool_EOF1 and bool_EOF2 and bool_EOF3:
                break

            ## looping over lines simultaneously
            ## to avoid reading all markers into memory

            ##
            ## triple intersection
            ##
            ## NOTE(review): only positions are compared here; the check on
            ## CHROM1 == CHROM2 == CHROM3 was disabled in the original
            if POS1 == POS2 == POS3:
                if POS1 % 1000 == 0:
                    print('%s %6i | %8i %8i %8i | %8i | %8i %8i %8i' %(
                        CHROM1, POS1//1000,
                        count_intersect12, count_intersect13,
                        count_intersect23,
                        count_intersect123,
                        count1, count2, count3,
                        ))
                count_intersect123 += 1
                l_AF123.append(AF)
                bool_read1 = True
                bool_read2 = True
                bool_read3 = True

            ##
            ## double intersection 1+2
            ##
            elif POS1 is not None and POS1 == POS2:
                if bool_EOF3:
                    bool_count12 = True
                elif CHROM1 == CHROM3:
                    ## input 3 must first catch up on the same chromosome
                    bool_count12 = POS1 < POS3
                else:
                    ## input 3 is on another chromosome;
                    ## it only has to catch up if it is on an earlier one
                    bool_count12 = not (
                        l_chromosomes.index(CHROM3)
                        < l_chromosomes.index(CHROM1)
                        )
                if bool_count12:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                else:
                    bool_read3 = True

            ##
            ## double intersection 1+3
            ##
            elif POS1 is not None and POS1 == POS3:
                if bool_EOF2 or (CHROM1 == CHROM2 and POS1 < POS2):
                    count_intersect13 += 1
                    l_AF13.append(AF)
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read2 = True
                else:
                    ## the original printed both chromosomes and aborted
                    raise RuntimeError(
                        'inputs 1 and 3 share a position while inputs 1 and 2'
                        ' are on different chromosomes: %s %s'
                        %(CHROM1,CHROM2,))

            ##
            ## double intersection 2+3
            ##
            elif POS2 is not None and POS2 == POS3:
                if bool_EOF1 or (CHROM1 == CHROM2 and POS2 < POS1):
                    count_intersect23 += 1
                    l_AF23.append(AF)
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read1 = True
                elif (
                    l_chromosomes.index(CHROM1) < l_chromosomes.index(CHROM2)
                    ):
                    bool_read1 = True
                else:
                    ## NOTE(review): the original incremented
                    ## count_intersect23 here and then aborted on an
                    ## undefined debug name, so this branch never ran to
                    ## completion; it is kept as a hard failure
                    raise RuntimeError(
                        'inputs 2 and 3 share a position while input 1 is on'
                        ' a later chromosome: %s %s %s %s %s %s'
                        %(CHROM1,CHROM2,CHROM3,POS1,POS2,POS3,))

            ##
            ## no intersection, different chromosomes
            ##
            elif (
                CHROM1 != CHROM2
                or
                (not bool_EOF3 and CHROM2 != CHROM3)
                ):
                ## advance every input that sits on the earliest chromosome;
                ## exhausted inputs keep index None and never match
                index1 = index2 = index3 = None
                if not bool_EOF1:
                    index1 = l_chromosomes.index(CHROM1)
                if not bool_EOF2:
                    index2 = l_chromosomes.index(CHROM2)
                if not bool_EOF3:
                    index3 = l_chromosomes.index(CHROM3)
                min_index = min(
                    index for index in (index1,index2,index3,)
                    if index is not None
                    )
                if index1 == min_index:
                    bool_read1 = True
                if index2 == min_index:
                    bool_read2 = True
                if index3 == min_index:
                    bool_read3 = True

            ##
            ## no intersection, same chromosome
            ## (CHROM1 == CHROM2 is implied by the previous condition)
            ##
            else:
                ## either read 1 or 3
                if POS1 < POS2:
                    if bool_EOF3 or POS1 < POS3:
                        bool_read1 = True
                    else:
                        bool_read3 = True
                elif bool_EOF3:
                    bool_read2 = True
                ## either read 2 or 3
                elif CHROM2 == CHROM3:
                    if POS2 < POS3:
                        bool_read2 = True
                    else:
                        bool_read3 = True
                else:
                    raise RuntimeError(
                        'unexpected marker state: %s %s %s %s %s %s'
                        %(CHROM1,CHROM2,CHROM3,POS1,POS2,POS3,))

    finally:
        ## close whatever handles are current; fd1/fd2 may have been
        ## replaced by loop_multiple_vcf during the loop
        for fd in (fd1,fd2,fd3,fd3b,):
            if fd is not None:
                fd.close()

    count_unique1 = (
        count1-count_intersect12-count_intersect13-count_intersect123)
    count_unique2 = (
        count2-count_intersect12-count_intersect23-count_intersect123)
    count_unique3 = (
        count3-count_intersect13-count_intersect23-count_intersect123)

    for count in (
        count1, count2, count3,
        count_intersect12, count_intersect13, count_intersect23,
        count_intersect123,
        ):
        print(count)
    print('')
    print(count_unique1)
    print(count_unique2)
    print(count_unique3)
    print('')
    print(fp1)
    print(fp2)
    print(fp3)

    l_datasets = [
        ('AF3','MAF distribution - 2.5M chip array',l_AF3,),
        ('AF13','MAF distribution - 2.5M chip array and HGI SNPs',l_AF13,),
        ('AF23','MAF distribution - 2.5M chip array and GATK SNPs',l_AF23,),
        (
            'AF123',
            'MAF distribution - 2.5M chip array and HGI and GATK SNPs',
            l_AF123,
            ),
        ]

    ## an empty list used to raise ZeroDivisionError here and lose the run
    for name,title,l_data in l_datasets:
        print('%s %s' %(name,_mean_or_nan(l_data),))

    for name,title,l_data in l_datasets:
        gnuplot.histogram2(
            name,title=title,l_data=l_data,
            x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,
            xlabel='MAF',ylabel='SNP count',)

    for name,title,l_data in l_datasets:
        with open('%s.txt' %(name),'w') as fd:
            fd.write('\n'.join(str(f) for f in l_data))

    gnuplot.venn3(
        i1 = count_unique1,
        i2 = count_unique2,
        i3 = count_unique3,
        i4 = count_intersect12,
        i5 = count_intersect13,
        i6 = count_intersect23,
        i7 = count_intersect123,
        text1 = '%s' %(fp1),
        text2 = '%s' %(fp2),
        text3 = '%s' %(fp3),
        suffix = suffix,
        )

    return