Exemplo n.º 1
0
sys.path.append('/nfs/users/nfs_t/tc9/github/tc9/misc')
import gnuplot


## marker (.bim) files to be compared
## (presumably one per Omni2.5 chip version, judging by the file names)
bim4 = 'omni2.5-4_20120904_agv_gtu.bim'
bim8 = 'omni2.5-8_agv_20120910_gtu.bim'

## strand files to be compared against the .bim files
## NOTE(review): strand437 and strand436 are assigned the same file name,
## although the Venn labels further down report different counts for them
## (2449626 vs 2449906) - one of the two names is probably wrong; confirm.
strand8 = 'HumanOmni2.5-8v1_A-b37.strand'
strand437 = 'HumanOmni2.5M-b37-v2.strand'
strand436 = 'HumanOmni2.5M-b37-v2.strand'


## draw a three-set Venn diagram of the strand files;
## all region sizes (i1..i7) are set to 1 and the marker counts
## in the labels and sums are hard-coded
## NOTE(review): the `sum4 =` keyword argument below has no value, so this
## call is a syntax error as written - supply the value or drop the argument.
gnuplot.venn3(
    i1=1,i2=1,i3=1,i4=1,i5=1,i6=1,i7=1,
    text1='%s %i' %(strand436,2449906,),
    text2='%s %i' %(strand437,2449626,),
    text3='%s %i' %(strand8,2379514,),
    sum1 = 280, sum2 = 0, sum3 = 2376802,
    sum4 = 
    )

    
## count the number of lines in each input file
## by running `cat FILE | wc -l` in a shell; d_count maps file name to line count
## NOTE(review): relies on `os` being imported elsewhere in the file
d_count = {}
for fn in (bim4,bim8,strand436,strand437,strand8):
    cmd = 'cat %s | wc -l' %(fn)
    i = int(os.popen(cmd).read())
    d_count[fn] = i

## show that strand files are subsets of bim files
for fn1,fn2 in [
    [strand436,bim4,],
    [strand437,bim4,],
Exemplo n.º 2
0
def count_unique_and_intersect_vqsr(
    fp_vcf_individual,fp_vcf_combined,fp_map,
    suffix,
    bool_combined1 = False,
    bool_combined2 = True,
    bool_ignore_FILTER1 = False,
    ):

    '''this function assumes that markers in the VCFs are sorted'''

    ## set file paths
    fp2_template = fp2 = fp_vcf_combined
    fp1_template = fp1 = fp_vcf_individual
    fp3 = fp_map

    print fp1
    print fp2
    print fp3

    ## set list of chromosomes (genotype array only contains autosomal SNPs)
    l_chromosomes = [str(i) for i in range(1,22+1,)]+['X','Y',]

    ## set initial chromosome
    chromosome1 = l_chromosomes[0]
    chromosome2 = l_chromosomes[0]
##    chromosome1 = chromosome2 = '22'

    fp1 = fp1.replace('$CHROMOSOME',chromosome1,)
    fp2 = fp2.replace('$CHROMOSOME',chromosome2,)

    ## set booleans before loop
    bool_read1 = True
    bool_read2 = True
    bool_read3 = True
    bool_EOF1 = False
    bool_EOF2 = False
    bool_EOF3 = False

    ## set counters before loop
    count_intersect12 = 0
    count_intersect13 = 0
    count_intersect23 = 0
    count1 = 0
    count2 = 0
    count3 = 0
    count_intersect123 = 0

    l_AF13 = []
    l_AF23 = []
    l_AF3 = []
    l_AF123 = []

    ## open files before loop
    fd1 = open(fp1,'r')
    fd2 = open(fp2,'r')
    fd3 = open(fp3,'r')
    fd3b = open('../omni2.5-8_20120516_gwa_ugand_gtu_autosomes_postsampqc_postsnpqc_flipped.tped','r')
    
####    fd1.seek(1500000000000)
####    s = fd1.readline()
####    s = fd1.readline()
##    fd2.seek(65000000000)
##    s = fd2.readline()
##    s = fd2.readline()
##    fd3.seek(50000000)
##    s = fd3.readline()
##    s = fd3.readline()

    i = 0

    while True:

        i += 1

        if i % 10000 == 0:
            print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3, '|', bool_read1, bool_read2, bool_read3, '|', bool_EOF1, bool_EOF2, bool_EOF3, i/10000
####            if CHROM1 != None and CHROM2 != None and CHROM3 != None:
####                if abs(l_chromosomes.index(CHROM1)-l_chromosomes.index(CHROM2)) > 1:
####                    print CHROM1, CHROM2
####                    stop1
####                if abs(l_chromosomes.index(CHROM2)-l_chromosomes.index(CHROM3)) > 1:
####                    print CHROM2, CHROM3
####                    stop2
##        if bool_read2 == True and count2-(count_intersect12+count_intersect23+count_intersect123) > 0:
##            print
##            print 2, fp2
##            print CHROM2, '***', POS2, '***'
##            print count2-(count_intersect12+count_intersect23+count_intersect123)
##            print count2, count_intersect12, count_intersect23, count_intersect123
##            print fp1, chromosome1
##            print fp2, chromosome2
##            stop2
####        if bool_read1 == True and count1-(count_intersect12+count_intersect23+count_intersect123) > 0:
####            print
####            print 1, fp1
####            print CHROM1, '***', POS1, '***'
####            print count1-(count_intersect12+count_intersect13+count_intersect123)
####            print count1, count_intersect12, count_intersect13, count_intersect123
####            print fp1, chromosome1
####            print fp2, chromosome2
####            stop1

        if bool_read1 == True:
            if bool_combined1 == True:
                CHROM1, POS1, count1, bool_EOF1 = loop_single_vcf(fd1,count1,)
            else:
                (
                    CHROM1, POS1, count1, fd1, bool_EOF1,
                    chromosome1,
                    ) = loop_multiple_vcf(
                        fd1,count1,fp1_template,chromosome1,l_chromosomes,
                        bool_ignore_FILTER=bool_ignore_FILTER1,
                        )
            bool_read1 = False

        if bool_read2 == True:
            if bool_combined2 == True:
                CHROM2, POS2, count2, bool_EOF2 = loop_single_vcf(fd2,count2,)
            else:
                (
                    CHROM2, POS2, count2, fd2, bool_EOF2,
                    chromosome2,
                    ) = loop_multiple_vcf(
                        fd2,count2,fp2_template,chromosome2,l_chromosomes,
                        )
            bool_read2 = False

        if bool_read3 == True:
            line3 = fd3.readline()
            if line3 == '':
                bool_EOF3 = True
                CHROM3 = None
                POS3 = None
            else:
                l3 = line3.split()
                CHROM3 = l3[0]
                POS3 = int(l3[3])
                count3 += 1

                ## tmp AF
                line3b = fd3b.readline()
                l = line3b.split()[4:]
                AF = l.count(l[0])/184.
                if AF > 0.5:
                    AF = 1-AF
                l_AF3 += [AF]

            bool_read3 = False


        if bool_EOF1 == True and bool_EOF2 == True and bool_EOF3 == True:
            break

##        print POS1, POS2, POS3, CHROM1, CHROM2, CHROM3
##        stop

        ## doing nested if statements is the fastest method of comparison
        ## looping over lines simultaneously to avoid reading all markers into memory

        ##
        ## triple intersection
        ##
##        if POS1 == POS2 == POS3 and CHROM1 == CHROM2 == CHROM3:
        if POS1 == POS2 == POS3:
            if POS1 % 1000 == 0:
                print CHROM1, '%6i' %(POS1/1000), '|',
                print '%8i' %(count_intersect12), '%8i' %(count_intersect13), '%8i' %(count_intersect23), '|',
                print '%8i' %(count_intersect123), '|',
                print '%8i' %(count1), '%8i' %(count2), '%8i' %(count3)
##            count_intersection12 += 1
##            count_intersection13 += 1
##            count_intersection23 += 1
            count_intersect123 += 1
            l_AF123 += [AF]
            bool_read1 = True
            bool_read2 = True
            bool_read3 = True
        ##
        ## double intersection
        ##
        else:
            ## it is faster to do nesting of logical statements
            ## when comparing long integers
            if POS1 == POS2 != None:
                if bool_EOF3 == True:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3 and POS1 < POS3:
                    count_intersect12 += 1
                    bool_read1 = True
                    bool_read2 = True
                elif CHROM1 == CHROM3:
                    bool_read3 = True
                else:
                    if l_chromosomes.index(CHROM3) < l_chromosomes.index(CHROM1):
                        bool_read3 = True
                    else:
                        count_intersect12 += 1
                        bool_read1 = True
                        bool_read2 = True
            elif POS1 == POS3 != None:
                if bool_EOF2 == True:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS1 < POS2:
                    count_intersect13 += 1
                    l_AF13 += [AF]
                    bool_read1 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read2 = True
                else:
                    print CHROM1, CHROM2
                    stop2
            elif POS2 == POS3 != None:
                if bool_EOF1 == True:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2 and POS2 < POS1:
                    count_intersect23 += 1
                    l_AF23 += [AF]
                    bool_read2 = True
                    bool_read3 = True
                elif CHROM1 == CHROM2:
                    bool_read1 = True
                else:
                    if l_chromosomes.index(CHROM1) < l_chromosomes.index(CHROM2):
                        bool_read1 = True
                    else:
                        count_intersect23 += 1
                        bool_read2 = True
                        bool_read3 = True
                        stop3tmp_wegethereornot
            ##
            ## no intersection
            ##
            else:
                ## different chromosomes
                if (
                    (CHROM1 != CHROM2)
                    or
                    (bool_EOF3 == False and CHROM2 != CHROM3)
                    ):
                    l_indexes= []
                    if bool_EOF1 == False:
                        index1 = l_chromosomes.index(CHROM1)
                        l_indexes += [index1]
                    if bool_EOF2 == False:
                        index2 = l_chromosomes.index(CHROM2)
                        l_indexes += [index2]
                    if bool_EOF3 == False:
                        index3 = l_chromosomes.index(CHROM3)
                        l_indexes += [index3]
                    min_index = min(l_indexes)
                    if bool_EOF1 == False and index1 == min_index:
                        bool_read1 = True
                    if bool_EOF2 == False and index2 == min_index:
                        bool_read2 = True
                    if bool_EOF3 == False and index3 == min_index:
                        bool_read3 = True
                ## same chromosome
                else:
                    ## either read 1 or 3
                    if CHROM1 == CHROM2 and POS1 < POS2:
                        if bool_EOF3 == True or POS1 < POS3:
                            bool_read1 = True
                        else:
                            bool_read3 = True
                    elif bool_EOF3 == True:
                        bool_read2 = True
                    ## either read 2 or 3
                    elif CHROM2 == CHROM3:
                        if bool_EOF3 == True or POS2 < POS3:
                            bool_read2 = True
                        else:
                            bool_read3 = True
                    else:
                        print CHROM1, CHROM2, CHROM3, POS1, POS2, POS3
                        stop

    print count1
    print count2
    print count3
    print count_intersect12
    print count_intersect13
    print count_intersect23
    print count_intersect123
    print
    print count1-count_intersect12-count_intersect13-count_intersect123
    print count2-count_intersect12-count_intersect23-count_intersect123
    print count3-count_intersect13-count_intersect23-count_intersect123
    print
    print fp1
    print fp2
    print fp3

    print 'AF3', sum(l_AF3)/len(l_AF3)
    print 'AF13', sum(l_AF13)/len(l_AF13)
    print 'AF23', sum(l_AF23)/len(l_AF23)
    print 'AF123', sum(l_AF123)/len(l_AF123)

    gnuplot.histogram2(
        'AF3',title='MAF distribution - 2.5M chip array',l_data=l_AF3,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF13',title='MAF distribution - 2.5M chip array and HGI SNPs',l_data=l_AF13,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF23',title='MAF distribution - 2.5M chip array and GATK SNPs',l_data=l_AF23,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)
    gnuplot.histogram2(
        'AF123',title='MAF distribution - 2.5M chip array and HGI and GATK SNPs',l_data=l_AF123,
        x_min=0,x_max=0.5,x_step=0.01,tic_step=0.05,xlabel='MAF',ylabel='SNP count',)

    l_AF3 = [str(f) for f in l_AF3]
    l_AF13 = [str(f) for f in l_AF13]
    l_AF23 = [str(f) for f in l_AF23]
    l_AF123 = [str(f) for f in l_AF123]

    fd = open('AF3.txt','w')
    fd.write('\n'.join(l_AF3))
    fd.close()
    fd = open('AF13.txt','w')
    fd.write('\n'.join(l_AF13))
    fd.close()
    fd = open('AF23.txt','w')
    fd.write('\n'.join(l_AF23))
    fd.close()
    fd = open('AF123.txt','w')
    fd.write('\n'.join(l_AF123))
    fd.close()

    gnuplot.venn3(
        i1 = count1-count_intersect12-count_intersect13-count_intersect123,
        i2 = count2-count_intersect12-count_intersect23-count_intersect123,
        i3 = count3-count_intersect13-count_intersect23-count_intersect123,
        i4 = count_intersect12,
        i5 = count_intersect13,
        i6 = count_intersect23,
        i7 = count_intersect123,
        text1 = '%s' %(fp1),
        text2 = '%s' %(fp2),
        text3 = '%s' %(fp3),
        suffix = suffix,
        )

    return