Exemplo n.º 1
0
def overallCaptureFraction(dirinput, fileList, probes, region='intron'):
    """Return, per file, each base change's share of the captured alt reads.

    For every file in ``fileList`` the function sums ``int(VCFObj.ao)`` over
    single-base substitutions with ``af < 0.1`` whose location matches a probe
    location of the requested region.  Sums are grouped into the six
    pyrimidine-centred classes (C>A, C>G, C>T, T>A, T>C, T>G); changes on a
    purine reference base are folded in by complementing both alleles.  The
    six sums are then divided by their total.

    Args:
        dirinput: Directory containing the VCF files.
        fileList: File names (relative to ``dirinput``) to read, one sample
            per file.
        probes: Mapping of probe name to an iterable of location strings.
            Probe names starting with 'T' are treated as intronic, all other
            names as exonic.
        region: 'intron' or 'exon'; selects which probes are counted.

    Returns:
        dict mapping each of the six change labels to a list of fractions,
        one entry per file in ``fileList`` order.  A file without any
        qualifying variant contributes 0.0 to every class instead of raising
        ZeroDivisionError.

    Raises:
        ValueError: if ``region`` is neither 'intron' nor 'exon'.
    """
    print('Reading overall captured base changes...')

    # Fail fast: with an unknown region no probe would ever be selected and
    # every sample would silently come out as all zeros.
    if region not in ('intron', 'exon'):
        raise ValueError("region must be 'intron' or 'exon', got %r"
                         % (region,))

    from parseline import VCFObj
    from Bio.Seq import Seq

    changes = ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
    vafs = dict((change, []) for change in changes)

    # The probe set does not change between files, so collect the locations
    # of the requested region once instead of re-testing names per variant.
    wantIntron = (region == 'intron')
    regionLocs = []
    for name in probes:
        if (name[0] == 'T') == wantIntron:
            regionLocs.extend(str(loc) for loc in probes[name])

    for fileName in fileList:
        counts = dict.fromkeys(changes, 0)

        with open(dirinput + '/' + fileName, 'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:  # skip the info
                    continue
                vcfObj = VCFObj(line)

                # only use single substitutions and eliminate SNPs
                if not (len(vcfObj.wt) == 1 and len(vcfObj.var) == 1
                        and vcfObj.af < 0.1):
                    continue

                varType = '%s>%s' % (vcfObj.wt, vcfObj.var)
                if varType not in counts:
                    varType = '%s>%s' % (str(Seq(vcfObj.wt).complement()),
                                         str(Seq(vcfObj.var).complement()))
                if varType not in counts:
                    # Not one of the six canonical substitutions (e.g. an
                    # ambiguous base); nothing sensible to tally.
                    continue

                # Substring match, as in the rest of this code base; a
                # variant is counted once per matching probe location.
                location = str(vcfObj.location)
                hits = sum(1 for loc in regionLocs if loc in location)
                if hits:
                    counts[varType] += int(vcfObj.ao) * hits

        # calculate fractions and transfer them into vafs
        totalAO = sum(counts.values())
        for change in changes:
            if totalAO:
                fraction = float(counts[change]) / float(totalAO)
            else:
                fraction = 0.0
            vafs[change].append(fraction)

    return vafs
Exemplo n.º 2
0
def readVCF(inFiles, inDir, ref, probes):
    """Compare mean alt-read support at C/G sites between exons and introns.

    For every sample, reads ``<inDir>/<sample>/onlyProbedRegions.vcf`` and
    keeps variants whose reference base is C or G, whose alt allele is one
    base long and whose ``af`` is below 0.1.  The 3-base reference context is
    fetched with ``getRefSequence(line, 1, ref)`` and the variant is kept only
    if the centre base plus its 3' neighbour read 'CG' or 'GC'.  ``ao`` is
    summed and hits are counted separately for exonic and intronic probes,
    each sum is divided by its hit count, and the two means are rescaled so
    they add up to 1.

    Probe names starting with 'T' are treated as intronic and all others as
    exonic, matching the other functions in this code base (the two
    accumulators used to be swapped here).

    Args:
        inFiles: Sample directory names below ``inDir``.
        inDir: Directory containing one sub-directory per sample.
        ref: Reference passed through to ``getRefSequence``.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        defaultdict(list) with keys 'Exonic' and 'Intronic', each holding one
        normalised value per sample.  A category without hits has mean 0.0;
        if both means are 0 both normalised values are 0.0.
    """
    from parseline import VCFObj
    from getSequence import getRefSequence
    from collections import defaultdict

    totalAO = defaultdict(list)
    print('TotExon ExonCount TotIntr IntrCount '
          'Exonic Intronic NormExon NormIntron')

    for sample in inFiles:
        totalExon = 0
        totalIntron = 0
        exonCount = 0
        intronCount = 0

        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf',
                  'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:  # skip the info
                    continue
                lobj = VCFObj(line)

                # only C or G could be in CpG context
                if not (lobj.wt in ('C', 'G') and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue

                seq = getRefSequence(line, 1, ref)
                # NOTE(review): for a reference G the CpG partner is the 5'
                # base (seq[0:2] == 'CG'); 'GC' here tests the 3' neighbour.
                # Left unchanged - confirm the intended context.
                if seq[1:3] not in ('CG', 'GC'):
                    continue

                location = str(lobj.location)
                for name in probes:
                    for loc in probes[name]:
                        if str(loc) not in location:
                            continue
                        if name[0] == 'T':
                            intronCount += 1
                            totalIntron += lobj.ao
                        else:
                            exonCount += 1
                            totalExon += lobj.ao

        # first normalize by total number of sites in each category;
        # float() avoids integer floor division on Python 2
        exonic = float(totalExon) / exonCount if exonCount else 0.0
        intronic = float(totalIntron) / intronCount if intronCount else 0.0

        # normalize to 100 percent
        combined = exonic + intronic
        normexonic = exonic / combined if combined else 0.0
        normintronic = intronic / combined if combined else 0.0
        print(' '.join(str(value) for value in (
            totalExon, exonCount, totalIntron, intronCount,
            exonic, intronic, normexonic, normintronic)))

        # append normalized results
        totalAO['Exonic'].append(normexonic)
        totalAO['Intronic'].append(normintronic)

    print(totalAO)
    return totalAO
Exemplo n.º 3
0
def parseAll(inFiles, inDir, ref, probes):
    """Split mean alt-read support per base change between exons and introns.

    For every sample, reads ``<inDir>/<sample>/onlyProbedRegions.vcf`` and
    keeps single-base substitutions with ``af < 0.1``.  Each is assigned to
    one of six pyrimidine-centred classes (purine reference bases are
    complemented).  For every probe location matching the variant location,
    ``ao`` is added to the intronic tally (probe name starts with 'T') or to
    the exonic tally (any other name).  Per class, each tally is divided by
    its number of hits and the exonic/intronic means are rescaled to sum
    to 1.

    Args:
        inFiles: Sample directory names below ``inDir``.
        inDir: Directory containing one sub-directory per sample.
        ref: Unused; kept so the signature matches the sibling parsers.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        dict mapping each change label to a defaultdict(list) with keys
        'exon' and 'intron', one value per sample, e.g.
        ``{'C>A': {'intron': [0.4, 0.5], 'exon': [0.6, 0.5]}}``.
        A class without hits in a region has mean 0.0 there; if both means
        are 0 both normalised values are 0.0.
    """
    from parseline import VCFObj
    from Bio.Seq import Seq
    from collections import defaultdict

    changes = ('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
    allbases = dict((change, defaultdict(list)) for change in changes)

    for sample in inFiles:
        # per change: [summed AO, number of probe hits]
        exon = dict((change, [0, 0]) for change in changes)
        intron = dict((change, [0, 0]) for change in changes)

        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf',
                  'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:  # skip the info
                    continue
                lobj = VCFObj(line)

                # only look at substitutions and non SNPs
                if not (len(lobj.wt) == 1 and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue

                label = '%s>%s' % (lobj.wt, lobj.var)
                if label not in exon:
                    label = '%s>%s' % (str(Seq(lobj.wt).complement()),
                                       str(Seq(lobj.var).complement()))
                if label not in exon:
                    # not one of the six canonical substitutions
                    continue

                location = str(lobj.location)
                for name in probes:
                    for loc in probes[name]:
                        if str(loc) in location:
                            bucket = intron if name[0] == 'T' else exon
                            bucket[label][0] += lobj.ao
                            bucket[label][1] += 1

        for change in changes:
            # first normalize by total number of sites in each category.
            # This must use the loop key: the previous code indexed with the
            # label of the last parsed variant, so only that class was
            # (repeatedly) divided.
            exonSum, exonHits = exon[change]
            intronSum, intronHits = intron[change]
            exonMean = float(exonSum) / exonHits if exonHits else 0.0
            intronMean = float(intronSum) / intronHits if intronHits else 0.0

            # normalize to 100 percent
            combined = exonMean + intronMean
            allbases[change]['exon'].append(
                exonMean / combined if combined else 0.0)
            allbases[change]['intron'].append(
                intronMean / combined if combined else 0.0)

    return allbases
Exemplo n.º 4
0
def populatePandasDataframe(dirinput, fileList, probes, ref, upstream=10, downstream=10):
    """Collect all low-frequency single-base substitutions into one DataFrame.

    For every sample, reads ``<dirinput>/<sample>/onlyProbedRegions.vcf`` and
    emits one row per variant whose ref and alt alleles are one base long and
    whose ``af`` is below 0.1.

    Relies on ``VCFObj`` and ``getRefSequence`` being available in the
    enclosing module scope.

    Args:
        dirinput: Directory containing one sub-directory per sample.
        fileList: Sample directory names; the 1-based position in this list
            is stored in the 'Individual' column.
        probes: Mapping of probe name to an iterable of locations.  The first
            probe with a location matching the variant decides 'IntEx':
            'TIII' when the probe name starts with 'T', otherwise 'Exon';
            '' when no probe matches.
        ref: Reference passed through to ``getRefSequence``.
        upstream: Number of leading characters of the fetched sequence stored
            in 'Upstream'.
        downstream: Number of trailing characters of the fetched sequence
            stored in 'Downstream' ('' when 0).

    Returns:
        pandas.DataFrame with object-typed columns Loc, WT, Var, Change,
        ConvChange, AO, DP, VAF, IntEx, Upstream, Downstream, Individual and
        a 0..n-1 index.  Empty (columns only) when nothing qualifies.
    """
    import pandas as pd
    from Bio.Seq import Seq
    print('Building data structure...')

    columns = ['Loc','WT','Var','Change','ConvChange','AO','DP','VAF','IntEx','Upstream','Downstream','Individual']
    rows = []

    for sampleCount, sample in enumerate(fileList, 1):
        with open(dirinput + '/' + sample + '/onlyProbedRegions.vcf',
                  'r') as inFile:
            for line in inFile:
                if '#' in line or 'chr' not in line:  # skip the info
                    continue
                lineobj = VCFObj(line)

                # Filter first so the reference lookup below is only done
                # for variants that end up in the table.
                if not (len(lineobj.wt) == 1 and len(lineobj.var) == 1
                        and lineobj.af < 0.1):
                    continue

                # convert to six changes
                if lineobj.wt == 'G' or lineobj.wt == 'A':
                    wt = str(Seq(lineobj.wt).complement())
                    var = str(Seq(lineobj.var).complement())
                else:
                    wt = str(lineobj.wt)
                    var = str(lineobj.var)

                surrounding = getRefSequence(lineobj, upstream, downstream, ref)
                up = str(surrounding[:upstream])
                # [-0:] would return the whole string, so handle 0 explicitly
                down = str(surrounding[-downstream:]) if downstream else ''

                location = str(lineobj.location)
                probeRegion = ''
                for probe in probes:
                    if any(str(loc) in location for loc in probes[probe]):
                        probeRegion = 'TIII' if probe[0] == 'T' else 'Exon'
                        break

                rows.append([
                    lineobj.location,
                    str(lineobj.wt),
                    str(lineobj.var),
                    str(lineobj.wt) + '>' + str(lineobj.var),
                    wt + '>' + var,
                    lineobj.ao,
                    lineobj.dp,
                    lineobj.af,
                    probeRegion,
                    up,
                    down,
                    sampleCount,
                ])

    # One construction instead of one DataFrame per row plus concat; this
    # also works for zero rows, where pd.concat([]) raises ValueError.
    # dtype=object keeps the column types the per-row construction produced.
    allVariants = pd.DataFrame(rows, columns=columns, dtype=object)

    return allVariants
Exemplo n.º 5
0
def parseAllVAF(inFiles, inDir, ref, probes):
    """Average ``af`` per (context, base change) for introns and exons.

    For every sample, reads ``<inDir>/<sample>/onlyProbedRegions.vcf`` and
    keeps single-base substitutions with ``af < 0.1``.  Variants on a C or T
    reference base are keyed by the sequence returned from
    ``getRefSequence(line, 1, ref)`` and the change as written; for the other
    reference bases both alleles are complemented and the sequence is
    reverse-complemented.  ``af`` is collected once per matching probe
    location, into the intronic set when the probe name starts with 'T' and
    into the exonic set otherwise, and averaged per sample with ``np.mean``.

    Args:
        inFiles: Sample directory names below ``inDir``.
        inDir: Directory containing one sub-directory per sample.
        ref: Reference passed through to ``getRefSequence``.
        probes: Mapping of probe name to an iterable of locations.

    Returns:
        Tuple ``(intron, exon)`` of defaultdict(list) keyed by
        ``(sequence, change)``, e.g. ``{('CCG', 'C>A'): [0.75, 0.87]}``.
        Each list holds one mean per sample in which the key occurred.
    """
    from parseline import VCFObj
    from Bio.Seq import Seq
    from getSequence import getRefSequence
    from collections import defaultdict
    import numpy as np

    intron = defaultdict(list)
    exon = defaultdict(list)

    for sample in inFiles:
        tempintron = defaultdict(list)
        tempexon = defaultdict(list)

        with open(inDir + '/' + sample + '/onlyProbedRegions.vcf',
                  'r') as target:
            for line in target:
                if '#' in line or 'chr' not in line:  # skip the info
                    continue
                lobj = VCFObj(line)

                # only look at substitutions and non SNPs
                if not (len(lobj.wt) == 1 and len(lobj.var) == 1
                        and lobj.af < 0.1):
                    continue

                # Find the probe hits first: True marks an intronic hit,
                # False an exonic one.  Variants without a hit then skip the
                # reference lookup entirely.
                location = str(lobj.location)
                hits = []
                for name in probes:
                    for loc in probes[name]:
                        if str(loc) in location:
                            hits.append(name[0] == 'T')
                if not hits:
                    continue

                if lobj.wt == 'C' or lobj.wt == 'T':
                    label = '%s>%s' % (lobj.wt, lobj.var)
                    seq = getRefSequence(line, 1, ref)
                else:
                    label = '%s>%s' % (str(Seq(lobj.wt).complement()),
                                       str(Seq(lobj.var).complement()))
                    seq = str(Seq(getRefSequence(line, 1, ref))
                              .reverse_complement())

                key = (seq, label)
                for isIntron in hits:
                    if isIntron:
                        tempintron[key].append(lobj.af)
                    else:
                        tempexon[key].append(lobj.af)

        for key in tempintron:
            intron[key].append(np.mean(tempintron[key]))
        for key in tempexon:
            exon[key].append(np.mean(tempexon[key]))

    return intron, exon
Exemplo n.º 6
0
def getSequence(vcfLine, flankLength):
    """Fetch the hg19 sequence around a VCF record from the UCSC DAS server.

    Parses ``vcfLine`` with ``VCFObj``, requests the segment
    ``chrom:location-flankLength,location+flankLength`` through ``wget`` and
    concatenates every response line that does not contain '<' (i.e. drops
    the XML markup lines).

    Args:
        vcfLine: One VCF data line, as accepted by ``VCFObj``.
        flankLength: Number of bases to include on each side of the location.

    Returns:
        The concatenated non-markup lines of the response, case unchanged.

    Raises:
        subprocess.CalledProcessError: if ``wget`` exits with a non-zero
            status.
    """
    from parseline import VCFObj
    from subprocess import check_output

    vcfObj = VCFObj(vcfLine)

    position = int(vcfObj.location)
    low = position - flankLength
    high = position + flankLength
    url = ('http://genome.ucsc.edu/cgi-bin/das/hg19/dna?segment=%s:%s,%s'
           % (vcfObj.chrom, low, high))

    # Argument list instead of a shell string: the URL's '?' and any odd
    # characters in the chromosome name are passed through verbatim.
    # stderr is left alone so diagnostics cannot end up in the sequence, and
    # universal_newlines gives text on both Python 2 and 3.
    response = check_output(['wget', '-qO-', url], universal_newlines=True)

    return ''.join(part for part in response.split('\n') if '<' not in part)
Exemplo n.º 7
0
def getRefSequence(vcfLine, flankLength, ref):
    """Return the upper-cased reference sequence around a VCF record.

    Parses ``vcfLine`` with ``VCFObj`` and runs
    ``samtools faidx <ref> <chrom>:<location-flankLength>-<location+flankLength>``.
    Output lines containing '>' (the FASTA header) are dropped and the
    remaining lines are concatenated.

    Args:
        vcfLine: One VCF data line, as accepted by ``VCFObj``.
        flankLength: Number of bases to include on each side of the location.
        ref: Path of the reference FASTA handed to ``samtools faidx``.

    Returns:
        The fetched sequence as a single upper-case string.

    Raises:
        subprocess.CalledProcessError: if ``samtools`` exits with a non-zero
            status.
    """
    from parseline import VCFObj
    from subprocess import check_output

    vcfObj = VCFObj(vcfLine)

    position = int(vcfObj.location)
    low = position - flankLength
    high = position + flankLength
    regionSpec = '%s:%s-%s' % (vcfObj.chrom, low, high)

    # Argument list instead of a shell string, so paths with spaces or shell
    # metacharacters in ``ref`` are safe.  stderr is no longer merged into
    # stdout: samtools warnings would otherwise be appended to the sequence.
    # universal_newlines gives text on both Python 2 and 3.
    output = check_output(['samtools', 'faidx', ref, regionSpec],
                          universal_newlines=True)

    finalSeq = ''.join(part for part in output.split('\n')
                       if '>' not in part)
    return finalSeq.upper()