Exemplo n.º 1
0
def readVCF( filename, chrom ):
    """Load the calls and callable intervals of one chromosome from a VCF file.

    Header lines and records whose line does not start with `chrom` followed
    by a tab are skipped.  The INFO column of every remaining record is
    scanned for entries starting with START, END, CALLABLE, HR or TR.
    Records carrying a CALLABLE entry contribute their START/END pair to the
    interval list; every other record becomes an Indel object.

    Returns a tuple (vcf, callablelist):
      vcf          -- sorted list of Indel(pos, ref, alt, start, end, hr)
      callablelist -- flat list [start, end, start, end, ...]; an interval
                      that starts exactly one past the previous end is merged
                      into the previous interval.

    Raises ValueError if a selected record has fewer than 8 columns or a
    numeric INFO value cannot be parsed.
    """
    calls = []
    callablelist = []
    sys.stderr.write("#### Loading vcf for choromosome " + str(chrom) + "\n")
    prefix = chrom + "\t"
    for line in io.BufferedReader( filez.open(filename,'r') ):
        if line.startswith('#') or not line.startswith(prefix):
            continue
        # Strip the line terminator explicitly; slicing off the last character
        # would corrupt the final INFO value of a last line without a newline.
        elts = line.rstrip('\r\n').split('\t')
        # The chromosome column is known to equal `chrom`, so the parameter is
        # deliberately not rebound here.
        _chrom, pos, _id, ref, alt, _qual, _filt, info = elts[:8]
        start, end, is_callable, hr = -1, -1, False, 1
        for e in info.split(';'):
            # NOTE(review): keys are matched by prefix, as before; an unrelated
            # key that merely starts with one of these prefixes is also picked up.
            if e.startswith('START'): start = int(e.split('=')[1])
            if e.startswith('END'): end = int(e.split('=')[1])
            if e.startswith('CALLABLE'): is_callable = True
            if e.startswith('HR') or e.startswith('TR'): hr = max(hr, int(e.split('=')[1]))
        if not is_callable:
            calls.append( Indel(int(pos),ref,alt,start,end,hr) )
        elif callablelist and callablelist[-1]+1 == start:
            # contiguous with the previous interval: extend it
            callablelist[-1] = end
        else:
            callablelist.append( start )
            callablelist.append( end )
    sys.stderr.write("#### Loaded  %d  calls\n" % len(calls))
    calls.sort()
    return calls, callablelist
Exemplo n.º 2
0
def readmark( filename ):
    """Read a two-column, tab-separated file of (chromosome, position) marks.

    Positions in the file are 1-based and are stored 0-based.  Blank lines
    are ignored.

    Returns a dict mapping each chromosome name to the set of its 0-based
    positions.

    Raises ValueError if a non-blank line does not have exactly two
    tab-separated fields or the position is not an integer.
    """
    data = {}
    for line in filez.open(filename,'r'):
        # Strip the terminator explicitly: slicing off the last character
        # would drop a digit from a final line that has no newline.
        line = line.rstrip('\r\n')
        if not line:
            continue
        chrom, pos = line.split('\t')
        data.setdefault(chrom, set()).add(int(pos) - 1)   # convert into 0-based coords
    return data
Exemplo n.º 3
0
def main():
    """Add an AC (allele count) INFO entry to every record of a VCF stream.

    Options: -h/--help, -o/--output FILE, -i/--input FILE, -x/--ignore ERROR,
    -3 (read the input as VCF version 33 instead of 40), -F (set the
    parser's _fastGT flag).  Input defaults to stdin, output to stdout.

    For each record the alleles at genotype positions 0 and 2 of every
    sample are counted; samples whose GT has length 1 and '.' alleles are
    skipped.  When at least one allele was counted, INFO/AC is set to the
    list of counts for allele indices 0..len(alt).
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    fastGT = False
    v = vcf.VCF(leftalign=True, _fastGT=True)
    try:
        # Long options that take a value need a trailing '='; without it
        # getopt treats them as flags and the value is never delivered.
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3F", ["help", "output=", "input=", "ignore="])
    except getopt.GetoptError:
        # Show usage, then propagate: continuing would fail on unbound `opts`.
        help()
        raise
    for o, a in opts:
        if o in ["-h", "--help"]:
            help()
            sys.exit()
        elif o in ["-o", "--output"]:
            outfile = open(a, "w")
        elif o in ["-i", "--input"]:
            infile = filez.open(a, "r")
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o in ["-x", "--ignore"]:
            v.ignoreerror(a)

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse(infile)

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()["AC"] = vcf.FORMAT("AC", vcfout.NT_ALLELES, -1, "Integer", "Allele counts", -1)

    vcfout.writeheader(outfile)

    for data in vcfstream:
        counts = {}
        for s in v.getsamples():
            genotype = data[s].get("GT", ["."])[0]
            if len(genotype) == 1:
                continue  # haploid, or missing data
            for idx in 0, 2:
                allele = genotype[idx]
                if allele == ".":
                    continue
                counts[allele] = counts.get(allele, 0) + 1
        # Build the AC list once per record, after all samples are counted.
        if counts:
            data["info"]["AC"] = [counts.get(i, 0) for i in range(len(data["alt"]) + 1)]
        vcfout.write_data(outfile, data)
Exemplo n.º 4
0
def processVCF( invcf, ancvcf, ancvcf2, ancvcf3, ancvcf4, window=50 ):
    """Annotate the records of `invcf` with an ancestral allele (AA) and print them.

    invcf   -- name of the input VCF, opened with filez.open
    ancvcf  -- name of the first outgroup VCF (always loaded)
    ancvcf2, ancvcf3, ancvcf4 -- further outgroup VCFs; a false value means
               "not provided"
    window  -- half-width of the position window passed to getVCFcalls

    The header is copied to stdout with any existing AA definition removed
    and new INFO definitions inserted before the #CHROM line.  Behaviour is
    further controlled by the module-level settings removeHR, reportIdx,
    includeSZ, findHotspot, chromprefix, snps, maxindellen and mincallable.

    For each non-SNP record (or every record when `snps` is true) the
    outgroups are queried with getVCFcalls/findMatch/isCallable.  The record
    is polarized (AA set to an allele) when at least `mincallable` outgroups
    are callable, all outgroups agree on whether the variant was found, and
    none reports another allele; otherwise AA=. is written, plus the
    OTHANC/INCANC flags when findHotspot is set.

    Raises AssertionError on records with more than one alternative allele.
    """
    invcf = filez.open(invcf,'r')
    extra_ancfiles = (ancvcf2, ancvcf3, ancvcf4)
    hr_header_prefixes = ('##INFO=<ID=HR', '##INFO=<ID=HU', '##INFO=<ID=TR', '##INFO=<ID=TU')
    hr_field_prefixes = ('HR', 'TR', 'HU', 'TU')
    curchrom = None
    ancestors = []    # one (calls, callablelist) pair per outgroup, None if not provided
    for line in invcf:
        # copy header
        if line.startswith('#'):
            # remove any existing AA definition
            if line.startswith('##INFO=<ID=AA'):
                continue
            if removeHR and line.startswith(hr_header_prefixes):
                continue
            if line.startswith('#CHROM'):
                ## add info fields
                if reportIdx:
                    print('##INFO=<ID=AA, Number=1, Type=Integer, Description="Ancestral Allele (0=reference, 1=first alternative allele, etc.)">')
                else:
                    print('##INFO=<ID=AA, Number=1, Type=String, Description="Ancestral Allele">')
                if includeSZ:
                    print('##INFO=<ID=SZ, Number=1, Type=Integer, Description="Indel size and type (polarized if AA=0,1; with respect to reference if AA=.)">')
                if findHotspot:
                    # Single-quoted literals: a doubled "" inside a double-quoted
                    # literal is string concatenation and drops the quotes.
                    print('##INFO=<ID=OTHANC, Number=0, Type=Flag, Description="Found ancestral alleles other than reference and variant; potential hotspot">')
                    print('##INFO=<ID=INCANC, Number=0, Type=Flag, Description="Found ancestral alleles inconsistent with mutation just on human lineage">')
            print(line.rstrip('\n'))
            continue

        # get data
        elts = line.rstrip('\n').split('\t')
        chrom, pos, varid, ref, altlist, qual, filt, info = (elts + [""])[:8]
        pos = int(pos)
        chrom = chromprefix + chrom
        hr = 1
        kept = []
        for e in info.split(';'):
            if e.startswith('HR') or e.startswith('TR'): hr = max(hr,int(e.split('=')[1]))
            if not e.startswith(hr_field_prefixes):
                kept.append( e )
        if removeHR:
            info = ';'.join(kept)

        # load ancestral chromosome calls if required
        if curchrom != chrom:
            ancestors = [ readVCF( ancvcf, chrom ) ]
            for ancfile in extra_ancfiles:
                ancestors.append( readVCF( ancfile, chrom ) if ancfile else None )
            curchrom = chrom

        # make sure no multiallelics
        assert ',' not in altlist, "multi-allelic records are not supported"

        # do not touch SNP calls; leave any AA call in (but remove HR/TR/TU/HU if requested)
        if (len(ref)==1 and len(altlist) == 1) and not snps:
            print("\t".join([chrom, str(pos), varid, ref, altlist, qual, filt, info] + elts[8:]))
            continue

        # all others, remove any existing AA annotation.  Filtering whole fields
        # avoids cutting into another key that merely ends in "AA" and avoids
        # leaving a dangling ';' when AA was the last field.
        info = ';'.join( e for e in info.split(';') if e != 'AA' and not e.startswith('AA=') )
        if info.endswith(";."): info = info[:-2]

        # Per-record defaults, so that a record whose allele is skipped below
        # never sees unbound names or values left over from the previous record.
        polarized = False
        annotation = "AA=."
        numcallable = 0
        founds = [False] * len(ancestors)
        others = [None] * len(ancestors)

        # make list of alternative alleles, and loop over them
        for idx,alt in enumerate(altlist.split(',')):
            alleleidx = idx+1
            # NOTE(review): `ref` is rebound to the converted value and is what
            # gets printed below, next to the unconverted `altlist` -- confirm
            # this is intended.
            ref, alt = convert(ref,alt)
            typesize = len(alt)-len(ref)
            if max(len(alt), len(ref)) > maxindellen:
                # long deletions (or insertions); we can't deal with them, so report as AA=.
                continue
            if alt == "":
                # <DEL> call -- report as AA=.
                continue

            # get calls in vicinity and find matching calls, for every outgroup
            founds, others, callables = [], [], []
            for anc in ancestors:
                if anc is None:
                    # outgroup not provided: never callable, filled in below
                    founds.append( False )
                    others.append( None )
                    callables.append( False )
                    continue
                calls, callablelist = anc
                anccalls = getVCFcalls( calls, pos-window, pos+window )
                found, other = findMatch( anccalls, pos, typesize, alt, hr )
                founds.append( found )
                others.append( other )
                callables.append( isCallable( pos, callablelist ) )

            numcallable = sum(callables)

            # get status from the first callable outgroup
            found0, other0 = False, None
            for found, other, iscallable in zip(founds, others, callables):
                if iscallable:
                    found0, other0 = found, other
                    break

            # fill in the uncallable states with that status
            for i, iscallable in enumerate(callables):
                if not iscallable:
                    founds[i], others[i] = found0, other0

            if founds[0]:
                sz = -typesize
                aallele = alleleidx if reportIdx else alt
            else:
                sz = typesize
                aallele = 0 if reportIdx else ref

            # if sufficiently many outgroups are callable, and found
            # status is consistent, and no other indels were found, polarize.
            consistent = all( f == founds[0] for f in founds[1:] )
            no_other = all( o is None for o in others )
            if numcallable >= mincallable and consistent and no_other:
                if includeSZ:
                    annotation = "AA=%s;SZ=%s" % (aallele, sz)
                else:
                    annotation = "AA=%s" % (aallele,)
                # done with the loop over alleles
                polarized = True
                break

        flags = []
        if not polarized and numcallable >= mincallable and findHotspot:
            if any( o is not None for o in others ):
                flags.append( "OTHANC" )    # other ancestral alleles than reference and variant -- hotspot!
            if any( f != founds[0] for f in founds[1:] ):
                flags.append( "INCANC" )    # ancestry inconsistent with mutation happening on human branch only -- hotspot, or incomplete lineage sorting

        # make output; an empty or missing ('.') INFO column contributes nothing
        fields = [annotation]
        if info not in ("", "."):
            fields.append( info )
        fields.extend( flags )
        info = ';'.join(fields)
        print("\t".join([chrom, str(pos), varid, ref, altlist, qual, filt, info] + elts[8:]))
Exemplo n.º 5
0
def main():
    """Convert tab-separated alignment records into VCF calls.

    Options: -h/--help, -o/--output FILE, -i/--input FILE,
    -r/--reference FASTA (required), -s MAXSNPS, -n MAXINDELS,
    --sample NAME.  Input defaults to stdin, output to stdout.

    Each input line supplies at least 11 columns (read name, flag,
    chromosome, 1-based position, mapq, CIGAR, mate chromosome, mate
    position, insert size, sequence, qualities).  The CIGAR string is walked
    against the reference: mismatches inside aligned segments become SNP
    calls, I and D operations become indel calls.  Every call starts with
    the HighCallDensity filter; the filter is cleared for all calls of a
    read unless some window of `maxwindow` bp holds more than `maxindels`
    indels or more than `maxsnps` SNPs.
    """
    infile = sys.stdin
    outfile = sys.stdout
    reference = None
    sample = "sample"
    maxsnps = 10
    maxindels = 5
    maxwindow = 1000
    v = vcf.VCF( _fastGT = False )
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:r:s:n:", ["help","output=","input=","reference=","sample="])
    except getopt.GetoptError:
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o in ["-r","--reference"]:
            reference = a
        elif o in ["-s"]:
            maxsnps = int(a)
        elif o in ["-n"]:
            maxindels = int(a)
        elif o in ["--sample"]:
            sample = a

    if not reference:
        print("Reference required")
        help()
        sys.exit()

    # open reference
    fa = pysam.Fastafile( reference )
    v.setreference(fa)

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(41)

    vcfout.getheader().append( ("source",' '.join(sys.argv)))

    vcfout.getinfo()['RN'] = vcf.FORMAT('RN', vcfout.NT_NUMBER, 1, "String", "Name of read supporting the variant", -1)
    # NOTE(review): the filter is registered under 'HighCallDensity' but its
    # FORMAT id is 'mask' -- confirm which name ends up in the header.
    vcfout.getfilter()['HighCallDensity'] = vcf.FORMAT('mask',vcfout.NT_NUMBER,0,"Flag","Number of indels/snps exceeding %s/%s in %s bp window" % (maxindels,maxsnps,maxwindow),".")

    vcfout.setsamples( [sample] )

    vcfout.writeheader( outfile )

    # '=' and 'X' are the explicit match/mismatch forms of 'M'.
    cigar_op = re.compile( "[0-9]*[MIDNSHP=X]" )

    # Read from the selected input (-i), not unconditionally from stdin.
    for data in infile:

        # '@' starts a SAM header line; a read name cannot start with '@'.
        if data.startswith("#") or data.startswith("@"):
            continue

        readname, flag, chrom, pos, mapq, cigar, mchrom, mpos, isize, seq, qual = data.rstrip('\r\n').split('\t')[:11]

        vcflist = []
        template = {'chrom':chrom, 'id':'.', 'qual':100, 'filter': ["HighCallDensity"], 'info': {'RN': [readname]}, 'format': ['GT'], sample:{'GT':["1"]}}

        rpos = int(pos)-1    # 0-based reference position of the next aligned base
        spos = 0             # offset of the next base in the read sequence

        for m in cigar_op.finditer( cigar ):
            token = m.group(0)
            num = int( token[:-1] )
            op = token[-1]
            if op in 'PH':
                # padding / hard clip: consumes neither read nor reference
                pass
            elif op in 'M=X':
                refseq = fa.fetch(chrom,rpos,rpos+num)
                for idx,c in enumerate(refseq):
                    c = c.upper()
                    base = seq[spos+idx]
                    if c != 'N' and base != 'N' and c != base:
                        # found a SNP
                        vcflist.append( dict(template, pos=rpos+idx, ref=c, alt=[base]) )
                spos += num
                rpos += num
            elif op == 'S':
                spos += num
            elif op == 'N':
                # skipped reference region: advance the reference only, so
                # later calls on this read get the right coordinates
                rpos += num
            elif op == 'I':
                anchor = fa.fetch(chrom, rpos-1, rpos)
                vcflist.append( dict(template, pos=rpos-1, ref=anchor, alt=[anchor + seq[spos:spos+num]]) )
                spos += num
            elif op == 'D':
                vcflist.append( dict(template, pos=rpos-1,
                                     ref=fa.fetch(chrom, rpos-1, rpos + num),
                                     alt=[fa.fetch(chrom, rpos-1, rpos)]) )
                rpos += num

        # see if snp or indel density is too high
        windowidx = 0
        for idx,call in enumerate(vcflist):
            while vcflist[windowidx]['pos'] < call['pos'] - maxwindow:
                windowidx += 1
            in_window = vcflist[windowidx:idx+1]
            # ref+alt longer than 2 characters means an indel, exactly 2 a SNP
            indels = sum( 1 for c in in_window if len(c['ref']+c['alt'][0]) > 2 )
            snps = sum( 1 for c in in_window if len(c['ref']+c['alt'][0]) == 2 )
            if indels > maxindels or snps > maxsnps:
                break
        else:
            # no window exceeded the limits: clear the filter on every call
            for call in vcflist:
                call['filter'] = None

        for call in vcflist:
            vcfout.write_data( outfile, call )
Exemplo n.º 6
0
def main():
    """Add a LABELS INFO entry listing, per allele, the samples carrying it.

    Options: -h/--help, -o/--output FILE, -i/--input FILE, -x/--ignore ERROR,
    -3 (read the input as VCF version 33 instead of 40), -F (set the
    parser's _fastGT flag), -m MAXN.  Input defaults to stdin, output to
    stdout.

    For every record the alleles at genotype positions 0 and 2 of each
    sample are inspected; '.' and 0 are skipped, as are samples whose GT has
    length 1.  INFO/LABELS receives one "allele:sample,sample,..." item per
    allele, or "allele:count" when more than MAXN samples carry it.  Alleles
    and sample names are emitted in sorted order so the output is
    reproducible.
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    maxn = 1e10
    fastGT = False
    v = vcf.VCF( leftalign=True, _fastGT=True )
    try:
        # Long options that take a value need a trailing '='; without it
        # getopt treats them as flags and the value is never delivered.
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3Fm:", ["help","output=","input=","ignore="])
    except getopt.GetoptError:
        # Show usage, then propagate: continuing would fail on unbound `opts`.
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o == "-m":
            maxn = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse( infile )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()['LABELS'] = vcf.FORMAT('LABELS',vcfout.NT_UNKNOWN,-1,'String','Non-hom-ref samples','')

    vcfout.writeheader( outfile )

    for data in vcfstream:
        labels = {}    # allele -> set of sample names carrying it
        for s in v.getsamples():
            genotype = data[s].get('GT',['.'])[0]
            if len(genotype) == 1: continue    # haploid, or missing data
            for idx in 0,2:
                g = genotype[idx]
                if g == "." or g == 0: continue
                labels.setdefault(g, set()).add( s )
        report = []
        for g in sorted(labels):
            carriers = labels[g]
            if len(carriers) <= maxn:
                report.append( "%s:%s" % (g,",".join(sorted(carriers))) )
            else:
                report.append( "%s:%s" % (g,len(carriers)) )
        data['info']['LABELS'] = report
        vcfout.write_data( outfile, data )
Exemplo n.º 7
0
def processVCF( invcf, ancvcf, ancvcf2, ancvcf3, ancvcf4, window=50 ):
    """Annotate every record of `invcf` with AA and SZ INFO entries and print it.

    invcf   -- name of the input VCF, opened with filez.open
    ancvcf  -- name of the first outgroup VCF (always loaded)
    ancvcf2, ancvcf3, ancvcf4 -- further outgroup VCFs; a false value means
               "not provided", in which case the result of the first
               outgroup is reused for that slot
    window  -- half-width of the position window passed to getVCFcalls

    The header is copied to stdout with the AA/SZ definitions and a ##source
    line inserted before the #CHROM line.  Uses the module-level settings
    chromprefix, vcf_offset and mincallable.

    A record gets AA=1 (variant found in the outgroups) or AA=0 when at
    least `mincallable` slots are callable, all slots agree on whether the
    variant was found and none reports another allele; otherwise AA=. is
    written.  SZ is the size difference len(alt)-len(ref), negated for AA=1.
    """
    infilename = invcf    # keep the name: `invcf` is rebound to the open file
    invcf = filez.open(invcf,'r')
    extra_ancfiles = (ancvcf2, ancvcf3, ancvcf4)
    curchrom = None
    ancestors = []    # one (calls, callablelist) pair per outgroup, None if not provided
    for line in invcf:
        # copy header
        if line.startswith('#'):
            if line.startswith('#CHROM'):
                ## add info fields
                # Single-quoted literals: a doubled "" inside a double-quoted
                # literal is string concatenation and drops the quotes.
                print('##INFO=<ID=AA, Number=1, Type=String, Description="Ancestral Allele">')
                print('##INFO=<ID=SZ, Number=1, Type=Integer, Description="Indel size and type">')
                print("##source=vcf-indel-polarize.py %s %s %s %s %s" % (infilename, ancvcf, ancvcf2, ancvcf3, ancvcf4))
            print(line.rstrip('\n'))
            continue

        # get data
        elts = line.rstrip('\n').split('\t')
        chrom, pos, varid, ref, alt, qual, filt, info = elts[:8]
        pos = int(pos)
        chrom = chromprefix + chrom
        ref, alt = convert(ref,alt)
        typesize = len(alt)-len(ref)
        if typesize != 0:
            # not a snp -- adjust coordinate
            pos -= vcf_offset
        hr = 1
        for e in info.split(';'):
            if e.startswith('HR') or e.startswith('TR'): hr = max(hr,int(e.split('=')[1]))

        # load ancestral chromosome calls if required
        if curchrom != chrom:
            ancestors = [ readVCF( ancvcf, chrom ) ]
            for ancfile in extra_ancfiles:
                ancestors.append( readVCF( ancfile, chrom ) if ancfile else None )
            curchrom = chrom

        # get calls in vicinity and find matching calls, for every outgroup
        results = []    # (found, other, callable) per slot
        for anc in ancestors:
            if anc is None:
                # NOTE(review): a missing outgroup copies the first outgroup's
                # result, including its callable state, so it counts towards
                # numcallable -- confirm this is intended.
                results.append( results[0] )
                continue
            calls, callablelist = anc
            anccalls = getVCFcalls( calls, pos-window, pos+window )
            found, other = findMatch( anccalls, pos, typesize, hr )
            results.append( (found, other, isCallable( pos, callablelist )) )

        numcallable = sum( iscallable for _found, _other, iscallable in results )

        # get status from the first callable slot
        found0, other0 = False, None
        for found, other, iscallable in results:
            if iscallable:
                found0, other0 = found, other
                break

        # fill in the uncallable states with that status
        founds, others = [], []
        for found, other, iscallable in results:
            if not iscallable:
                found, other = found0, other0
            founds.append( found )
            others.append( other )

        # if sufficiently many outgroups are callable, and found
        # status is consistent, and no other indels were found, polarize.
        consistent = all( f == founds[0] for f in founds[1:] )
        no_other = all( o is None for o in others )
        if numcallable >= mincallable and consistent and no_other:
            if founds[0]:
                info = "AA=1;SZ=%s;%s" % (-typesize, info)
            else:
                info = "AA=0;SZ=%s;%s" % (typesize, info)
        else:
            info = "AA=.;SZ=%s;%s" % (typesize, info)

        # make output
        print("\t".join([chrom, str(pos), varid, ref, alt, qual, filt, info] + elts[8:]))
Exemplo n.º 8
0
    label = None
    description = None
    inversion = 40
    adjust = 0      # hack, to cope with Quang's off-by-one VCFs
    v = vcf.VCF()
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:c:l:d:3", ["help","output","input","ignore","concordance","label","description","quang"])
    except getopt.GetOptError, err:
        print str(err)
        help()
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:            outfile = open(a,'w')
        elif o in ["-i","--input"]:             infile = filez.open(a,'r')
        elif o in ["-c","--concordance"]:       secondaryin = filez.open(a,'r')
        elif o in ["-l","--label"]:             label = a
        elif o in ["-d","--description"]:       description = a
        elif o == "-3":                         inversion = 33
        elif o in ["-x","--ignore"]:            v.ignoreerror(a)
        elif o == "--quang":                    adjust = -1

    if not description or not label or not secondaryin:
        raise ValueError("Need concordance file; label; and description")
    
    # process data
    v.setversion(inversion)
    secondary = vcf.VCF(v)
    vcfstream = v.parse( infile )
Exemplo n.º 9
0
def main():
    """Extract selected columns from a VCF stream into a tab-separated table.

    Options: -h/--help, -o/--output FILE, -i/--input FILE, -x/--ignore ERROR,
    -e/--extract SPEC (required; "*" selects all fields via allfields),
    -3 (read the input as VCF version 33 instead of 40), --keepheader (copy
    the VCF header to the output first), --parsegt (clear the parser's
    _fastGT flag).  Input defaults to stdin, output to stdout.

    SPEC is turned into column descriptors by parse_extract.  Each
    descriptor is indexed as (header name, record key, optional sub-key,
    optional element index).  The header row and all data rows are written
    to the selected output.

    Raises ValueError when a record lacks a requested column, when a sub-key
    is requested from a non-dictionary column, or when a value has an
    unsupported type.
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    extract = None
    keepheader = False
    v = vcf.VCF( _fastGT = True )
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:e:3", ["help","output=","input=","ignore=","extract=","keepheader","parsegt"])
    except getopt.GetoptError:
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o in ["-x","--ignore"]:
            # diagnostic only: keep it off stdout, which may carry the table
            sys.stderr.write("%s\n" % (a,))
            v.ignoreerror(a)
        elif o in ["-e","--extract"]:
            extract = a
        elif o in ["--keepheader"]:
            keepheader = True
        elif o in ["--parsegt"]:
            v._fastGT = False
    if not extract:
        print("Specification string required")
        help()
        sys.exit()

    if extract != "*": columns = parse_extract(extract)

    # process data
    v.setversion(inversion)
    vcfstream = v.parse( infile )

    # copy vcf header
    if keepheader: v.writeheader(outfile)

    # deal with 'all fields'
    if extract == "*": columns = allfields(v)

    # write header line
    outfile.write("\t".join( c[0] for c in columns ) + "\n")

    for data in vcfstream:
        cols = []
        for e in columns:
            name, key, subkey, index = e[0], e[1], e[2], e[3]
            if key not in data:
                raise ValueError("No column found for '%s'" % name)
            col = data[key]
            if key == "pos": col += 1  # use 1-based coordinates
            if subkey is not None:
                if key == "filter":
                    # see if key exists in list
                    col = [subkey] if subkey in col else ["."]
                else:
                    # extract from dictionary
                    if not isinstance(col, dict):
                        raise ValueError("Cannot extract '%s' from column '%s' (value: %r)" % (subkey, key, col))
                    col = col.get(subkey,["."])
                # Allow extraction from the single GT element
                if subkey == "GT": col = col[0]
                if index is not None:
                    col = [col[index]] if len(col) > index else ["."]
            # format the various types
            if isinstance(col, (str, int, float)):
                col = str(col)
            elif isinstance(col, dict):
                if name == "INFO":
                    col = v.format_formatdata( col, v._info, separator=";" )
                else:
                    col = v.format_formatdata( col, v._format, key=False )
            elif isinstance(col, list):
                # change unextracted GT back into text format; other unextracted fields are comma-separated
                if subkey == "GT" and len(col)==3:   col = ''.join(map(str,col))
                elif key == "ref" or key == "alt":   col = ','.join(map(str,col))
                else:                                col = ":".join(map(str,col))
            else:
                raise ValueError("Unexpected type found: %s (value: %r)" % (type(col), col))

            cols.append(col)

        # Rows go to the same destination as the header line, so -o captures
        # the whole table instead of the header only.
        outfile.write("\t".join(cols) + "\n")
Exemplo n.º 10
0
def main():
    """Flag records of a trio VCF whose child genotype violates Mendelian inheritance.

    Options: -h/--help, -o/--output FILE, -i/--input FILE, -x/--ignore ERROR,
    -3 (read the input as VCF version 33 instead of 40), -F (set the
    parser's _fastGT flag), -c/--child LABEL (required), -q MINGQ.  Input
    defaults to stdin, output to stdout.

    For each record, samples with GQ below MINGQ, a GT of length 1, or a '.'
    allele at genotype position 0 or 2 are ignored.  When the child and both
    other samples remain, INFO/MENDELERROR is set unless the child's two
    alleles can be drawn one from each of the other two genotypes.

    Raises ValueError if no child label is given, if the label is not among
    the samples, or if the file does not hold exactly 3 samples.
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    child = None
    mingq = 0
    fastGT = False
    v = vcf.VCF( leftalign=True, _fastGT=True )
    try:
        # Long options that take a value need a trailing '='; without it
        # getopt treats them as flags and the value is never delivered.
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3Fc:q:", ["help","output=","input=","ignore=","child="])
    except getopt.GetoptError:
        # Show usage, then propagate: continuing would fail on unbound `opts`.
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o in ["-c","--child"]:
            child = a
        elif o in ["-q"]:
            mingq = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)

    # check that we have a child
    if child is None: raise ValueError("Need to set child label")

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse( infile )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()['MENDELERROR'] = vcf.FORMAT('MENDELERROR',vcfout.NT_NUMBER,0,'Flag','Non-Mendelian segregation','')

    vcfout.writeheader( outfile )

    for data in vcfstream:
        samples = v.getsamples()
        if child not in samples:
            raise ValueError("Label for child (%s) not found among labels in file (%s)" % (child, ",".join(samples)))
        if len(samples) != 3:
            raise ValueError("Expect exactly 3 samples")
        parent_gts, child_gt = [], None
        for s in samples:
            sampledata = data[s]
            genotype = sampledata.get('GT',['.'])[0]
            gq = sampledata.get('GQ',[0])[0]
            if gq < mingq: continue  # low genotype quality
            if len(genotype) == 1: continue    # haploid, or missing data
            if genotype[0] == "." or genotype[2] == ".": continue  # missing data
            if s == child:
                child_gt = genotype
            else:
                parent_gts.append(genotype)

        # check for mendel error
        if child_gt is not None and len(parent_gts) == 2:
            alleles_a = [parent_gts[0][0], parent_gts[0][2]]
            alleles_b = [parent_gts[1][0], parent_gts[1][2]]
            # consistent when one child allele comes from each of the other two
            # genotypes, in either assignment
            mendelian = ( (child_gt[0] in alleles_a and child_gt[2] in alleles_b) or
                          (child_gt[2] in alleles_a and child_gt[0] in alleles_b) )
            if not mendelian:
                data['info']['MENDELERROR'] = []

        vcfout.write_data( outfile, data )
Exemplo n.º 11
0
def main():
    """Annotate VCF records with reference-sequence context and write them out.

    Parses the command line, opens the reference FASTA given by
    -r/--reference (required), reads VCF records from -i/--input (default
    stdin) and writes them to -o/--output (default stdout).

    Modes, as selected by the options:
      * --region CHROM:START-END: no VCF is processed; the region itself is
        annotated (or palindromes are collected) and the function returns.
      * -m/--mask: each record gets a MASK info field holding the character
        fetched from the reference at the record position.
      * -f/--filter CHAR: the 'mask' filter is appended to records whose
        fetched reference character differs from CHAR.
      * otherwise: HR/HU/TR/TU are added (unless --nohr) together with any
        of the optional SL/DR, PAL, IH, IHW, IR, IER, TV, LC and GC<n>
        fields that were requested.
    --mark FILE,LABEL,DESCRIPTION flags records whose position is listed
    in FILE with an info flag called LABEL.

    Raises:
        getopt.GetoptError: re-raised after printing help when the command
            line cannot be parsed.
        ValueError: when --mark does not have exactly three comma-separated
            parts.

    Exits through sys.exit() on -h/--help, when no reference is given, and
    when --localcalls is combined with stdin input.
    """
    infile = sys.stdin
    infile2 = None
    outfile = sys.stdout
    inversion = 40
    outversion = 40
    tandem = 4
    reference = None
    v = vcf.VCF( _fastGT = False )
    context = 0
    stranded = True
    mask = False
    slippage = False
    indelrate = False
    hotspot = False
    errorrate = False
    region = False
    regionir = 20
    transversion = False
    localcalls = None
    gc = 0
    repwindowsize = 0
    palindrome = 0
    palpos = None
    nohr = False
    mark = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:X:r:3mf:t:F", ["help","output=","input=","ignore=","warn=","reference=","tandem=","addcontext=","destranded","mask","filter=","slippage","gc=","indelrate","hotspot","leftalign","outversion=","region=","errorrate","repwindow=","palindrome=","transversion","localcalls=","nohr","mark=","regionir="])
    except getopt.GetoptError:
        # show usage, then let the parse error propagate with its message
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
            infile2 = filez.open(a,'r')  # second handle on the same file, for localcalls
        elif o == "-3":
            inversion = 33
        elif o == "--outversion":
            outversion = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)
        elif o in ["-X","--warn"]:
            v.warnerror(a)
        elif o in ["-r","--reference"]:
            reference = a
        elif o in ["-t","--tandem"]:
            tandem = int(a)
        elif o in ["-m","--mask"]:
            mask = "mask"
        elif o in ["-f","--filter"]:
            mask = a
        elif o == "--leftalign":
            v._leftalign = True
        elif o == "-F":
            v._fastGT = True
        elif o == "--localcalls":
            # a real list (not a lazy map object): len() is taken of it below
            localcalls = [int(window) for window in a.split(',')]
        elif o == "--addcontext":
            context = int(a)
        elif o == "--destranded":
            stranded = False
        elif o == "--slippage":
            slippage = True
        elif o == "--errorrate":
            errorrate = True
        elif o == "--gc":
            gc = int(a)
        elif o == "--indelrate":
            indelrate = True
        elif o == "--palindrome":
            palindrome = int(a)
        elif o == "--repwindow":
            repwindowsize = int(a)
        elif o == "--hotspot":
            hotspot = True
        elif o == "--transversion":
            transversion = True
        elif o == "--nohr":
            nohr = True
        elif o == "--mark":
            mark = a.split(',')
            # explicit check rather than assert: asserts vanish under -O
            if len(mark) != 3:
                raise ValueError("--mark expects FILE,LABEL,DESCRIPTION, got %r" % a)
        elif o == "--region":
            region = True
            chrom,startend = a.split(':')
            start, end = map(int, startend.split('-'))
        elif o == "--regionir":
            regionir = int(a)

    if not reference:
        print("Reference required")
        help()
        sys.exit()

    if localcalls and not infile2:
        # --localcalls needs a second, independent handle on the input,
        # which only exists when -i/--input was given; stop here instead of
        # carrying on with infile2 = None.
        print("Cannot use stdin with --localcalls")
        sys.exit(1)

    # open reference
    fa = pysam.Fastafile( reference )
    if not mask: v.setreference(fa)

    # annotate region
    if region:
        # NOTE(review): do_palindromes is not assigned anywhere in this
        # function; presumably a module-level flag -- confirm it exists.
        if do_palindromes:
            collect_palindromes(fa, chrom, start, end, tandem, palindrome)
        else:
            annotate_region(fa, chrom, start, end, tandem, indelrate, regionir)
        return

    # read mark data
    if mark: markdata = readmark( mark[0] )

    # process data
    v.setversion(inversion)
    vcfstream = v.parse( infile , parseGenotypes=True)

    if localcalls:
        theLocalcalls = Localcalls( v, infile2, localcalls )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(outversion)
    vcfout.getheader().append( ("source",' '.join(["vcf-add-hr.py"]+sys.argv[1:])) )
    if stranded: destranded = " (stranded)"
    else:        destranded = " (unstranded)"
    if mask:
        if mask == "mask": vcfout.getinfo()['MASK'] = vcf.FORMAT('MASK',vcfout.NT_NUMBER,1,"Character","Mask",".")
        else:              vcfout.getfilter()['mask'] = vcf.FORMAT('mask',vcfout.NT_NUMBER,0,"Flag","Position masked",".")
    else:
        if not nohr:
            vcfout.getinfo()['HR'] = vcf.FORMAT('HR',vcfout.NT_NUMBER,1,"Integer","Homopolymer run length",-1)
            vcfout.getinfo()['HU'] = vcf.FORMAT('HU',vcfout.NT_NUMBER,1,"String","Homopolymer run unit%s" % destranded,-1)
            vcfout.getinfo()['TR'] = vcf.FORMAT('TR',vcfout.NT_NUMBER,1,"Integer","Tandem repeat run length (bp)",-1)
            vcfout.getinfo()['TU'] = vcf.FORMAT('TU',vcfout.NT_NUMBER,1,"String","tandem repeat run unit%s" % destranded,-1)
        if mark:
            vcfout.getinfo()[ mark[1] ] = vcf.FORMAT( mark[1], vcfout.NT_NUMBER,0,"Flag", mark[2], -1)
        if slippage:
            vcfout.getinfo()['SL'] = vcf.FORMAT('SL',vcfout.NT_NR_ALLELES,0,"Character","Indel appears to have been caused by a polymerase slippage event",".")
            vcfout.getinfo()['DR'] = vcf.FORMAT('DR',vcfout.NT_NR_ALLELES,0,"Integer","Length of direct repeat copy of long allele",-1)
        if palindrome > 0:
            vcfout.getinfo()['PAL'] = vcf.FORMAT('PAL',vcfout.NT_NUMBER,1,"Integer","Length of palindromic match between REF and first ALT allele",-1)
        if palindrome < 0:
            vcfout.getinfo()['PAL'] = vcf.FORMAT('PAL',vcfout.NT_NUMBER,1,"Integer","Length of maximum palindromic match on reference strand",-1)
        if hotspot:
            vcfout.getinfo()['IH'] = vcf.FORMAT('IH',vcfout.NT_NUMBER,1,"Character","Indel hotspot",".")
        if transversion:
            vcfout.getinfo()['TV'] = vcf.FORMAT('TV',vcfout.NT_NUMBER,1,"Integer","1 for transversions; 0 for transitions",".")
        if localcalls:
            vcfout.getinfo()['LC'] = vcf.FORMAT('LC',vcfout.NT_NUMBER,len(localcalls),"Integer","Number of local calls, in window(s) of size(s) %s" % ','.join(map(str,localcalls)), ".")
        if repwindowsize>0:
            vcfout.getinfo()['IHW'] = vcf.FORMAT('IHW',vcfout.NT_NUMBER,1,"Character","Indel hotspot in size-%s nt window" % repwindowsize, ".")
        if indelrate:
            vcfout.getinfo()["IR"] = vcf.FORMAT('IR',vcfout.NT_NUMBER,1,"Integer","Model-based location-specific indel rate, expressed as phred score indicating relative increase above base rate; e.g. 10=10x, 20=100x increase",-1)
        if errorrate:
            vcfout.getinfo()["IER"] = vcf.FORMAT('IER',vcfout.NT_NUMBER,1,"Integer","Estimated indel error rate, expressed as a Phred score of errors per read per repeat locus",-1)
        if gc>0:
            gclabel = 'GC'+str(gc)
            vcfout.getinfo()[gclabel] = vcf.FORMAT(gclabel,vcfout.NT_NUMBER,1,"Integer","GC content fraction in %s bp windows, times 1000" % gc,".")
    vcfout.writeheader( outfile )

    # The homopolymer/tandem lookup feeds HR/HU/TR/TU, the error rate, the
    # hotspot category and (through seq) the slippage test, so it has to run
    # whenever any of those was requested -- including --slippage on its own.
    need_repeats = (not nohr) or errorrate or hotspot or slippage

    for data in vcfstream:
        chrom, pos = data['chrom'], data['pos']
        snp = True
        if mark:
            if pos in markdata.get(chrom, ()):
                data['info'][ mark[1] ] = []
        # for indels, skip leading base
        if len(data['ref']) != 1 or sum(len(allele) for allele in data['alt']) != len(data['alt']):
            pos += 1
            snp = False
        if mask:
            m = fa.fetch(data['chrom'],pos,pos+1)
            if mask == "mask": data['info']['MASK'] = [m]
            elif m != mask: data['filter'].append('mask')
        else:
            if need_repeats:
                homopolymer, tandemlen, homopolymerunit, tandemunit, seq = get_homopolymer_and_tandem(chrom, pos, fa, tandem, stranded)
            if not nohr:
                data['info']['HR'] = [homopolymer]
                data['info']['HU'] = [homopolymerunit]
                data['info']['TR'] = [tandemlen]
                data['info']['TU'] = [tandemunit]
            if errorrate:
                rate = get_indel_error_rate( tandemlen, tandemunit )
                data['info']['IER'] = [ rate ]
            if slippage:
                slip = [ getslippage(seq, chrom, pos, fa, data['ref'], alt) for alt in data['alt'] ]
                # sl[0] indexes "NY": a false first element gives "N", a true one "Y"
                data['info']['SL'] = [ "NY"[sl[0]] for sl in slip ]
                data['info']['DR'] = [ sl[1] for sl in slip ]
            if repwindowsize>0:
                category = get_repetitive_window(chrom, pos, fa, repwindowsize, tandem)
                data['info']['IHW'] = [str(category)]
            if palindrome!=0:
                # negative window sizes signals the use of allele-independent palindromes (uses reference only)
                if palindrome > 0:
                    pallen, palpos = get_max_palindrome(chrom, data['pos'], fa, data['ref'], data['alt'][0], palindrome)
                else:
                    pallen, palpos = get_max_palindrome(chrom, data['pos'], fa, "", "", -palindrome)
                data['info']['PAL'] = [pallen]
            if transversion:
                if snp:
                    if data['ref']+data['alt'][0] in ["AG","GA","TC","CT"]:
                        data['info']['TV'] = [0]
                    else:
                        data['info']['TV'] = [1]
            if context!=0:
                seq = fa.fetch(data['chrom'],data['pos']-abs(context),data['pos']+abs(context))
                # upper-case the palindrome match positions on the reference
                if palpos:
                    seq = seq.lower()
                    if palpos > -1:
                        posl, posr = max(0,palpos - (data['pos'] - abs(context))), min(palpos + pallen - (data['pos'] - abs(context)), len(seq))
                        seq = seq[:posl] + seq[posl:posr].upper() + seq[posr:]
                # the fetched context is prepended to the record id; a positive
                # --addcontext also marks the record position with a "."
                if context>0:
                    data['id'] = seq[:context] + "." + seq[context:] + ":" + data['id']
                else:
                    data['id'] = seq + ":" + data['id']
            if gc>0:
                ggcc = get_gc(chrom, pos, fa, gc)
                data['info'][gclabel] = [ggcc]
            if localcalls:
                data['info']['LC'] = theLocalcalls.move( chrom, data['pos'])
            if indelrate:
                indelrateest = get_indelrate(chrom, pos, fa)
                data['info']['IR'] = [indelrateest]
            if hotspot:
                cat = get_hotspot_category( homopolymer, tandemlen, tandemunit )
                data['info']['IH'] = [str(cat)]

        vcfout.write_data( outfile, data )