def readVCF( filename, chrom ):
    """Load the calls and callable intervals for one chromosome of a VCF.

    Only lines that start with ``chrom`` followed by a tab are used.  A record
    whose INFO column has an entry starting with ``CALLABLE`` is treated as a
    callable interval running from its ``START`` to its ``END`` entry; every
    other record is turned into an ``Indel``.

    Args:
        filename: path handed to ``filez.open``.
        chrom: chromosome name exactly as written in the first VCF column.

    Returns:
        ``(vcf, callablelist)``: the sorted list of ``Indel`` objects, and a
        flat list ``[start0, end0, start1, end1, ...]`` in which an interval
        starting right after the previous one ends is merged into it.
    """
    vcf = []
    callablelist = []
    sys.stderr.write("#### Loading vcf for chromosome %s\n" % chrom)
    prefix = chrom + "\t"
    reader = io.BufferedReader( filez.open(filename,'r') )
    try:
        for line in reader:
            if line.startswith('#') or not line.startswith(prefix):
                continue
            # Strip the line terminator only; slicing off the last character
            # would damage a final line that has no trailing newline.
            elts = line.rstrip('\r\n').split('\t')
            _chrom, pos, _ident, ref, alt, _qual, _filt, info = elts[:8]
            start, end, is_callable, hr = -1, -1, False, 1
            for e in info.split(';'):
                if e.startswith('START'):
                    start = int(e.split('=')[1])
                if e.startswith('END'):
                    end = int(e.split('=')[1])
                if e.startswith('CALLABLE'):
                    is_callable = True
                if e.startswith('HR') or e.startswith('TR'):
                    hr = max(hr, int(e.split('=')[1]))
            if is_callable:
                if callablelist and callablelist[-1] + 1 == start:
                    # adjacent to the previous interval: extend it
                    callablelist[-1] = end
                else:
                    callablelist.append( start )
                    callablelist.append( end )
            else:
                vcf.append( Indel(int(pos), ref, alt, start, end, hr) )
    finally:
        reader.close()
    sys.stderr.write("#### Loaded  %s  calls\n" % len(vcf))
    vcf.sort()
    return vcf, callablelist
def readmark( filename ):
    """Read a two-column ``chrom<TAB>pos`` file of marked positions.

    Args:
        filename: path handed to ``filez.open``.

    Returns:
        A dict mapping each chromosome name to the set of its positions,
        converted from the file's 1-based coordinates to 0-based.

    Raises:
        ValueError: if a non-empty line does not hold exactly two
            tab-separated columns, or the position is not an integer.
    """
    data = {}
    handle = filez.open(filename,'r')
    try:
        for line in handle:
            # Strip the terminator rather than slicing, so a last line
            # without a newline keeps its final digit.
            line = line.rstrip('\r\n')
            if not line:
                continue
            chrom, pos = line.split('\t')
            data.setdefault(chrom, set()).add(int(pos) - 1)   # convert into 0-based coords
    finally:
        handle.close()
    return data
def main():
    """Add an ``AC`` INFO entry (per-allele counts) to every record of a VCF.

    Command-line options:
        -h, --help      show usage and exit
        -o, --output    output file (default: stdout)
        -i, --input     input file, opened with ``filez.open`` (default: stdin)
        -x, --ignore    error label passed to ``VCF.ignoreerror``
        -3              set the input version to 33 instead of 40
        -F              switch the parser's ``_fastGT`` flag on
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    fastGT = False
    v = vcf.VCF(leftalign=True, _fastGT=True)
    try:
        # Long options that take a value need the trailing "=".
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3F",
                                   ["help", "output=", "input=", "ignore="])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        help()
        sys.exit(2)
    for o, a in opts:
        if o in ["-h", "--help"]:
            help()
            sys.exit()
        elif o in ["-o", "--output"]:
            outfile = open(a, "w")
        elif o in ["-i", "--input"]:
            infile = filez.open(a, "r")
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o in ["-x", "--ignore"]:
            v.ignoreerror(a)

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse(infile)

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()["AC"] = vcf.FORMAT("AC", vcfout.NT_ALLELES, -1, "Integer", "Allele counts", -1)
    vcfout.writeheader(outfile)

    for data in vcfstream:
        samples = v.getsamples()
        ac = {}
        for s in samples:
            genotype = data[s].get("GT", ["."])[0]
            if len(genotype) == 1:
                continue    # haploid, or missing data
            # a diploid genotype is indexed as [allele, separator, allele]
            for idx in 0, 2:
                allele = genotype[idx]
                if allele == ".":
                    continue
                ac[allele] = ac.get(allele, 0) + 1
        if ac:
            # one count per allele, reference first
            data["info"]["AC"] = [ac.get(i, 0) for i in range(len(data["alt"]) + 1)]
        vcfout.write_data(outfile, data)
def processVCF( invcf, ancvcf, ancvcf2, ancvcf3, ancvcf4, window=50 ):
    """Polarize the calls of a VCF against up to four outgroup VCFs.

    The input is copied to stdout with an ``AA`` INFO entry (and optionally
    ``SZ``, ``OTHANC``, ``INCANC``) added to every record that is not passed
    through as a SNP.  Behaviour is steered by the module-level settings
    ``removeHR``, ``reportIdx``, ``includeSZ``, ``findHotspot``,
    ``chromprefix``, ``snps``, ``maxindellen`` and ``mincallable``.

    Args:
        invcf: path of the VCF to annotate (opened with ``filez.open``).
        ancvcf: path of the first outgroup VCF (always loaded).
        ancvcf2, ancvcf3, ancvcf4: further outgroup VCFs; a false value
            means "not supplied".
        window: half-width, in bp, of the region searched for outgroup calls.

    Raises:
        AssertionError: if a record lists more than one ALT allele.
    """
    infile = filez.open(invcf,'r')
    write = sys.stdout.write
    hr_keys = ('HR', 'TR', 'HU', 'TU')
    hr_headers = tuple('##INFO=<ID=' + key for key in hr_keys)
    curchrom = None
    outgroups = []      # one (calls, callable intervals) pair per loaded outgroup

    for line in infile:
        line = line.rstrip('\n')

        # copy header
        if line.startswith('#'):
            # remove any existing AA definition
            if line.startswith('##INFO=<ID=AA'):
                continue
            if removeHR and line.startswith(hr_headers):
                continue
            if line.startswith('#CHROM'):
                ## add info fields
                if reportIdx:
                    write('##INFO=<ID=AA, Number=1, Type=Integer, Description="Ancestral Allele (0=reference, 1=first alternative allele, etc.)">\n')
                else:
                    write('##INFO=<ID=AA, Number=1, Type=String, Description="Ancestral Allele">\n')
                if includeSZ:
                    write('##INFO=<ID=SZ, Number=1, Type=Integer, Description="Indel size and type (polarized if AA=0,1; with respect to reference if AA=.)">\n')
                if findHotspot:
                    # Single-quoted so the Description keeps its double
                    # quotes; a doubled "" inside a double-quoted literal
                    # is just string concatenation and drops them.
                    write('##INFO=<ID=OTHANC, Number=0, Type=Flag, Description="Found ancestral alleles other than reference and variant; potential hotspot">\n')
                    write('##INFO=<ID=INCANC, Number=0, Type=Flag, Description="Found ancestral alleles inconsistent with mutation just on human lineage">\n')
            write(line + "\n")
            continue

        # get data
        elts = line.split('\t')
        chrom, pos, ident, ref, altlist, qual, filt, info = (elts + [""])[:8]
        pos = int(pos)
        chrom = chromprefix + chrom
        fields = info.split(';')
        hr = 1
        for e in fields:
            if e.startswith('HR') or e.startswith('TR'):
                hr = max(hr, int(e.split('=')[1]))
        if removeHR:
            info = ';'.join(e for e in fields if not e.startswith(hr_keys))

        # load ancestral chromosome calls if required
        if curchrom != chrom:
            outgroups = [readVCF( ancvcf, chrom )]
            outgroups += [readVCF( f, chrom ) for f in (ancvcf2, ancvcf3, ancvcf4) if f]
            curchrom = chrom

        # make sure no biallelics
        assert altlist.find(',') == -1

        # do not touch SNP calls; leave any AA call in (but remove HR/TR/TU/HU if requested)
        if len(ref) == 1 and len(altlist) == 1 and not snps:
            write("\t".join([chrom, str(pos), ident, ref, altlist, qual, filt, info] + elts[8:]) + "\n")
            continue

        # all others, remove any existing AA annotation (exact key match, so
        # a different key that merely starts with "AA" is left alone)
        info = ';'.join(e for e in info.split(';') if not e.startswith('AA='))

        # Defaults for records whose only allele is skipped below; without
        # them the hotspot test would read values left over from the
        # previous record (or fail on the first record).
        polarized = False
        numcallable = 0
        any_other = False
        inconsistent = False

        # make list of alternative alleles, and loop over them
        for idx, alt in enumerate(altlist.split(',')):
            alleleidx = idx + 1
            # NOTE(review): `ref` is rebound to convert()'s result and is
            # what gets printed below, next to the untouched `altlist` --
            # confirm that this is intended.
            ref, alt = convert(ref, alt)
            typesize = len(alt) - len(ref)
            if max(len(alt), len(ref)) > maxindellen:
                # long deletions (or insertions); we can't deal with them, so report as AA=.
                continue
            if alt == "":
                # <DEL> call -- report as AA=.
                continue

            # get calls in vicinity and find matching calls, per outgroup
            status = []
            for calls, intervals in outgroups:
                anccalls = getVCFcalls( calls, pos-window, pos+window )
                match, extra = findMatch( anccalls, pos, typesize, alt, hr )
                status.append( (isCallable( pos, intervals ), match, extra) )
            numcallable = sum(s[0] for s in status)

            # Outgroups that are absent or not callable here take the status
            # of the first callable one, so only callable ones can disagree.
            usable = [(match, extra) for is_callable, match, extra in status if is_callable]
            found = usable[0][0] if usable else False
            any_other = any(extra is not None for match, extra in usable)
            inconsistent = any(match != found for match, extra in usable)

            if found:
                sz = -typesize
                aallele = alleleidx if reportIdx else alt
            else:
                sz = typesize
                aallele = 0 if reportIdx else ref

            # if sufficiently many outgroups are callable, and found
            # status is consistent, and no other indels were found, polarize.
            if numcallable >= mincallable and not inconsistent and not any_other:
                if includeSZ:
                    info = "AA=%s;SZ=%s;%s" % (aallele, sz, info)
                else:
                    info = "AA=%s;%s" % (aallele, info)
                # done with the loop over alleles
                polarized = True
                break

        if not polarized:
            info = "AA=.;%s" % info
            if numcallable >= mincallable and findHotspot:
                if any_other:
                    info += ";OTHANC"   # other ancestral alleles than reference and variant -- hotspot!
                if inconsistent:
                    info += ";INCANC"   # ancestry inconsistent with mutation happening on human branch only -- hotspot, or incomplete lineage sorting

        # make output
        if info.endswith(";."):
            info = info[:-2]
        write("\t".join([chrom, str(pos), ident, ref, altlist, qual, filt, info] + elts[8:]) + "\n")
def main():
    """Turn SAM alignment lines into VCF calls, one record per mismatch/indel.

    Every read is compared with the reference along its CIGAR string.  If
    the SNP or indel count within any ``maxwindow``-bp window of a read
    exceeds its limit, all calls of that read keep the ``HighCallDensity``
    filter; otherwise the filter is cleared.

    Command-line options:
        -h, --help        show usage and exit
        -o, --output      output file (default: stdout)
        -i, --input       input file, opened with ``filez.open`` (default: stdin)
        -r, --reference   reference FASTA (required)
        -s                maximum number of SNPs per window (default 10)
        -n                maximum number of indels per window (default 5)
        --sample          sample name used in the output (default "sample")
    """
    infile = sys.stdin
    outfile = sys.stdout
    reference = None
    sample = "sample"
    maxsnps = 10
    maxindels = 5
    maxwindow = 1000
    v = vcf.VCF( _fastGT = False )
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:r:s:n:",
                                   ["help","output=","input=","reference=","sample="])
    except getopt.GetoptError:
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o in ["-r","--reference"]:
            reference = a
        elif o in ["-s"]:
            maxsnps = int(a)
        elif o in ["-n"]:
            maxindels = int(a)
        elif o in ["--sample"]:
            sample = a

    if not reference:
        sys.stdout.write("Reference required\n")
        help()
        sys.exit()

    # open reference
    fa = pysam.Fastafile( reference )
    v.setreference(fa)

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(41)
    vcfout.getheader().append( ("source",' '.join(sys.argv)))
    vcfout.getinfo()['RN'] = vcf.FORMAT('RN', vcfout.NT_NUMBER, 1, "String", "Name of read supporting the variant", -1)
    # Registered under the ID the records actually carry in their filter list.
    vcfout.getfilter()['HighCallDensity'] = vcf.FORMAT('HighCallDensity',vcfout.NT_NUMBER,0,"Flag","Number of indels/snps exceeding %s/%s in %s bp window" % (maxindels,maxsnps,maxwindow),".")
    vcfout.setsamples( [sample] )
    vcfout.writeheader( outfile )

    cigar_op = re.compile(r"([0-9]+)([MIDNSHP=X])")

    # Read from the selected input, not unconditionally from stdin.
    for data in infile:
        if data.startswith("#"):
            continue
        readname, flag, chrom, pos, mapq, cigar, mchrom, mpos, isize, seq, qual = data.rstrip('\n').split('\t')[:11]
        vcflist = []
        vcfdata = {'chrom':chrom, 'id':'.', 'qual':100, 'filter': ["HighCallDensity"],
                   'info': {'RN': [readname]}, 'format': ['GT'], sample:{'GT':["1"]}}
        rpos = int(pos)-1       # 0-based position on the reference
        spos = 0                # position within the read sequence
        for m in cigar_op.finditer( cigar ):
            num = int( m.group(1) )
            op = m.group(2)
            if op in 'PH':
                # padding / hard clip: consumes neither read nor reference
                pass
            elif op in 'M=X':
                refseq = fa.fetch(chrom,rpos,rpos+num)
                for idx, c in enumerate(refseq):
                    c = c.upper()
                    base = seq[spos+idx]
                    if c != 'N' and base != 'N' and c != base:
                        # found a SNP
                        vcfdata['pos'] = rpos + idx
                        vcfdata['ref'] = c
                        vcfdata['alt'] = [base]
                        vcflist.append( vcfdata.copy() )
                spos += num
                rpos += num
            elif op == 'S':
                spos += num
            elif op == 'N':
                # skipped region: consumes reference only
                rpos += num
            elif op == 'I':
                anchor = fa.fetch(chrom, rpos-1, rpos)
                vcfdata['pos'] = rpos - 1
                vcfdata['ref'] = anchor
                vcfdata['alt'] = [anchor + seq[spos:spos+num]]
                vcflist.append( vcfdata.copy() )
                spos += num
            elif op == 'D':
                vcfdata['pos'] = rpos - 1
                vcfdata['ref'] = fa.fetch(chrom, rpos-1, rpos + num)
                vcfdata['alt'] = [fa.fetch(chrom, rpos-1, rpos)]
                vcflist.append( vcfdata.copy() )
                rpos += num

        # see if snp or indel density is too high
        windowidx = 0
        for idx, call in enumerate(vcflist):
            while vcflist[windowidx]['pos'] < call['pos'] - maxwindow:
                windowidx += 1
            in_window = vcflist[windowidx:idx+1]
            # ref plus first alt is two characters long for a SNP, longer for an indel
            indels = len( [ c for c in in_window if len(c['ref']+c['alt'][0]) > 2 ] )
            snps = len( [ c for c in in_window if len(c['ref']+c['alt'][0]) == 2 ] )
            if indels > maxindels or snps > maxsnps:
                break
        else:
            # no window exceeded its limits: clear the filter on every call
            for call in vcflist:
                call['filter'] = None

        for call in vcflist:
            vcfout.write_data( outfile, call )
def main():
    """Add a ``LABELS`` INFO entry naming the samples that carry each allele.

    For every allele other than ``.`` and ``0`` seen in a diploid genotype,
    the entry holds ``allele:sample1,sample2,...`` or, when more than ``-m``
    samples carry it, ``allele:count``.

    Command-line options:
        -h, --help      show usage and exit
        -o, --output    output file (default: stdout)
        -i, --input     input file, opened with ``filez.open`` (default: stdin)
        -x, --ignore    error label passed to ``VCF.ignoreerror``
        -3              set the input version to 33 instead of 40
        -F              switch the parser's ``_fastGT`` flag on
        -m              largest number of samples listed by name
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    maxn = 1e10
    fastGT = False
    v = vcf.VCF( leftalign=True, _fastGT=True )
    try:
        # Long options that take a value need the trailing "=".
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3Fm:",
                                   ["help","output=","input=","ignore="])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        help()
        sys.exit(2)
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o == "-m":
            maxn = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse( infile )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()['LABELS'] = vcf.FORMAT('LABELS',vcfout.NT_UNKNOWN,-1,'String','Non-hom-ref samples','')
    vcfout.writeheader( outfile )

    for data in vcfstream:
        samples = v.getsamples()
        labels = {}
        for s in samples:
            genotype = data[s].get('GT',['.'])[0]
            if len(genotype) == 1:
                continue    # haploid, or missing data
            # a diploid genotype is indexed as [allele, separator, allele]
            for idx in 0, 2:
                g = genotype[idx]
                if g == "." or g == 0:
                    continue
                labels.setdefault(g, set()).add( s )
        report = []
        # sorted so that the output does not depend on set/dict ordering
        for g in sorted(labels):
            carriers = labels[g]
            if len(carriers) <= maxn:
                report.append( "%s:%s" % (g, ",".join(sorted(carriers))) )
            else:
                report.append( "%s:%s" % (g, len(carriers)) )
        data['info']['LABELS'] = report
        vcfout.write_data( outfile, data )
def processVCF( invcf, ancvcf, ancvcf2, ancvcf3, ancvcf4, window=50 ):
    """Polarize the calls of a VCF against up to four outgroup VCFs.

    The input is copied to stdout with ``AA`` and ``SZ`` INFO entries
    prepended to every record.  REF/ALT are passed through ``convert`` and,
    for calls whose REF and ALT differ in length, the position is shifted
    by the module-level ``vcf_offset``.  Also reads the module-level
    ``chromprefix`` and ``mincallable``.

    Args:
        invcf: path of the VCF to annotate (opened with ``filez.open``).
        ancvcf: path of the first outgroup VCF (always loaded).
        ancvcf2, ancvcf3, ancvcf4: further outgroup VCFs; a false value
            means "not supplied".
        window: half-width, in bp, of the region searched for outgroup calls.
    """
    infile = filez.open(invcf,'r')
    write = sys.stdout.write
    absent = sum(1 for f in (ancvcf2, ancvcf3, ancvcf4) if not f)
    curchrom = None
    outgroups = []      # one (calls, callable intervals) pair per loaded outgroup

    for line in infile:
        line = line.rstrip('\n')

        # copy header
        if line.startswith('#'):
            if line.startswith('#CHROM'):
                ## add info fields
                # Single-quoted so the Descriptions keep their double quotes;
                # a doubled "" inside a double-quoted literal is just string
                # concatenation and drops them.
                write('##INFO=<ID=AA, Number=1, Type=String, Description="Ancestral Allele">\n')
                write('##INFO=<ID=SZ, Number=1, Type=Integer, Description="Indel size and type">\n')
                # report the input's file name, not the open file object
                write("##source=vcf-indel-polarize.py %s %s %s %s %s\n" % (invcf, ancvcf, ancvcf2, ancvcf3, ancvcf4))
            write(line + "\n")
            continue

        # get data
        elts = line.split('\t')
        chrom, pos, ident, ref, alt, qual, filt, info = elts[:8]
        pos = int(pos)
        chrom = chromprefix + chrom
        ref, alt = convert(ref, alt)
        typesize = len(alt) - len(ref)
        if typesize != 0:
            # not a snp -- adjust coordinate
            pos -= vcf_offset
        hr = 1
        for e in info.split(';'):
            if e.startswith('HR') or e.startswith('TR'):
                hr = max(hr, int(e.split('=')[1]))

        # load ancestral chromosome calls if required
        if curchrom != chrom:
            outgroups = [readVCF( ancvcf, chrom )]
            outgroups += [readVCF( f, chrom ) for f in (ancvcf2, ancvcf3, ancvcf4) if f]
            curchrom = chrom

        # get calls in vicinity and find matching calls, per outgroup
        status = []
        for calls, intervals in outgroups:
            anccalls = getVCFcalls( calls, pos-window, pos+window )
            match, extra = findMatch( anccalls, pos, typesize, hr )
            status.append( (isCallable( pos, intervals ), match, extra) )

        # An outgroup that was not supplied counts as a copy of the first
        # one, including its callable flag.
        # NOTE(review): this lets a single outgroup satisfy mincallable > 1;
        # confirm that is intended.
        numcallable = sum(s[0] for s in status) + absent * status[0][0]

        # Outgroups that are not callable here take the status of the first
        # callable one, so only callable ones can disagree.
        usable = [(match, extra) for is_callable, match, extra in status if is_callable]
        found = usable[0][0] if usable else False
        any_other = any(extra is not None for match, extra in usable)
        inconsistent = any(match != found for match, extra in usable)

        # if sufficiently many outgroups are callable, and found
        # status is consistent, and no other indels were found, polarize.
        if numcallable >= mincallable and not inconsistent and not any_other:
            if found:
                info = "AA=1;SZ=%s;%s" % (-typesize, info)
            else:
                info = "AA=0;SZ=%s;%s" % (typesize, info)
        else:
            info = "AA=.;SZ=%s;%s" % (typesize, info)

        # make output
        write("\t".join([chrom, str(pos), ident, ref, alt, qual, filt, info] + elts[8:]) + "\n")
label = None description = None inversion = 40 adjust = 0 # hack, to cope with Quang's off-by-one VCFs v = vcf.VCF() try: opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:c:l:d:3", ["help","output","input","ignore","concordance","label","description","quang"]) except getopt.GetOptError, err: print str(err) help() for o, a in opts: if o in ["-h","--help"]: help() sys.exit() elif o in ["-o","--output"]: outfile = open(a,'w') elif o in ["-i","--input"]: infile = filez.open(a,'r') elif o in ["-c","--concordance"]: secondaryin = filez.open(a,'r') elif o in ["-l","--label"]: label = a elif o in ["-d","--description"]: description = a elif o == "-3": inversion = 33 elif o in ["-x","--ignore"]: v.ignoreerror(a) elif o == "--quang": adjust = -1 if not description or not label or not secondaryin: raise ValueError("Need concordance file; label; and description") # process data v.setversion(inversion) secondary = vcf.VCF(v) vcfstream = v.parse( infile )
def main():
    """Extract selected columns of a VCF into a tab-separated table.

    The column specification is parsed by ``parse_extract`` (or, for ``*``,
    produced by ``allfields``).  Each entry is indexed as
    ``(header label, record key, sub-key or None, index or None)``.

    Command-line options:
        -h, --help      show usage and exit
        -o, --output    output file (default: stdout)
        -i, --input     input file, opened with ``filez.open`` (default: stdin)
        -x, --ignore    error label passed to ``VCF.ignoreerror``
        -e, --extract   column specification (required)
        -3              set the input version to 33 instead of 40
        --keepheader    copy the VCF header to the output first
        --parsegt       switch the parser's ``_fastGT`` flag off

    Raises:
        ValueError: if a requested column is missing from a record, a sub-key
            is requested from a column that is not a dictionary, or a value
            has a type that cannot be formatted.
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    extract = None
    keepheader = False
    v = vcf.VCF( _fastGT = True )
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:e:3",
                                   ["help","output=","input=","ignore=","extract=","keepheader","parsegt"])
    except getopt.GetoptError:
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o in ["-x","--ignore"]:
            # diagnostic goes to stderr so it cannot end up in the table
            sys.stderr.write("%s\n" % a)
            v.ignoreerror(a)
        elif o in ["-e","--extract"]:
            extract = a
        elif o in ["--keepheader"]:
            keepheader = True
        elif o in ["--parsegt"]:
            v._fastGT = False

    if not extract:
        sys.stdout.write("Specification string required\n")
        help()
        sys.exit()

    if extract != "*":
        columns = parse_extract(extract)

    # process data
    v.setversion(inversion)
    vcfstream = v.parse( infile )

    # copy vcf header
    if keepheader:
        v.writeheader(outfile)

    # deal with 'all fields'
    if extract == "*":
        columns = allfields(v)

    # write header line
    outfile.write("\t".join( c[0] for c in columns ) + "\n")

    for data in vcfstream:
        cols = []
        for e in columns:
            if e[1] not in data:
                raise ValueError("No column found for '%s'" % e[0])
            col = data[e[1]]
            if e[1] == "pos":
                col += 1    # use 1-based coordinates
            if e[2] is not None:
                if e[1] == "filter":
                    # see if key exists in list
                    col = [e[2]] if e[2] in col else ["."]
                else:
                    # extract from dictionary
                    if not isinstance(col, dict):
                        raise ValueError("Cannot extract '%s' from column '%s' (value: %r)" % (e[2], e[1], col))
                    col = col.get(e[2], ["."])
                    # Allow extraction from the single GT element
                    if e[2] == "GT":
                        col = col[0]
                # NOTE(review): an index is only applied together with a
                # sub-key here -- confirm against parse_extract.
                if e[3] is not None:
                    col = [col[e[3]]] if len(col) > e[3] else ["."]

            # format the various types
            if isinstance(col, (str, int, float)):
                col = str(col)
            elif isinstance(col, dict):
                if e[0] == "INFO":
                    col = v.format_formatdata( col, v._info, separator=";" )
                else:
                    col = v.format_formatdata( col, v._format, key=False )
            elif isinstance(col, list):
                # change unextracted GT back into text format; ref/alt are
                # comma-separated, other lists colon-separated
                if e[2] == "GT" and len(col) == 3:
                    col = ''.join(map(str, col))
                elif e[1] == "ref" or e[1] == "alt":
                    col = ','.join(map(str, col))
                else:
                    col = ":".join(map(str, col))
            else:
                raise ValueError("Unexpected type found: %s (value: %r)" % (type(col), col))
            cols.append(col)
        # rows go to the same destination as the header line
        outfile.write("\t".join(cols) + "\n")
def main():
    """Flag records of a trio VCF whose genotypes break Mendelian inheritance.

    A record gets the ``MENDELERROR`` INFO flag when the child and both other
    samples have complete diploid genotypes with ``GQ`` of at least ``-q``,
    and the child's two alleles cannot be drawn one from each parent.

    Command-line options:
        -h, --help      show usage and exit
        -o, --output    output file (default: stdout)
        -i, --input     input file, opened with ``filez.open`` (default: stdin)
        -x, --ignore    error label passed to ``VCF.ignoreerror``
        -c, --child     sample label of the child (required)
        -q              minimum genotype quality (default 0)
        -3              set the input version to 33 instead of 40
        -F              switch the parser's ``_fastGT`` flag on

    Raises:
        ValueError: if no child label is given, the label is not among the
            samples, or the file does not hold exactly three samples.
    """
    infile = sys.stdin
    outfile = sys.stdout
    inversion = 40
    child = None
    mingq = 0
    fastGT = False
    v = vcf.VCF( leftalign=True, _fastGT=True )
    try:
        # Long options that take a value need the trailing "=".
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:3Fc:q:",
                                   ["help","output=","input=","ignore=","child="])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        help()
        sys.exit(2)
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
        elif o == "-3":
            inversion = 33
        elif o == "-F":
            fastGT = True
        elif o in ["-c","--child"]:
            child = a
        elif o in ["-q"]:
            mingq = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)

    # check that we have a child
    if child is None:
        raise ValueError("Need to set child label")

    # process data
    v.setversion(inversion)
    v._fastGT = fastGT
    vcfstream = v.parse( infile )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(40)
    vcfout.getinfo()['MENDELERROR'] = vcf.FORMAT('MENDELERROR',vcfout.NT_NUMBER,0,'Flag','Non-Mendelian segregation','')
    vcfout.writeheader( outfile )

    for data in vcfstream:
        samples = v.getsamples()
        if child not in samples:
            raise ValueError("Label for child (%s) not found among labels in file (%s)" % (child, ",".join(samples)))
        if len(samples) != 3:
            raise ValueError("Expect exactly 3 samples")

        parent_gts, child_gt = [], None
        for s in samples:
            sampledata = data[s]
            genotype = sampledata.get('GT',['.'])[0]
            gq = sampledata.get('GQ',[0])[0]
            if gq < mingq:
                continue    # low genotype quality
            if len(genotype) == 1:
                continue    # haploid, or missing data
            if genotype[0] == "." or genotype[2] == ".":
                continue    # missing data
            if s == child:
                child_gt = genotype
            else:
                parent_gts.append(genotype)

        # check for mendel error, only when all three genotypes are usable
        if child_gt is not None and len(parent_gts) == 2:
            first = (parent_gts[0][0], parent_gts[0][2])
            second = (parent_gts[1][0], parent_gts[1][2])
            consistent = ((child_gt[0] in first and child_gt[2] in second) or
                          (child_gt[2] in first and child_gt[0] in second))
            if not consistent:
                data['info']['MENDELERROR'] = []

        vcfout.write_data( outfile, data )
def main():
    """Annotate a VCF with sequence-context INFO fields from a reference.

    Depending on the options, records receive homopolymer/tandem-repeat
    fields (HR, HU, TR, TU), a mask annotation or filter, slippage (SL, DR),
    palindrome (PAL), hotspot (IH, IHW), transversion (TV), local-call (LC),
    indel-rate (IR), error-rate (IER), GC-content and mark fields, and
    optionally the surrounding sequence in the ID column.  With ``--region``
    no VCF is read: the region is annotated directly and the function
    returns.

    Command-line options:
        -h, --help          show usage and exit
        -o, --output        output file (default: stdout)
        -i, --input         input file, opened with ``filez.open`` (default: stdin)
        -r, --reference     reference FASTA (required)
        -x, --ignore        error label passed to ``VCF.ignoreerror``
        -X, --warn          error label passed to ``VCF.warnerror``
        -3                  set the input version to 33 instead of 40
        --outversion        output version (default 40)
        -t, --tandem        integer passed on to the repeat helpers (default 4)
        -m, --mask          add a MASK entry holding the reference base
        -f, --filter        add the 'mask' filter where the reference base differs from the argument
        -F                  switch the parser's ``_fastGT`` flag on
        --leftalign         switch the parser's ``_leftalign`` flag on
        --addcontext        add this many bases of context to the ID column
        --destranded        pass ``stranded=False`` to the repeat helper
        --slippage, --errorrate, --indelrate, --hotspot, --transversion,
        --nohr              switch individual annotations on (``--nohr``: HR/HU/TR/TU off)
        --gc, --repwindow, --palindrome, --regionir
                            integer parameters of the respective annotations
        --localcalls        comma-separated window sizes
        --mark              ``file,label,description`` of positions to flag
        --region            ``chrom:start-end``; annotate the region and return

    Raises:
        ValueError: if ``--mark`` does not have exactly three comma-separated parts.
    """
    infile = sys.stdin
    infile2 = None
    outfile = sys.stdout
    inversion = 40
    outversion = 40
    tandem = 4
    reference = None
    v = vcf.VCF( _fastGT = False )
    context = 0
    stranded = True
    mask = False
    slippage = False
    indelrate = False
    hotspot = False
    errorrate = False
    region = False
    regionir = 20
    transversion = False
    localcalls = None
    gc = 0
    repwindowsize = 0
    palindrome = 0
    palpos = None
    nohr = False
    mark = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ho:i:x:X:r:3mf:t:F",
                                   ["help","output=","input=","ignore=","warn=","reference=","tandem=","addcontext=","destranded","mask","filter=","slippage","gc=","indelrate","hotspot","leftalign","outversion=","region=","errorrate","repwindow=","palindrome=","transversion","localcalls=","nohr","mark=","regionir="])
    except getopt.GetoptError:
        help()
        raise
    for o, a in opts:
        if o in ["-h","--help"]:
            help()
            sys.exit()
        elif o in ["-o","--output"]:
            outfile = open(a,'w')
        elif o in ["-i","--input"]:
            infile = filez.open(a,'r')
            infile2 = filez.open(a,'r')     # for localcalls
        elif o == "-3":
            inversion = 33
        elif o == "--outversion":
            outversion = int(a)
        elif o in ["-x","--ignore"]:
            v.ignoreerror(a)
        elif o in ["-X","--warn"]:
            v.warnerror(a)
        elif o in ["-r","--reference"]:
            reference = a
        elif o in ["-t","--tandem"]:
            tandem = int(a)
        elif o in ["-m","--mask"]:
            mask = "mask"
        elif o in ["-f","--filter"]:
            mask = a
        elif o in ["--leftalign"]:
            v._leftalign = True
        elif o in ["-F"]:
            v._fastGT = True
        elif o in ["--localcalls"]:
            localcalls = [int(size) for size in a.split(',')]
        elif o in ["--addcontext"]:
            context = int(a)
        elif o in ["--destranded"]:
            stranded = False
        elif o in ["--slippage"]:
            slippage = True
        elif o in ["--errorrate"]:
            errorrate = True
        elif o in ["--gc"]:
            gc = int(a)
        elif o in ["--indelrate"]:
            indelrate = True
        elif o in ["--palindrome"]:
            palindrome = int(a)
        elif o in ["--repwindow"]:
            repwindowsize = int(a)
        elif o in ["--hotspot"]:
            hotspot = True
        elif o in ["--transversion"]:
            transversion = True
        elif o in ["--nohr"]:
            nohr = True
        elif o in ["--mark"]:
            mark = a.split(',')
            # explicit check: an assert would vanish under "python -O"
            if len(mark) != 3:
                raise ValueError("--mark expects file,label,description")
        elif o in ["--region"]:
            region = True
            chrom, startend = a.split(':')
            start, end = map(int, startend.split('-'))
        elif o in ["--regionir"]:
            regionir = int(a)

    if not reference:
        sys.stdout.write("Reference required\n")
        help()
        sys.exit()

    if localcalls and not infile2:
        # the second pass needs a re-openable file; stop here instead of
        # failing later when the second stream is used
        sys.stdout.write("Cannot use stdin with --localcalls\n")
        sys.exit(1)

    # open reference
    fa = pysam.Fastafile( reference )
    if not mask:
        v.setreference(fa)

    # annotate region
    if region:
        # NOTE(review): do_palindromes is not assigned in this function;
        # presumably a module-level flag -- confirm.
        if do_palindromes:
            collect_palindromes(fa, chrom, start, end, tandem, palindrome)
        else:
            annotate_region(fa, chrom, start, end, tandem, indelrate, regionir)
        return

    # read mark data
    if mark:
        markdata = readmark( mark[0] )

    # process data
    v.setversion(inversion)
    vcfstream = v.parse( infile , parseGenotypes=True)
    if localcalls:
        theLocalcalls = Localcalls( v, infile2, localcalls )

    # instantiate vcfout from v, to include header and definitions.
    vcfout = vcf.VCF(v)
    vcfout.setversion(outversion)
    vcfout.getheader().append( ("source",' '.join(["vcf-add-hr.py"]+sys.argv[1:])) )
    if stranded:
        destranded = " (stranded)"
    else:
        destranded = " (unstranded)"
    if mask:
        if mask == "mask":
            vcfout.getinfo()['MASK'] = vcf.FORMAT('MASK',vcfout.NT_NUMBER,1,"Character","Mask",".")
        else:
            vcfout.getfilter()['mask'] = vcf.FORMAT('mask',vcfout.NT_NUMBER,0,"Flag","Position masked",".")
    else:
        if not nohr:
            vcfout.getinfo()['HR'] = vcf.FORMAT('HR',vcfout.NT_NUMBER,1,"Integer","Homopolymer run length",-1)
            vcfout.getinfo()['HU'] = vcf.FORMAT('HU',vcfout.NT_NUMBER,1,"String","Homopolymer run unit%s" % destranded,-1)
            vcfout.getinfo()['TR'] = vcf.FORMAT('TR',vcfout.NT_NUMBER,1,"Integer","Tandem repeat run length (bp)",-1)
            vcfout.getinfo()['TU'] = vcf.FORMAT('TU',vcfout.NT_NUMBER,1,"String","tandem repeat run unit%s" % destranded,-1)
    if mark:
        vcfout.getinfo()[ mark[1] ] = vcf.FORMAT( mark[1], vcfout.NT_NUMBER,0,"Flag", mark[2], -1)
    if slippage:
        vcfout.getinfo()['SL'] = vcf.FORMAT('SL',vcfout.NT_NR_ALLELES,0,"Character","Indel appears to have been caused by a polymerase slippage event",".")
        vcfout.getinfo()['DR'] = vcf.FORMAT('DR',vcfout.NT_NR_ALLELES,0,"Integer","Length of direct repeat copy of long allele",-1)
    if palindrome > 0:
        vcfout.getinfo()['PAL'] = vcf.FORMAT('PAL',vcfout.NT_NUMBER,1,"Integer","Length of palindromic match between REF and first ALT allele",-1)
    if palindrome < 0:
        vcfout.getinfo()['PAL'] = vcf.FORMAT('PAL',vcfout.NT_NUMBER,1,"Integer","Length of maximum palindromic match on reference strand",-1)
    if hotspot:
        vcfout.getinfo()['IH'] = vcf.FORMAT('IH',vcfout.NT_NUMBER,1,"Character","Indel hotspot",".")
    if transversion:
        vcfout.getinfo()['TV'] = vcf.FORMAT('TV',vcfout.NT_NUMBER,1,"Integer","1 for transversions; 0 for transitions",".")
    if localcalls:
        vcfout.getinfo()['LC'] = vcf.FORMAT('LC',vcfout.NT_NUMBER,len(localcalls),"Integer","Number of local calls, in window(s) of size(s) %s" % ','.join(map(str,localcalls)), ".")
    if repwindowsize>0:
        vcfout.getinfo()['IHW'] = vcf.FORMAT('IHW',vcfout.NT_NUMBER,1,"Character","Indel hotspot in size-%s nt window" % repwindowsize, ".")
    if indelrate:
        vcfout.getinfo()["IR"] = vcf.FORMAT('IR',vcfout.NT_NUMBER,1,"Integer","Model-based location-specific indel rate, expressed as phred score indicating relative increase above base rate; e.g. 10=10x, 20=100x increase",-1)
    if errorrate:
        vcfout.getinfo()["IER"] = vcf.FORMAT('IER',vcfout.NT_NUMBER,1,"Integer","Estimated indel error rate, expressed as a Phred score of errors per read per repeat locus",-1)
    if gc>0:
        gclabel = 'GC'+str(gc)
        vcfout.getinfo()[gclabel] = vcf.FORMAT(gclabel,vcfout.NT_NUMBER,1,"Integer","GC content fraction in %s bp windows, times 1000" % gc,".")

    vcfout.writeheader( outfile )

    for data in vcfstream:
        chrom, pos = data['chrom'], data['pos']
        snp = True

        if mark:
            if pos in markdata.get(chrom, []):
                data['info'][ mark[1] ] = []

        # for indels, skip leading base
        if len(data['ref']) != 1 or sum(len(a) for a in data['alt']) != len(data['alt']):
            pos += 1
            snp = False

        if mask:
            m = fa.fetch(data['chrom'],pos,pos+1)
            if mask == "mask":
                data['info']['MASK'] = [m]
            elif m != mask:
                data['filter'].append('mask')
        else:
            # slippage is included because it reads `seq` from this call
            if (not nohr) or errorrate or hotspot or slippage:
                homopolymer, tandemlen, homopolymerunit, tandemunit, seq = get_homopolymer_and_tandem(chrom, pos, fa, tandem, stranded)
            if not nohr:
                data['info']['HR'] = [homopolymer]
                data['info']['HU'] = [homopolymerunit]
                data['info']['TR'] = [tandemlen]
                data['info']['TU'] = [tandemunit]

        # NOTE(review): errorrate, slippage and hotspot read values that are
        # computed only in the non-mask branch above; combining them with
        # --mask/--filter is presumably unsupported -- confirm.
        if errorrate:
            rate = get_indel_error_rate( tandemlen, tandemunit )
            data['info']['IER'] = [ rate ]

        if slippage:
            slip = [ getslippage(seq, chrom, pos, fa, data['ref'], alt) for alt in data['alt'] ]
            data['info']['SL'] = [ "NY"[sl[0]] for sl in slip ]
            data['info']['DR'] = [ sl[1] for sl in slip ]

        if repwindowsize>0:
            category = get_repetitive_window(chrom, pos, fa, repwindowsize, tandem)
            data['info']['IHW'] = [str(category)]

        if palindrome!=0:
            # negative window sizes signals the use of allele-independent palindromes (uses reference only)
            if palindrome > 0:
                pallen, palpos = get_max_palindrome(chrom, data['pos'], fa, data['ref'], data['alt'][0], palindrome)
            else:
                pallen, palpos = get_max_palindrome(chrom, data['pos'], fa, "", "", -palindrome)
            data['info']['PAL'] = [pallen]

        if transversion:
            if snp:
                if data['ref']+data['alt'][0] in ["AG","GA","TC","CT"]:
                    data['info']['TV'] = [0]
                else:
                    data['info']['TV'] = [1]

        if context!=0:
            left = data['pos'] - abs(context)
            seq = fa.fetch(data['chrom'], left, data['pos']+abs(context))
            # upper-case the palindrome match positions on the reference
            # NOTE(review): this truth test also skips a match position of 0;
            # confirm whether "is not None" was meant.
            if palpos:
                seq = seq.lower()
                if palpos > -1:
                    posl = max(0, palpos - left)
                    posr = min(palpos + pallen - left, len(seq))
                    seq = seq[:posl] + seq[posl:posr].upper() + seq[posr:]
            if context>0:
                data['id'] = seq[:context] + "." + seq[context:] + ":" + data['id']
            else:
                data['id'] = seq + ":" + data['id']

        if gc>0:
            ggcc = get_gc(chrom, pos, fa, gc)
            data['info'][gclabel] = [ggcc]

        if localcalls:
            data['info']['LC'] = theLocalcalls.move( chrom, data['pos'])

        if indelrate:
            indelrateest = get_indelrate(chrom, pos, fa)
            data['info']['IR'] = [indelrateest]

        if hotspot:
            cat = get_hotspot_category( homopolymer, tandemlen, tandemunit )
            data['info']['IH'] = [str(cat)]

        vcfout.write_data( outfile, data )