def main():
    """Print one BED record per data line of a VCF file.

    Command line: ``[options] file.vcf``.  Every emitted record carries
    chrom, start (POS - 1), end (POS) and the VCF ID column, joined by
    tabs.  With ``--dump`` a fifth column holding the remaining VCF
    columns is appended.

    Options:
        --filter    keep only records whose FILTER column equals the value
        --addchr    prefix the chromosome name with 'chr' in the output
        --siteinfo  the VCF has no FORMAT column (eight fixed columns only)
        --dump      append the columns after ID as an extra BED column
        --chr       keep only records on the given chromosome (compared
                    before any 'chr' prefix is added)

    Exits through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--addchr", action="store_true", dest="addchr", help="pre-pend 'chr' to chrom column ", default=False)
    parser.add_option("--siteinfo", action="store_true", dest="siteinfo", help="use if vcf only has site information and lacks FORMAT column")
    parser.add_option("--dump", action="store_true", dest="dump", help="dump everything after teh ID column in the 4th bed column")
    parser.add_option("--chr", type="string", dest="chr", default=None, help="restrct to chromosome number specified by --chr")

    (options, args) = parser.parse_args()
    if not args:
        # Report a usage error instead of failing with a bare IndexError.
        parser.error("a VCF file argument is required")
    vcfilename = args[0]

    # Index of the first column that follows the fixed (and FORMAT) columns.
    first_extra_col = 8 if options.siteinfo else 9

    # The context manager guarantees the handle is closed on every exit path.
    with open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # Consume the ## meta lines and the header line before the records.
        vcfobj.parseMetaAndHeaderLines(vcfh)

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            if options.siteinfo:
                (chrom, pos, rec_id, ref, alt, qual, filtercode, info) = fields[0:8]
            else:
                # Unpacking nine names keeps the requirement that a FORMAT
                # column is present unless --siteinfo was given.
                (chrom, pos, rec_id, ref, alt, qual, filtercode, info, _fmt) = fields[0:9]

            if options.chr is not None and chrom != options.chr:
                continue
            if options.filter is not None and filtercode != options.filter:
                continue
            if options.addchr:
                chrom = 'chr' + chrom

            # BED intervals are zero-based, half-open: [POS - 1, POS).
            start = int(pos) - 1
            end = int(pos)

            columns = [chrom, str(start), str(end), rec_id]
            if options.dump:
                gstrings = ",".join(fields[first_extra_col:])
                # NOTE(review): the dumped columns are concatenated without
                # any separator, exactly as before; confirm whether a
                # delimiter was intended.
                columns.append("".join([ref, alt, qual, filtercode, info, gstrings]))
            print("\t".join(columns))
def main():
    """Print records of a VCF file that overlap (or do not overlap) a second file.

    Command line: ``[options] vcf_file_one vcf|bed_file_two``.  The second
    file is loaded into per-chromosome bitsets; its extension (``bed`` or
    ``vcf``) selects the loader.  Each data line of the first file is
    treated as the interval [POS - 1, POS) and printed when its overlap
    with the bitsets is at least ``--minCols`` bases, or, with ``--v``,
    when it is not.

    Options:
        --minCols      minimum overlap in base pairs (default 1)
        --v            print the records that do NOT overlap
        --filter       keep only records whose FILTER column equals the value
        --info/--type  keep only records whose INFO column matches
                       ``<info>=(<type>)``
        --noheader     the first VCF has no header; nothing is parsed or
                       printed as a header
        --nochrprefix  do not prepend 'chr' to the chromosome before the
                       bitset lookup

    Exits through ``parser.error`` when fewer than two files are given or
    when the second file is neither ``.bed`` nor ``.vcf``.
    """
    usage = "usage: %prog [options] vcf_file_one vcf|bed_file_two\n\nFind regions in the first vcf file that overlap regions of the second vcf or bed file\n"
    parser = OptionParser(usage)
    parser.add_option("--minCols", type="int", dest="mincols", default=1, help="mininum basepair overlap (default is one)")
    parser.add_option("--v", action="store_true", dest="reverse", help="Print regions in first vcf that DO NOT overlap second vcf|bed file")
    parser.add_option("--filter", type="string", dest="filter", default=None, help="intersect records only set with filter (default is None")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file one has no header line", default=False)
    parser.add_option("--nochrprefix", action="store_false", dest="chrprefix", help="use if the bed doesn't have chr prefix in chrom column", default=True)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        # Report a usage error instead of failing with a bare IndexError.
        parser.error("two input files are required: vcf_file_one and vcf|bed_file_two")
    vcf_file_one = args[0]
    in2_fname = args[1]

    # Reject unsupported inputs up front; otherwise `bitsets` would stay
    # unbound and the first lookup would die with a NameError.
    in2_fname_ext = os.path.splitext(in2_fname)[1][1:]
    if in2_fname_ext not in ("bed", "vcf"):
        parser.error("second file must have a .bed or .vcf extension: " + in2_fname)

    sys.stderr.write("intersecting two files ...\n")

    if in2_fname_ext == "bed":
        # NOTE(review): assumes binned_bitsets_from_file reads the handle
        # completely before returning, so it can be closed right away.
        with open(in2_fname) as bedfh:
            bitsets = binned_bitsets_from_file(bedfh)
    else:
        bitsets = binned_bitsets_from_vcffile(in2_fname, options.filter)

    vcfobj = VcfFile(vcf_file_one)
    with open(vcf_file_one, 'r') as vcfh:
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            # A header exists only when it has just been parsed, so it is
            # fetched and echoed only in that case.
            print(vcfobj.returnHeader())

        # Build the variant-type matcher once instead of once per record.
        type_re = None
        if options.variantype is not None:
            type_re = re.compile(options.infotag + '=(' + options.variantype + ')')

        want_overlap = not options.reverse

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, _rec_id, _ref, _alt, _qual, filtercode, info) = fields[0:8]
            start = int(pos) - 1
            end = int(pos)

            # Keep only records carrying the requested FILTER code.
            if options.filter is not None and filtercode != options.filter:
                continue
            # Keep only records of the requested variant type.
            if type_re is not None and type_re.search(info) is None:
                continue

            # The prefix only affects the bitset lookup; the printed line
            # is the unmodified VCF record.
            if options.chrprefix:
                chrom = "chr" + chrom

            overlaps = (chrom in bitsets
                        and bitsets[chrom].count_range(start, end - start) >= options.mincols)
            # Normal mode prints overlapping records, --v the others.
            if overlaps == want_overlap:
                print(dataline)
def main():
    """Filter VCF records by the allele frequency stored in the INFO column.

    Command line: ``[options] file.vcf`` (exactly one positional argument).
    A record is kept when its INFO column carries both the variant-type
    tag (``--variantag``, optionally restricted by ``--variantype``) and a
    numeric allele-frequency tag (``--maftag``) whose value lies in
    ``[--geq, --leq]``.  Kept records are printed to stdout unless
    ``--quiet`` is given, and a tab-separated summary line
    (chrom, pos, id, ref, alt, variant type, tag name, tag value) is
    written to ``freq.log`` in the current directory.

    Unless ``--noheader`` is given, the ## meta lines and the header line
    are parsed first and both tags must be declared in the ##INFO
    headers (the literal tag name ``QUAL`` is exempt); otherwise the
    program exits with status 1.

    NOTE(review): ``--filter`` is accepted but not applied; the FILTER
    check was already disabled in the previous version of this script.
    """
    usage = "usage: %prog [options] maf file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--maftag", type="string", dest="maftag", help="INFO tag id that annotates the allele freq of the record", default="AF")
    parser.add_option("--variantag", type="string", dest="vtag", help="INFO tag that annotates the type of variant type", default="VT")
    parser.add_option("--variantype", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file has no header file", default=False)
    parser.add_option("--quiet", action="store_true", dest="quiet", help="don't print vcf output to stdout", default=False)
    parser.add_option("--leq", type="float", dest="leq", default=1.0, help="keep variants with AF <= (default 1)")
    parser.add_option("--geq", type="float", dest="geq", default=0.0, help="keep variants with AF >= (default 0)")

    (options, args) = parser.parse_args()
    if len(args) != 1:
        sys.stderr.write(usage + "\n")
        sys.exit(1)
    vcfilename = args[0]

    has_header = not options.noheader

    # Both handles are closed on every exit path, including the early
    # sys.exit calls below.
    with open('freq.log', 'w') as freqfh, open(vcfilename, 'r') as vcfh:
        vcfobj = VcfFile(vcfilename)
        # Parse the meta information lines (the ones that begin with ##).
        if has_header:
            vcfobj.parseMetaLines(vcfh)

        descriptors = vcfobj.getMetaInfoDescription()
        infoids = [tag for (tag, _description) in descriptors]

        # Both tags must be declared in the ##INFO headers when there are any.
        for wanted in (options.maftag, options.vtag):
            if has_header and wanted != 'QUAL' and wanted not in infoids:
                sys.stderr.write(wanted + " tag not in ##INFO headers!\n")
                sys.exit(1)

        if has_header:
            vcfobj.parseHeaderLine(vcfh)

        # The tag may be the last INFO entry, which has no trailing ';',
        # so accept either ';' or the end of the column after the value.
        if options.variantype is None:
            variant_re = re.compile(options.vtag + r'=(\w+)(?:;|$)')
        else:
            variant_re = re.compile(options.vtag + '=(' + options.variantype + r')(?:;|$)')

        # Accept any plain decimal or exponent number (0.05, 1, 1.0, 5e-04).
        # The former pattern '=(0.\d+)' had an unescaped dot and silently
        # dropped records whose frequency was 1 or written with an exponent.
        maf_re = re.compile(options.maftag + r'=(\d*\.?\d+(?:[eE][-+]?\d+)?)')

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, rec_id, ref, alt, _qual, _filtercode, info) = fields[0:8]

            # Skip records lacking either tag; search each pattern once.
            variant_match = variant_re.search(info)
            if variant_match is None:
                continue
            maf_match = maf_re.search(info)
            if maf_match is None:
                continue

            variant_type = variant_match.group(1)
            maf_value = maf_match.group(1)
            frequency = float(maf_value)

            if options.geq <= frequency <= options.leq:
                if not options.quiet:
                    print(dataline)
                # The log keeps the frequency exactly as written in the VCF.
                logstring = "\t".join([chrom, pos, rec_id, ref, alt, variant_type, options.maftag, maf_value])
                freqfh.write(logstring + '\n')
def main():
    """Print the records of a VCF file that belong to one variant class.

    Command line: ``[options] file.vcf``.  When ``--type`` is given, a
    record is printed only if its INFO column matches
    ``<--info tag>=(<--type value>)``; without ``--type`` every record
    that passes the optional ``--filter`` check is printed.

    Unless ``--noheader`` is given, the meta and header lines are parsed
    and echoed first, and the ``--info`` tag must be declared in the
    ##INFO headers (the literal name ``QUAL`` is exempt); otherwise the
    program exits with status 1.  With ``--noheader`` the header steps
    are skipped and reading starts directly at the data lines.

    Exits with status 1 when ``--info`` or ``--type`` is given an empty
    value, and through ``parser.error`` when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf\n print records belonging to a certain type of variant class (e.g. SNP) in a VCF file\n\n"
    parser = OptionParser(usage)
    parser.add_option(
        "--info",
        type="string",
        dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE",
    )
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option(
        "--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None
    )
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file has no header file")

    (options, args) = parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        sys.exit(1)
    if options.variantype == "":
        sys.stderr.write("provide a value of --type parameter!\n")
        sys.exit(1)
    if not args:
        # Report a usage error instead of failing with a bare IndexError.
        parser.error("a VCF file argument is required")
    vcfilename = args[0]

    # The handle is closed on every exit path, including sys.exit below.
    with open(vcfilename, "r") as vcfh:
        vcfobj = VcfFile(vcfilename)

        # --noheader used to be parsed and then ignored; it now skips the
        # header parsing, the header echo and the ##INFO tag validation.
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            vcfobj.printMetaAndHeaderLines()

            descriptors = vcfobj.getMetaInfoDescription()
            infoids = [tag for (tag, _description) in descriptors]
            if options.infotag not in infoids and options.infotag != "QUAL":
                sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
                sys.exit(1)

        # Build the variant-type matcher once instead of once per record.
        type_re = None
        if options.variantype is not None:
            type_re = re.compile(options.infotag + "=(" + options.variantype + ")")

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split("\t")
            (_chrom, _pos, _rec_id, _ref, _alt, _qual, filtercode, info) = fields[0:8]

            # Keep only records carrying the requested FILTER code.
            if options.filter is not None and filtercode != options.filter:
                continue
            # Keep only records of the requested variant type.
            if type_re is not None and type_re.search(info) is None:
                continue
            print(dataline)