def run(args):
    """Annotate the variants of a .vcf file with scores from a .bed file.

    Reads the score names and regions from ``args.bedfile`` via ``sniffBed``,
    copies ``args.infile`` to ``args.outfile``, declares one ``##INFO`` pragma
    per score name just before the ``#`` column header line, and, for every
    variant contained in a region, stores that region's score in the
    variant's INFO field.

    args -- object with ``bedfile``, ``names``, ``infile`` and ``outfile``
            attributes.
    """
    scoreNames, bedRegions = sniffBed(args.bedfile)
    if args.names is not None:
        # NOTE(review): difference_update() *removes* the names listed in
        # args.names from the set of scores to add. If args.names is meant to
        # be an include-list, this should be intersection_update() -- confirm
        # against the command line documentation.
        scoreNames.difference_update(args.names)

    # Keep only the regions whose score is still wanted. (The previous code
    # unpacked each region as an (index, region) pair without enumerate() and
    # then deleted by index while the list was shrinking.)
    bedRegions = [r for r in bedRegions if r.name in scoreNames]

    takenTags = set()
    # Maps a .bed score name to the INFO tag declared for it in the header;
    # the two differ when the name collided with an existing INFO ID.
    tagForName = {}

    with open(args.infile, 'r') as vcffile, open(args.outfile, 'w') as outfile:
        for line in vcffile:
            if len(line) <= 1:
                continue
            elif line.startswith('##'):
                if line.startswith("##INFO"):
                    # Remember the IDs already in use so ours don't collide.
                    newTag = line[line.find("ID=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    takenTags.add(newTag)
                outfile.write(line)
            elif line.startswith('#'):
                # Column header line: emit our pragmas right before it.
                # sorted() only makes the pragma order reproducible.
                for n in sorted(scoreNames):
                    dupCount = 2
                    newTag = n
                    while newTag in takenTags:
                        newTag = n + str(dupCount)
                        dupCount += 1
                    takenTags.add(newTag)
                    tagForName[n] = newTag
                    outfile.write("##INFO=<ID=%s,Number=.,Type=Float,Description=\"User column added with addBEDtoVCF.py\">\n" % newTag)
                outfile.write(line)
            else:
                line = vcfLine(line.strip().split('\t'))
                line.extractChrAndPos()
                for b in bedRegions:
                    if b.contains(line.chromosome, line.position):
                        line.extractInfo()
                        # Write under the tag that was actually declared.
                        line.info[tagForName.get(b.name, b.name)] = str(b.score)
                outfile.write(str(line))
def run(args):
    """Copy a .vcf file, keeping only a selected subset of its INFO fields.

    A first pass (``extractInfoFields``) collects details on every INFO field
    in ``args.infile``. A field is kept unless:

    * it is flagged ``maxedOut`` and ``args.max_strings`` is positive,
    * ``args.preserve_info`` is given and does not contain it, or
    * ``args.remove_info`` is given and contains it.

    The second pass writes ``args.outfile`` without the ``##INFO`` pragmas and
    the per-variant INFO entries of the rejected fields.

    Raises Exception if the second pass meets an ``##INFO`` pragma the first
    pass did not record.
    """
    print('Counting values...')
    max_strings = args.max_strings
    ignoreStringCounts = False
    if max_strings <= 0:
        # A non-positive limit means "don't filter on the number of strings".
        ignoreStringCounts = True
        max_strings = MAX_INFO_STRINGS

    infoFields = extractInfoFields(args.infile, max_strings)

    validFields = set()
    for k, f in infoFields.items():
        if not ignoreStringCounts and f.maxedOut:
            continue
        if args.preserve_info is not None and k not in args.preserve_info:
            continue
        if args.remove_info is not None and k in args.remove_info:
            continue
        validFields.add(k)

    print('Writing file...')
    # "with" closes both files even when the exception below is raised.
    with open(args.outfile, 'w') as outfile, open(args.infile, 'r') as infile:
        for line in infile:
            if len(line) <= 1:
                continue
            elif line.startswith("##INFO"):
                newTag = line[line.find("ID=")+3:]
                newTag = newTag[:newTag.find(',')]
                if newTag not in infoFields:
                    raise Exception("Second pass lost info tag:\t%s" % newTag)
                if newTag in validFields:
                    outfile.write(line)
            elif line.startswith("#"):
                outfile.write(line)
            else:
                line = vcfLine(line.strip().split('\t'))
                line.extractInfo()
                # Snapshot the keys: entries are deleted while looping.
                for k in list(line.info.keys()):
                    if k not in validFields:
                        del line.info[k]
                outfile.write(str(line))
def extractInfoFields(path,max_strings=MAX_INFO_STRINGS,tickFunction=None,numTicks=1000):
    """Scan a (possibly gzipped) .vcf file and collect details per INFO field.

    path         -- file to read; opened with gzip when the name ends in 'gz'.
    max_strings  -- passed to every ``infoDetails`` object that is created.
    tickFunction -- optional progress callback, called with no arguments at
                    most once per line, whenever the read position has passed
                    the next tick mark. A false return value aborts the scan.
    numTicks     -- number of tick marks the file size is divided into.

    Returns a dict mapping each INFO ID to its ``infoDetails`` object, or
    None when ``tickFunction`` aborted the scan.

    Raises Exception on a duplicate ``##INFO`` ID, or when a variant uses an
    INFO key that has no ``##INFO`` pragma.
    """
    infoFields = {}
    tickInterval = os.path.getsize(path)/numTicks
    nextTick = 0
    if path.endswith('gz'):
        infile = gzip.open(path,'rb')
    else:
        infile = open(path,'rb')

    # try/finally so the file is also closed when one of the exceptions
    # below is raised (previously it was only closed on the normal paths).
    try:
        for line in infile:
            if tickFunction is not None and infile.tell() > nextTick:
                if not tickFunction():
                    return None
                nextTick += tickInterval

            line = line.strip()
            if len(line) <= 1:
                continue
            elif line.startswith("##INFO"):
                newTag = line[line.find("ID=")+3:]
                newTag = newTag[:newTag.find(',')]
                if newTag in infoFields:
                    raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                infoFields[newTag] = infoDetails(newTag, max_strings, False)
            elif line.startswith("#"):
                continue
            else:
                line = vcfLine(line.split('\t'))
                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    infoFields[k].addArbitraryValue(v)
    finally:
        infile.close()
    return infoFields
def __init__(self, line):
    """Classify one raw .vcf line and record what is needed to order it.

    Sets ``self.type`` to one of the ``vcfKey`` constants (EMPTY, FIRSTLINE,
    CONTIG, OTHER_META, HEADER, REGULAR). Additionally sets:

    * ``self.chromosome`` for CONTIG lines (from the pragma's ID),
    * ``self.line`` (the lower-cased text) for OTHER_META lines,
    * ``self.chromosome`` and ``self.position`` for REGULAR lines.
    """
    if len(line.strip()) == 0:
        self.type = vcfKey.EMPTY
    elif line.startswith('##'):
        # Match keywords case-insensitively, but slice the ID out of the
        # original text. (Searching the lower-cased text for 'ID=' could
        # never succeed, so the contig ID used to come out as garbage.)
        lowered = line.lower()
        if lowered.startswith('##fileformat'):
            self.type = vcfKey.FIRSTLINE
        elif lowered.startswith('##contig'):
            self.type = vcfKey.CONTIG
            contigID = line[lowered.find('id=') + 3:]
            # The ID ends at the next attribute or at the closing bracket.
            end = contigID.find(',')
            if end < 0:
                end = contigID.find('>')
            if end >= 0:
                contigID = contigID[:end]
            self.chromosome = standardizeChromosome(contigID.strip())
        else:
            self.type = vcfKey.OTHER_META
            self.line = lowered
    elif line.startswith('#'):
        self.type = vcfKey.HEADER
    else:
        self.type = vcfKey.REGULAR
        temp = vcfLine(line.strip().split('\t'))
        temp.extractChrAndPos()
        self.chromosome = temp.chromosome
        self.position = temp.position
def __init__(self, line):
    """Classify one raw .vcf line and record what is needed to order it.

    Sets ``self.type`` to one of the ``vcfKey`` constants (EMPTY, FIRSTLINE,
    CONTIG, OTHER_META, HEADER, REGULAR). Additionally sets:

    * ``self.chromosome`` for CONTIG lines (from the pragma's ID),
    * ``self.line`` (the lower-cased text) for OTHER_META lines,
    * ``self.chromosome`` and ``self.position`` for REGULAR lines.
    """
    if len(line.strip()) == 0:
        self.type = vcfKey.EMPTY
    elif line.startswith("##"):
        # Match keywords case-insensitively, but slice the ID out of the
        # original text. (Searching the lower-cased text for "ID=" could
        # never succeed, so the contig ID used to come out as garbage.)
        lowered = line.lower()
        if lowered.startswith("##fileformat"):
            self.type = vcfKey.FIRSTLINE
        elif lowered.startswith("##contig"):
            self.type = vcfKey.CONTIG
            contigID = line[lowered.find("id=") + 3 :]
            # The ID ends at the next attribute or at the closing bracket.
            end = contigID.find(",")
            if end < 0:
                end = contigID.find(">")
            if end >= 0:
                contigID = contigID[:end]
            self.chromosome = standardizeChromosome(contigID.strip())
        else:
            self.type = vcfKey.OTHER_META
            self.line = lowered
    elif line.startswith("#"):
        self.type = vcfKey.HEADER
    else:
        self.type = vcfKey.REGULAR
        temp = vcfLine(line.strip().split("\t"))
        temp.extractChrAndPos()
        self.chromosome = temp.chromosome
        self.position = temp.position
def run(args):
    """Convert a .vcf file into a tab-delimited .cvf file.

    Pass 1 reads ``args.infile`` to learn the chromosomes and their extents
    and to feed every Ref/Alt, QUAL, FILTER and INFO value to an
    ``infoDetails`` collector. Pass 2 writes ``args.outfile``: pragma lines
    describing the chromosomes, positions and every column, one header row,
    then one row per variant.

    args -- object with ``infile``, ``outfile``, ``max_strings``,
            ``ignore_fields`` (iterable or None), and the string flags
            ``separate_info_fields`` and ``count_separate`` ("true"/other).

    Raises Exception on a duplicate/reserved ``##INFO`` ID and when a variant
    uses an INFO key that has no ``##INFO`` pragma.
    """
    def pragmaValue(text, key):
        # Return the value following `key` (matched case-insensitively) up to
        # the next ',' or '>', or None when the key is absent.
        start = text.lower().find(key)
        if start < 0:
            return None
        value = text[start + len(key):]
        for terminator in (',', '>'):
            end = value.find(terminator)
            if end >= 0:
                value = value[:end]
        return value.strip()

    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"
    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    posLengthWarned = False
    allChrs = []
    positions = []

    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {"Ref/Alt":alleleColumn,"QUAL":qualColumn,"FILTER":filterColumn}

    # TODO: get the numeric ranges, all valid categorical values
    with open(args.infile,'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            elif line.startswith("#"):
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag in infoFields:
                        raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings, countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif temp.startswith("##filter"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    filterColumn.addCategory(newTag, 0)
                elif temp.startswith("##contig"):
                    # Previously the ID was sliced out of `newTag` instead of
                    # the contig text, and the length was read 3 characters
                    # after the start of "length=" instead of after its end.
                    chrom = pragmaValue(line, "id=")
                    if chrom is None:
                        continue    # malformed pragma: nothing usable in it
                    chrom = standardizeChromosome(chrom)
                    chrLength = pragmaValue(line, "length=")
                    allChrs.append(chrom)
                    # A contig without a length starts at 0 and grows with
                    # the variants seen on it.
                    positions.append((0, int(chrLength) if chrLength else 0))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
            else:
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                # Register chromosomes that had no ##contig pragma. (The test
                # used to be inverted, so unknown chromosomes were never
                # added and the index() below failed.)
                if line.chromosome not in allChrs:
                    allChrs.append(line.chromosome)
                    positions.append((0,0))
                chrIndex = allChrs.index(line.chromosome)
                if line.position > positions[chrIndex][1]:
                    positions[chrIndex] = (0,line.position)
                    if not posLengthWarned:
                        sys.stderr.write('WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.')
                        sys.stderr.write(' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.')
                        posLengthWarned = True

                line.extractAlleles()
                alleles = line.alleles
                if not separateInfoFields:
                    alleles = ",".join(alleles)
                # Pass the (possibly joined) value, mirroring FILTER below;
                # the joined string used to be computed and then discarded.
                alleleColumn.addArbitraryValue(alleles)

                line.extractQual()
                qualColumn.addArbitraryValue(line.qual)

                line.extractFilters()
                filters = line.filters
                if not separateInfoFields:
                    filters = ",".join(filters)
                filterColumn.addArbitraryValue(filters)

                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    # Split string values on commas (the receiver and the
                    # argument of split() used to be swapped). Values that
                    # are not strings are passed through untouched.
                    if separateInfoFields and hasattr(v, 'split'):
                        v = v.split(",")
                    infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" % (args.outfile,args.infile,str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" % ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        headers = []
        fieldOrder = sorted(infoFields.keys())
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated item of a pragma is its column header.
                headers.append(p.split("\t")[1])
        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        with open(args.infile,'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                line.extractInfo()
                # Treat the fixed columns like INFO entries so that one loop
                # can write everything in fieldOrder.
                line.extractAlleles()
                line.info["Ref/Alt"] = line.alleles
                line.extractQual()
                line.info["QUAL"] = str(line.qual)
                line.extractFilters()
                line.info["FILTER"] = line.filters

                outfile.write("%s\t%i\t%s" % (line.chromosome,line.position,line.name))
                for f in fieldOrder:
                    # NOTE(review): raises KeyError when a variant lacks a
                    # declared INFO field; the placeholder the .cvf format
                    # expects is not visible here -- confirm before changing.
                    values = line.info[f]
                    if isinstance(values,list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")
def run(args):
    """Split the variants of a .vcf file with a user-supplied expression.

    The first line of the file ``args.expression`` is a %-format template
    (default "True"); the file ``args.columns`` lists, one per line, the
    column or INFO names whose values fill the template for every variant.
    The filled-in template is evaluated and the variant is written to

    * ``args.outfile``  when the result equals True,
    * ``args.failfile`` when the result equals False, or when ``args.bed`` is
      given and no region in it contains the variant,
    * ``args.errfile``  when the result is anything else or evaluation raises.

    ``failfile``, ``errfile``, ``expression``, ``columns`` and ``bed`` are
    skipped when given as the empty string. Header lines are copied to every
    open output file.
    """
    openedFiles = []

    def openTracked(path, mode):
        # Open a file and remember it so the finally clause can close it.
        handle = open(path, mode)
        openedFiles.append(handle)
        return handle

    # try/finally closes every file on every path; failfile used to be left
    # open, and nothing was closed when an exception escaped.
    try:
        infile = openTracked(args.infile, "r")
        outfile = openTracked(args.outfile, "w")
        failfile = openTracked(args.failfile, "w") if args.failfile != "" else None
        errfile = openTracked(args.errfile, "w") if args.errfile != "" else None

        if args.expression != "":
            with open(args.expression, "r") as tempfile:
                expression = tempfile.readline()
        else:
            expression = "True"

        columns = []
        if args.columns != "":
            with open(args.columns, "r") as tempfile:
                columns = [c.strip() for c in tempfile]

        bedRegions = None
        if args.bed != "":
            with open(args.bed, "r") as tempfile:
                bedRegions = [bedLine(b.split()) for b in tempfile]

        for line in infile:
            if len(line) <= 1:
                continue
            if line.startswith("#"):
                outfile.write(line)
                if failfile is not None:
                    failfile.write(line)
                if errfile is not None:
                    errfile.write(line)
                continue

            line = vcfLine(line.strip().split("\t"))
            expArgs = []
            for c in columns:
                if c == "CHROM":
                    line.extractChrAndPos()
                    expArgs.append(line.chromosome)
                elif c == "POS":
                    line.extractChrAndPos()
                    expArgs.append(line.position)
                elif c == "ID":
                    line.extractChrAndPos()
                    expArgs.append(line.name)
                elif c == "QUAL":
                    line.extractQual()
                    expArgs.append(line.qual)
                elif c == "FILTER":
                    line.extractFilters()
                    expArgs.append(line.filters)
                else:
                    line.extractInfo()
                    expArgs.append(line.info.get(c, "."))

            # first see if it fails the .bed regions
            if bedRegions is not None:
                line.extractChrAndPos()
                passedBed = False
                for bed in bedRegions:
                    if bed.contains(line.chromosome, line.position):
                        passedBed = True
                        break
                if not passedBed:
                    if failfile is not None:
                        failfile.write(str(line))
                    continue

            exp = expression % tuple(expArgs)
            try:
                # SECURITY: eval() executes arbitrary code taken from the
                # expression file and from values in the .vcf file; only use
                # this tool on trusted input.
                result = eval(exp)
            except Exception:
                # Was a bare "except:", which also swallowed KeyboardInterrupt
                # and SystemExit.
                if errfile is not None:
                    errfile.write(str(line))
                continue

            if result == True:
                outfile.write(str(line))
            elif result == False:
                if failfile is not None:
                    failfile.write(str(line))
            elif errfile is not None:
                errfile.write(str(line))
    finally:
        for handle in openedFiles:
            handle.close()
def run(args):
    """Add columns of a position-sorted .csv file to the INFO field of a .vcf.

    Every requested column is declared with an ``##INFO`` pragma (renamed
    with a numeric suffix when the ID is already taken) and filled in per
    variant in one of three modes:

    * ``args.exact``       -- value of the .csv row at the same position,
                              otherwise "." (nothing with
                              ``args.omit_mismatches``),
    * ``args.nearest``     -- value of the closest straddling .csv row,
    * ``args.interpolate`` -- linear interpolation between the two straddling
                              rows, falling back to the nearest row's value
                              when the values are not numeric.

    When none of the three is given, every column except 'CHROM' and 'POS'
    is added in exact mode. Both files are walked once, in parallel, using
    ``chromosomeOrder`` to decide which file is ahead.

    Raises Exception when the .vcf has no column header line or when a
    requested column does not exist in the .csv file.
    """
    delimiter,headers,chromColumn,posColumn,idColumn = sniffCsv(args.csvfile)
    if args.exact == None and args.nearest == None and args.interpolate == None:
        # NOTE(review): `temp` aliases `headers`, so these removals also
        # shift the column indices computed from `headers` below. Whether
        # that matches the layout of csvLine.columns cannot be told from
        # here -- confirm against csvLine before changing.
        temp = headers
        temp.remove('CHROM')
        temp.remove('POS')
        args.exact = temp

    csvbasename = os.path.split(args.csvfile)[1]
    takenTags = set()

    with open(args.infile,'r') as vcffile, open(args.csvfile,'r') as csvfile, open(args.outfile,'w') as outfile:

        def registerColumns(names, mode, csvLabel):
            # Declare one INFO tag per requested column; return {tag: column index}.
            tagToIndex = {}
            if names == None:
                return tagToIndex
            for column in names:
                if not column in headers:
                    raise Exception('Column header "%s" doesn\'t exist in %s' % (column,csvLabel))
                tag = column
                dupNumber = 2
                while tag in takenTags:
                    tag = column + str(dupNumber)
                    dupNumber += 1
                takenTags.add(tag)
                outfile.write('##INFO=<ID=%s,Number=.,Type=String,Description=\"addCSVtoVCF.py: Column %s from %s in --%s match mode\">\n' % (tag,column,csvbasename,mode))
                # Look up the ORIGINAL column name: the renamed tag is not in
                # headers, so indexing it raised ValueError on any collision.
                tagToIndex[tag] = headers.index(column)
            return tagToIndex

        # Copy the meta lines, remembering the INFO IDs already in use.
        vcfHeaderLine = ""
        while True:
            line = vcffile.readline()
            if not line:
                # End of file before the column header line (this used to
                # spin forever, because readline() keeps returning '').
                raise Exception("Missing a header line or something else is wrong...")
            if len(line) <= 1:
                continue
            elif line.startswith("##"):
                if line.startswith("##INFO"):
                    newTag = line[line.find("ID=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    takenTags.add(newTag)
                outfile.write(line)
            elif line.startswith("#"):
                vcfHeaderLine = line
                break
            else:
                raise Exception("Missing a header line or something else is wrong...")

        exact = registerColumns(args.exact, "exact", csvbasename)
        nearest = registerColumns(args.nearest, "nearest", args.csvfile)
        interpolate = registerColumns(args.interpolate, "interpolate", args.csvfile)

        outfile.write(vcfHeaderLine)

        # grab our first lines
        vLine = vcffile.readline()
        if not vLine:
            return  # the .vcf file has no variants; the header is all there is
        vLine = vcfLine(vLine.strip().split('\t'))
        vLine.extractChrAndPos()
        vLine.extractInfo()

        lastCline = None
        csvfile.readline()  # skip the header
        cLine = csvfile.readline()
        if not cLine:
            cLine = None    # the .csv file has no data rows
        else:
            cLine = csvLine(cLine.strip().split(delimiter),chromColumn,posColumn,idColumn)

        while True:
            speedAhead = cLine != None  # a flag that lets us just spit out .vcf lines because we know that either the .csv file has finished or there are no new .csv lines on the same chromosome

            # ... Are we even on the same chromosome?
            while speedAhead and cLine.chrom != vLine.chromosome:
                if chromosomeOrder.index(cLine.chrom) > chromosomeOrder.index(vLine.chromosome):
                    # the .csv file is ahead by a whole chromosome at least... keep lastCline intact and just spew out .vcf lines until it catches up
                    speedAhead = False
                    break
                else:
                    # okay, the .csv file is behind the .vcf file by at least a chromsome... speed ahead until we catch up or run out of .csv data
                    lastCline = cLine
                    cLine = csvfile.readline()
                    if not cLine:
                        # shoot... we're out of .csv data. We already know that lastCline wasn't on the same chromosome as the current vLine, so make it None as well
                        cLine = None
                        lastCline = None
                        speedAhead = False
                        break
                    else:
                        cLine = csvLine(cLine.strip().split(delimiter),chromColumn,posColumn,idColumn)

            # Okay, now we're on the same chromosome... zip ahead until cLine and lastCline are straddling vLine
            while speedAhead and cLine.pos < vLine.position:
                lastCline = cLine
                cLine = csvfile.readline()
                if not cLine:
                    # shoot... out of .csv data. We know lastCline is still on the same chromosome, so preserve that, but make cLine None so we know nothing is left
                    cLine = None
                    break
                else:
                    cLine = csvLine(cLine.strip().split(delimiter),chromColumn,posColumn,idColumn)

            # Whew! We're finally straddling the vLine...
            # NOTE(review): when the .csv file is a whole chromosome ahead,
            # cLine (and possibly lastCline) belong to a different chromosome
            # yet are still compared by position below -- verify that this is
            # intended.

            # Check the super-special case first (exact match)
            if cLine != None and cLine.pos == vLine.position:
                for x,i in exact.items():
                    vLine.info[x] = cLine.columns[i]
                for x,i in nearest.items():
                    vLine.info[x] = cLine.columns[i]
                for x,i in interpolate.items():
                    vLine.info[x] = cLine.columns[i]
            elif cLine != None:
                # cLine.pos will be > vLine.position
                if not args.omit_mismatches:
                    for x,i in exact.items():
                        vLine.info[x] = "."
                if lastCline == None:
                    for x,i in nearest.items():
                        vLine.info[x] = cLine.columns[i]
                    for x,i in interpolate.items():
                        vLine.info[x] = cLine.columns[i]
                else:
                    closestLine = lastCline if vLine.position - lastCline.pos <= cLine.pos - vLine.position else cLine
                    for x,i in nearest.items():
                        vLine.info[x] = closestLine.columns[i]
                    for x,i in interpolate.items():
                        try:
                            lastVal = float(lastCline.columns[i])
                            nextVal = float(cLine.columns[i])
                            vLine.info[x] = str(lastVal + (nextVal - lastVal)*(vLine.position - lastCline.pos)/(cLine.pos - lastCline.pos))
                        except (ValueError, ZeroDivisionError):
                            # Non-numeric values, or two rows at the same
                            # position: fall back to the nearest row.
                            vLine.info[x] = closestLine.columns[i]
            else:
                # cLine == None
                if lastCline == None:
                    if not args.omit_mismatches:
                        for x,i in exact.items():
                            vLine.info[x] = "."
                        for x,i in nearest.items():
                            vLine.info[x] = "."
                        for x,i in interpolate.items():
                            vLine.info[x] = "."
                else:
                    if not args.omit_mismatches:
                        for x,i in exact.items():
                            vLine.info[x] = "."
                    for x,i in nearest.items():
                        vLine.info[x] = lastCline.columns[i]
                    for x,i in interpolate.items():
                        vLine.info[x] = lastCline.columns[i]

            # Okay, we've copied over everything; write the line
            outfile.write(str(vLine))

            # Grab the next one
            vLine = vcffile.readline()
            if not vLine:
                break   # No more variants - we're done!
            vLine = vcfLine(vLine.strip().split('\t'))
            vLine.extractChrAndPos()
            vLine.extractInfo()
# Walk a .vcf file line by line: the #CHROM header yields the individual IDs
# (written as one comma-separated row), and for every variant line the two
# alleles of each individual's genotype are written, joined by "|" when the
# genotype is phased and by "/" otherwise.
# NOTE(review): `infile`, `outfile` and `individualIDs` are set up outside
# this fragment, and nothing here writes a separator or newline between
# genotypes -- presumably that follows this excerpt; confirm.
for line in infile:
    if len(line) <= 1:  # Skip blank lines
        continue
    elif line.startswith('#'):  # Skip header pragma lines
        if line.lower().startswith('#chrom'):   # Get all the individual IDs from the header, write it to the file
            # Columns 10 and up of the header row are the individual IDs
            individualIDs = line.strip().split('\t')[9:]
            outfile.write(','.join(individualIDs))
            outfile.write('\n')
    else:
        assert individualIDs != None    # In a well-formed .vcf file, we'll have run across the header line before any data

        # Here is where my library comes in - we first build a vcfLine object with the columns in the .vcf file
        line = vcfLine(line.strip().split('\t'))

        # Ideally, these two steps would be performed automatically, but sometimes we might want
        # to skip them to save time in practice:

        # We have to first extract the alleles before I can reference line.alleles
        line.extractAlleles()
        # We have to first extract all genotypes before I can reference line.genotypes
        line.extractGenotypes()

        for i in xrange(len(individualIDs)):
            # Each genotype is a 4-tuple; allele1/allele2 index into line.alleles
            allele1,allele2,phased,attributes = line.genotypes[i]
            outfile.write(line.alleles[allele1])    # write the first allele letters - to write the number, just write allele1
            outfile.write("|" if phased else "/")
            outfile.write(line.alleles[allele2])    # write the second allele letters - to write the number, just write allele2
# Walk a .vcf file line by line: the #CHROM header yields the individual IDs
# (written as one comma-separated row), and for every variant line the first
# allele of each individual's genotype is written.
# NOTE(review): `infile`, `outfile` and `individualIDs` are set up outside
# this fragment, and the loop body appears to be cut off after the first
# allele -- presumably the rest follows this excerpt; confirm.
for line in infile:
    if len(line) <= 1:  # Skip blank lines
        continue
    elif line.startswith('#'):  # Skip header pragma lines
        if line.lower().startswith('#chrom'):  # Get all the individual IDs from the header, write it to the file
            # Columns 10 and up of the header row are the individual IDs
            individualIDs = line.strip().split('\t')[9:]
            outfile.write(','.join(individualIDs))
            outfile.write('\n')
    else:
        assert individualIDs != None  # In a well-formed .vcf file, we'll have run across the header line before any data

        # Here is where my library comes in - we first build a vcfLine object with the columns in the .vcf file
        line = vcfLine(line.strip().split('\t'))

        # Ideally, these two steps would be performed automatically, but sometimes we might want
        # to skip them to save time in practice:

        # We have to first extract the alleles before I can reference line.alleles
        line.extractAlleles()
        # We have to first extract all genotypes before I can reference line.genotypes
        line.extractGenotypes()

        for i in xrange(len(individualIDs)):
            # Each genotype is a 4-tuple; allele1 indexes into line.alleles
            allele1, allele2, phased, attributes = line.genotypes[i]
            outfile.write(
                line.alleles[allele1]
            )  # write the first allele letters - to write the number, just write allele1
def run(args):
    """Flatten a .vcf file into a tab-delimited table with one header row.

    Pass 1 reads ``args.infile`` to collect the INFO and FORMAT keys, the
    individuals' names and, when values get one column each, how many columns
    every multi-valued field needs. Pass 2 writes ``args.outfile``: one row
    per variant, padded with empty cells so that every row is as wide as the
    header.

    args -- object with ``infile``, ``outfile``, ``ignore_fields`` (iterable
            or None) and the string flags ("true"/other)
            ``separate_info_fields``, ``numbered_alleles``,
            ``include_genotypes`` and ``include_genotype_attributes`` (the
            last one implies ``include_genotypes``).
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    numberAlleles = args.numbered_alleles.strip().lower() == "true"
    includeGenotypes = args.include_genotypes.strip().lower() == "true"
    includeGenotypeAttributes = args.include_genotype_attributes.strip().lower() == "true"
    if includeGenotypeAttributes:
        includeGenotypes = True
    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    numAltAlleles = 1
    numFilters = 1
    infoOrder = []      # INFO keys in order of first appearance
    infoHeaders = {}    # INFO key -> number of columns it needs
    peopleOrder = []
    formatOrder = []    # FORMAT keys (without GT) in order of first appearance
    formatHeaders = {}  # FORMAT key -> number of columns it needs

    with open(args.infile,'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            elif line.startswith("#"):
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag not in ignoreFields and newTag not in infoHeaders:
                        infoOrder.append(newTag)
                        infoHeaders[newTag] = 1
                elif temp.startswith("##format"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag not in ignoreFields and newTag != 'GT' and newTag not in formatHeaders:
                        formatOrder.append(newTag)
                        formatHeaders[newTag] = 1
                elif temp.startswith("#chrom"):
                    peopleOrder = line.split('\t')[9:]
            else:
                line = vcfLine(line.split('\t'))
                if separateInfoFields:
                    line.extractAlleles()
                    numAltAlleles = max(numAltAlleles,len(line.alleles)-1)  # don't include the REF allele
                    line.extractFilters()
                    numFilters = max(numFilters,len(line.filters))
                line.extractInfo()
                for k,v in line.info.items():
                    if k in ignoreFields:
                        # Ignored fields used to be re-added here as soon as
                        # a variant carried them.
                        continue
                    if k not in infoHeaders:
                        # Key without an ##INFO pragma: register the key
                        # itself (the stale `newTag` used to be appended).
                        infoOrder.append(k)
                        infoHeaders[k] = 1
                    if separateInfoFields and isinstance(v,list):
                        infoHeaders[k] = max(infoHeaders[k],len(v))
                if includeGenotypeAttributes:
                    line.extractFormat()
                    line.extractGenotypes()
                    for i,_person in enumerate(peopleOrder):
                        allele0,allele1,phased,attrs = line.genotypes[i]   # @UnusedVariable
                        # line.format[0] is skipped, so attrs[j] lines up
                        # with line.format[j+1].
                        for j,f in enumerate(line.format[1:]):
                            # Only size the columns of keys that get output
                            # (ignored/undeclared keys raised KeyError here).
                            if f in formatHeaders and len(attrs) > j:
                                formatHeaders[f] = max(formatHeaders[f],len(attrs[j].split(',')))

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:

        def writePadded(values, width):
            # Write one cell per value, then empty cells up to `width`.
            for v in values:
                outfile.write('\t%s' % v)
            for _ in range(len(values), width):
                outfile.write('\t')

        def writeEmpty(width):
            # Write the empty cell(s) of a field that is missing on this row.
            outfile.write('\t' * (width if separateInfoFields else 1))

        # ---- header row ----
        outfile.write('Chromosome\tPosition\tID\tReference_Allele')
        if separateInfoFields and numAltAlleles > 1:
            for x in range(numAltAlleles):
                outfile.write('\tAlternate_Allele_%i' % (x+1))
        else:
            outfile.write('\tAlternate_Allele')
        outfile.write('\tQual')
        if separateInfoFields and numFilters > 1:
            for x in range(numFilters):
                outfile.write('\tFilter_%i' % (x+1))
        else:
            outfile.write('\tFilter')
        for i in infoOrder:
            if separateInfoFields and infoHeaders[i] > 1:
                for x in range(infoHeaders[i]):
                    outfile.write('\t%s_%i' % (i,x+1))
            else:
                outfile.write('\t%s' % i)
        if includeGenotypes:
            for p in peopleOrder:
                outfile.write('\t%s_Allele_1' % p)
                outfile.write('\t%s_Allele_2' % p)
                if includeGenotypeAttributes:
                    outfile.write('\t%s_Phased' % p)
                    for f in formatOrder:
                        if separateInfoFields and formatHeaders[f] > 1:
                            for x in range(formatHeaders[f]):
                                outfile.write('\t%s_%s_%i' % (p,f,x+1))
                        else:
                            outfile.write('\t%s_%s' % (p,f))
        outfile.write('\n')

        # ---- one row per variant ----
        with open(args.infile,'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))

                line.extractChrAndPos()
                outfile.write("%s\t%i\t%s" % (line.chromosome,line.position,line.name))

                line.extractAlleles()
                outfile.write('\t%s' % line.alleles[0])
                if separateInfoFields:
                    writePadded(line.alleles[1:], numAltAlleles)
                else:
                    outfile.write('\t%s' % ','.join(line.alleles[1:]))

                line.extractQual()
                outfile.write("\t%f" % line.qual)

                line.extractFilters()
                if separateInfoFields:
                    writePadded(line.filters, numFilters)
                else:
                    outfile.write('\t%s' % ','.join(line.filters))

                line.extractInfo()
                for i in infoOrder:
                    if i not in line.info:
                        writeEmpty(infoHeaders[i])
                        continue
                    values = line.info[i]
                    if not isinstance(values,list):
                        values = [values]
                    # A None entry is shown as the field's own name.
                    for j,v in enumerate(values):
                        if v == None:
                            values[j] = i
                    if separateInfoFields:
                        writePadded(values, infoHeaders[i])
                    else:
                        outfile.write('\t%s' % ','.join(values))

                if includeGenotypes:
                    line.extractFormat()
                    line.extractGenotypes()
                    for i,_person in enumerate(peopleOrder):
                        allele0,allele1,phased,attrs = line.genotypes[i]
                        if allele0 == None:
                            allele0 = '.'
                        elif not numberAlleles:
                            allele0 = line.alleles[allele0]
                        if allele1 == None:
                            allele1 = '.'
                        elif not numberAlleles:
                            allele1 = line.alleles[allele1]
                        outfile.write('\t%s\t%s' % (allele0,allele1))
                        if includeGenotypeAttributes:
                            outfile.write('\t%s' % ('Y' if phased else 'N'))
                            for f in formatOrder:
                                if f not in line.format:
                                    writeEmpty(formatHeaders[f])
                                    continue
                                # -1 because attrs has no entry for
                                # line.format[0].
                                attrIndex = line.format.index(f)-1
                                if attrIndex >= len(attrs):
                                    writeEmpty(formatHeaders[f])
                                    continue
                                values = attrs[attrIndex].split(',')
                                if separateInfoFields:
                                    writePadded(values, formatHeaders[f])
                                else:
                                    outfile.write('\t%s' % ','.join(values))
                outfile.write("\n")
def run(args):
    """Convert a .vcf file into a tab-delimited .cvf file.

    Pass 1 reads ``args.infile`` to learn the chromosomes and their extents
    and to feed every Ref/Alt, QUAL, FILTER and INFO value to an
    ``infoDetails`` collector. Pass 2 writes ``args.outfile``: pragma lines
    describing the chromosomes, positions and every column, one header row,
    then one row per variant.

    args -- object with ``infile``, ``outfile``, ``max_strings``,
            ``ignore_fields`` (iterable or None), and the string flags
            ``separate_info_fields`` and ``count_separate`` ("true"/other).

    Raises Exception on a duplicate/reserved ``##INFO`` ID and when a variant
    uses an INFO key that has no ``##INFO`` pragma.
    """
    def pragmaValue(text, key):
        # Return the value following `key` (matched case-insensitively) up to
        # the next ',' or '>', or None when the key is absent.
        start = text.lower().find(key)
        if start < 0:
            return None
        value = text[start + len(key):]
        for terminator in (',', '>'):
            end = value.find(terminator)
            if end >= 0:
                value = value[:end]
        return value.strip()

    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"
    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    posLengthWarned = False
    allChrs = []
    positions = []

    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {
        "Ref/Alt": alleleColumn,
        "QUAL": qualColumn,
        "FILTER": filterColumn
    }

    # TODO: get the numeric ranges, all valid categorical values
    with open(args.infile, 'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            elif line.startswith("#"):
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=") + 3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag in infoFields:
                        raise Exception(
                            "Duplicate INFO ID or use of reserved ID:\t%s" %
                            newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings,
                                                     countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif temp.startswith("##filter"):
                    newTag = line[temp.find("id=") + 3:]
                    newTag = newTag[:newTag.find(',')]
                    filterColumn.addCategory(newTag, 0)
                elif temp.startswith("##contig"):
                    # Previously the ID was sliced out of `newTag` instead of
                    # the contig text, and the length was read 3 characters
                    # after the start of "length=" instead of after its end.
                    chrom = pragmaValue(line, "id=")
                    if chrom is None:
                        continue  # malformed pragma: nothing usable in it
                    chrom = standardizeChromosome(chrom)
                    chrLength = pragmaValue(line, "length=")
                    allChrs.append(chrom)
                    # A contig without a length starts at 0 and grows with
                    # the variants seen on it.
                    positions.append((0, int(chrLength) if chrLength else 0))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
            else:
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                # Register chromosomes that had no ##contig pragma. (The test
                # used to be inverted, so unknown chromosomes were never
                # added and the index() below failed.)
                if line.chromosome not in allChrs:
                    allChrs.append(line.chromosome)
                    positions.append((0, 0))
                chrIndex = allChrs.index(line.chromosome)
                if line.position > positions[chrIndex][1]:
                    positions[chrIndex] = (0, line.position)
                    if not posLengthWarned:
                        sys.stderr.write(
                            'WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.'
                        )
                        sys.stderr.write(
                            ' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.'
                        )
                        posLengthWarned = True

                line.extractAlleles()
                alleles = line.alleles
                if not separateInfoFields:
                    alleles = ",".join(alleles)
                # Pass the (possibly joined) value, mirroring FILTER below;
                # the joined string used to be computed and then discarded.
                alleleColumn.addArbitraryValue(alleles)

                line.extractQual()
                qualColumn.addArbitraryValue(line.qual)

                line.extractFilters()
                filters = line.filters
                if not separateInfoFields:
                    filters = ",".join(filters)
                filterColumn.addArbitraryValue(filters)

                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    # Split string values on commas (the receiver and the
                    # argument of split() used to be swapped). Values that
                    # are not strings are passed through untouched.
                    if separateInfoFields and hasattr(v, 'split'):
                        v = v.split(",")
                    infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" %
                      (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" %
                      ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        headers = []
        fieldOrder = sorted(infoFields.keys())
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated item of a pragma is its column header.
                headers.append(p.split("\t")[1])
        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        with open(args.infile, 'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                line.extractInfo()
                # Treat the fixed columns like INFO entries so that one loop
                # can write everything in fieldOrder.
                line.extractAlleles()
                line.info["Ref/Alt"] = line.alleles
                line.extractQual()
                line.info["QUAL"] = str(line.qual)
                line.extractFilters()
                line.info["FILTER"] = line.filters

                outfile.write("%s\t%i\t%s" % (line.chromosome, line.position,
                                              line.name))
                for f in fieldOrder:
                    # NOTE(review): raises KeyError when a variant lacks a
                    # declared INFO field; the placeholder the .cvf format
                    # expects is not visible here -- confirm before changing.
                    values = line.info[f]
                    if isinstance(values, list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")