示例#1
0
def run(args):
    """Copy a .vcf file, annotating variants with scores from a .bed file.

    ``sniffBed(args.bedfile)`` supplies the set of score names and the
    regions.  Every ##INFO ID already declared in ``args.infile`` is recorded
    so that each new tag (one per score name) gets a non-clashing ID; the new
    pragmas are written just before the column header line.  Every variant
    contained in a region gets ``<tag>=<score>`` added to its INFO field
    before being written to ``args.outfile``.
    """
    scoreNames, bedRegions = sniffBed(args.bedfile)

    if args.names is not None:
        # NOTE(review): difference_update *drops* the names listed in
        # args.names, so only regions whose name was NOT listed survive.
        # Confirm args.names is meant as an exclusion list, not a selection.
        scoreNames.difference_update(args.names)
        # Iterating bedRegions yields the region objects themselves (see the
        # annotation loop below), so filter into a new list rather than
        # unpacking (index, region) pairs and deleting by index.
        bedRegions = [r for r in bedRegions if r.name in scoreNames]

    takenTags = set()
    # Score name -> INFO ID actually declared in the header; the two differ
    # when the plain name was already taken by an existing ##INFO pragma.
    tagForName = {}

    with open(args.infile, 'r') as vcffile, open(args.outfile, 'w') as outfile:
        for line in vcffile:
            if len(line) <= 1:
                continue
            if line.startswith('##'):
                if line.startswith("##INFO"):
                    existing = line[line.find("ID=") + 3:]
                    takenTags.add(existing[:existing.find(',')])
                outfile.write(line)
            elif line.startswith('#'):
                # Column header line: declare the new tags right before it.
                # Sorted so the pragma order is stable between runs.
                for n in sorted(scoreNames):
                    dupCount = 2
                    newTag = n
                    while newTag in takenTags:
                        newTag = n + str(dupCount)
                        dupCount += 1
                    takenTags.add(newTag)
                    tagForName[n] = newTag
                    outfile.write("##INFO=<ID=%s,Number=.,Type=Float,Description=\"User column added with addBEDtoVCF.py\">\n" % newTag)
                outfile.write(line)
            else:
                line = vcfLine(line.strip().split('\t'))
                line.extractChrAndPos()

                for b in bedRegions:
                    if b.contains(line.chromosome, line.position):
                        line.extractInfo()
                        # Use the de-duplicated ID so the value matches the
                        # pragma that was written to the header.
                        line.info[tagForName.get(b.name, b.name)] = str(b.score)

                outfile.write(str(line))
def run(args):
    """Write a copy of a .vcf file that keeps only selected INFO fields.

    A first pass (``extractInfoFields``) collects every INFO field.  A field
    is kept unless it maxed out its string count (only checked when
    ``args.max_strings`` is positive), is missing from
    ``args.preserve_info`` (when given), or is listed in ``args.remove_info``
    (when given).  The second pass copies ``args.infile`` to ``args.outfile``,
    dropping the ##INFO pragmas and per-variant INFO entries of the other
    fields.

    Raises:
        Exception: if the second pass meets an ##INFO ID the first pass did
            not record.
    """
    print('Counting values...')
    max_strings = args.max_strings
    # A non-positive limit means "don't filter on string counts"; the scan
    # itself still needs a real limit, so fall back to the default.
    ignoreStringCounts = max_strings <= 0
    if ignoreStringCounts:
        max_strings = MAX_INFO_STRINGS
    infoFields = extractInfoFields(args.infile, max_strings)

    validFields = set()
    for k, f in infoFields.items():
        if not ignoreStringCounts and f.maxedOut:
            continue
        if args.preserve_info is not None and k not in args.preserve_info:
            continue
        if args.remove_info is not None and k in args.remove_info:
            continue
        validFields.add(k)

    print('Writing file...')
    # `with` closes both files even when the consistency check below raises.
    with open(args.outfile, 'w') as outfile, open(args.infile, 'r') as infile:
        for line in infile:
            if len(line) <= 1:
                continue
            if line.startswith("##INFO"):
                newTag = line[line.find("ID=") + 3:]
                newTag = newTag[:newTag.find(',')]
                if newTag not in infoFields:
                    raise Exception("Second pass lost info tag:\t%s" % newTag)
                if newTag in validFields:
                    outfile.write(line)
            elif line.startswith("#"):
                outfile.write(line)
            else:
                line = vcfLine(line.strip().split('\t'))
                line.extractInfo()
                # Snapshot the keys: entries are deleted during the loop.
                for k in list(line.info.keys()):
                    if k not in validFields:
                        del line.info[k]
                outfile.write(str(line))
def extractInfoFields(path,max_strings=MAX_INFO_STRINGS,tickFunction=None,numTicks=1000):
    """Scan a .vcf file and collect details about every INFO field.

    Args:
        path: file to read; opened with gzip when the name ends in 'gz'.
        max_strings: passed to each ``infoDetails`` that is created.
        tickFunction: optional progress callback, called whenever the read
            position passes the next tick mark.  A falsy return value aborts
            the scan.
        numTicks: the file size is divided by this to get the tick spacing.

    Returns:
        A dict mapping INFO ID to its ``infoDetails``, or None when
        ``tickFunction`` aborted the scan.

    Raises:
        Exception: on a duplicate ##INFO ID, or when a variant uses an INFO
            key that has no ##INFO pragma.
    """
    infoFields = {}

    # Floor division keeps the interval an integer on every Python version.
    tickInterval = os.path.getsize(path) // numTicks
    nextTick = 0

    if path.endswith('gz'):
        infile = gzip.open(path, 'rb')
    else:
        infile = open(path, 'rb')
    # try/finally so the handle is released on abort and on the raises below.
    try:
        for line in infile:
            if tickFunction is not None and infile.tell() > nextTick:
                if not tickFunction():
                    return None
                nextTick += tickInterval
            line = line.strip()
            if len(line) <= 1:
                continue
            if line.startswith("##INFO"):
                newTag = line[line.find("ID=") + 3:]
                newTag = newTag[:newTag.find(',')]
                if newTag in infoFields:
                    raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                infoFields[newTag] = infoDetails(newTag, max_strings, False)
            elif line.startswith("#"):
                continue
            else:
                line = vcfLine(line.split('\t'))
                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    infoFields[k].addArbitraryValue(v)
    finally:
        infile.close()
    return infoFields
示例#4
0
 def __init__(self, line):
     """Classify one raw .vcf line and record what is needed to order it.

     Sets ``self.type`` to a ``vcfKey`` constant.  ##contig lines also set
     ``self.chromosome``; other ## meta lines keep the lower-cased text in
     ``self.line``; variant lines set ``self.chromosome`` and
     ``self.position``.
     """
     if len(line.strip()) == 0:
         self.type = vcfKey.EMPTY
     elif line.startswith('##'):
         lowered = line.lower()
         if lowered.startswith('##fileformat'):
             self.type = vcfKey.FIRSTLINE
         elif lowered.startswith('##contig'):
             self.type = vcfKey.CONTIG
             # Locate the key in the lower-cased copy (searching for 'ID='
             # there can never match) but slice the original text so the
             # contig name keeps its case.
             contigID = line[lowered.find('id=') + 3:].strip()
             # The ID ends at the next attribute, or at the closing bracket
             # when ID is the only attribute.
             end = contigID.find(',')
             if end == -1:
                 end = contigID.find('>')
             if end != -1:
                 contigID = contigID[:end]
             self.chromosome = standardizeChromosome(contigID)
         else:
             self.type = vcfKey.OTHER_META
             self.line = lowered
     elif line.startswith('#'):
         self.type = vcfKey.HEADER
     else:
         self.type = vcfKey.REGULAR
         temp = vcfLine(line.strip().split('\t'))
         temp.extractChrAndPos()
         self.chromosome = temp.chromosome
         self.position = temp.position
示例#5
0
 def __init__(self, line):
     """Classify one raw .vcf line and record what is needed to order it.

     Sets ``self.type`` to a ``vcfKey`` constant.  ##contig lines also set
     ``self.chromosome``; other ## meta lines keep the lower-cased text in
     ``self.line``; variant lines set ``self.chromosome`` and
     ``self.position``.
     """
     if len(line.strip()) == 0:
         self.type = vcfKey.EMPTY
     elif line.startswith("##"):
         lowered = line.lower()
         if lowered.startswith("##fileformat"):
             self.type = vcfKey.FIRSTLINE
         elif lowered.startswith("##contig"):
             self.type = vcfKey.CONTIG
             # Locate the key in the lower-cased copy (searching for "ID="
             # there can never match) but slice the original text so the
             # contig name keeps its case.
             contigID = line[lowered.find("id=") + 3 :].strip()
             # The ID ends at the next attribute, or at the closing bracket
             # when ID is the only attribute.
             end = contigID.find(",")
             if end == -1:
                 end = contigID.find(">")
             if end != -1:
                 contigID = contigID[:end]
             self.chromosome = standardizeChromosome(contigID)
         else:
             self.type = vcfKey.OTHER_META
             self.line = lowered
     elif line.startswith("#"):
         self.type = vcfKey.HEADER
     else:
         self.type = vcfKey.REGULAR
         temp = vcfLine(line.strip().split("\t"))
         temp.extractChrAndPos()
         self.chromosome = temp.chromosome
         self.position = temp.position
def run(args):
    """Convert a .vcf file into a .cvf file.

    The first pass over ``args.infile`` gathers the chromosomes and their
    extents, and feeds the Ref/Alt, QUAL, FILTER and every declared INFO
    column into an ``infoDetails`` collector.  The second pass writes
    ``args.outfile``: the pragma lines produced by the collectors, a column
    header, and one row per variant.

    Raises:
        Exception: on a duplicate/reserved ##INFO ID, or when a variant uses
            an INFO key that has no ##INFO pragma.
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"

    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    posLengthWarned = False
    allChrs = []
    positions = []
    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {"Ref/Alt":alleleColumn,"QUAL":qualColumn,"FILTER":filterColumn}
    # TODO: get the numeric ranges, all valid categorical values

    with open(args.infile, 'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            if line.startswith("#"):
                # Match pragma names case-insensitively, but slice values out
                # of the original line so they keep their case.
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag in infoFields:
                        raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings, countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif temp.startswith("##filter"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    filterColumn.addCategory(newTag, 0)
                elif temp.startswith("##contig"):
                    chrom = line[temp.find("id=")+3:]
                    # Cut the contig's own text (not the last INFO/FILTER tag).
                    chrom = chrom[:chrom.find(',')]
                    chrom = standardizeChromosome(chrom)
                    # Skip the whole "length=" key (7 characters).
                    chrLength = line[temp.find("length=")+7:]
                    chrLength = chrLength[:chrLength.find(',')]

                    allChrs.append(chrom)
                    positions.append((0,int(chrLength)))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
            else:
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()

                # Register chromosomes that had no ##contig pragma.
                if line.chromosome not in allChrs:
                    allChrs.append(line.chromosome)
                    positions.append((0,0))
                chrIndex = allChrs.index(line.chromosome)
                if line.position > positions[chrIndex][1]:
                    positions[chrIndex] = (0,line.position)
                    if not posLengthWarned:
                        sys.stderr.write('WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.')
                        sys.stderr.write(' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.')
                        posLengthWarned = True

                line.extractAlleles()
                alleles = line.alleles
                if not separateInfoFields:
                    alleles = ",".join(alleles)
                # Feed the (possibly joined) value, mirroring the filters below.
                alleleColumn.addArbitraryValue(alleles)

                line.extractQual()
                qualColumn.addArbitraryValue(line.qual)

                line.extractFilters()
                filters = line.filters
                if not separateInfoFields:
                    filters = ",".join(filters)
                filterColumn.addArbitraryValue(filters)

                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    # Split comma-separated text into its values; anything
                    # that is not a plain string is passed through untouched.
                    if separateInfoFields and isinstance(v, str):
                        v = v.split(",")
                    infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" % (args.outfile,args.infile,str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" % ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        headers = []
        fieldOrder = sorted(infoFields)
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated token of a pragma is used as the
                # column header.
                headers.append(p.split("\t")[1])

        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        with open(args.infile, 'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                line.extractInfo()
                line.extractAlleles()
                line.info["Ref/Alt"] = line.alleles
                line.extractQual()
                line.info["QUAL"] = str(line.qual)
                line.extractFilters()
                line.info["FILTER"] = line.filters
                outfile.write("%s\t%i\t%s" % (line.chromosome,line.position,line.name))
                for f in fieldOrder:
                    # A variant rarely carries every declared INFO key; write
                    # "." for the ones it lacks instead of raising KeyError.
                    values = line.info.get(f, ".")
                    if isinstance(values,list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")
def run(args):
    """Split the variants of a .vcf file into pass / fail / error files.

    Header lines are copied to every output that is open.  For each variant
    the values of the columns listed in ``args.columns`` are substituted into
    the expression read from ``args.expression`` (``"True"`` when no file is
    given) and the result is evaluated: ``True`` goes to ``args.outfile``,
    ``False`` to ``args.failfile``, anything else (including an evaluation
    error) to ``args.errfile``.  When ``args.bed`` is given, variants outside
    every region go straight to the fail file.  Empty strings for the
    optional paths disable the corresponding feature.
    """
    infile = open(args.infile, "r")
    outfile = None
    failfile = None
    errfile = None
    # try/finally so that every handle that was opened -- including failfile --
    # is closed, even when something below raises.
    try:
        outfile = open(args.outfile, "w")
        if args.failfile != "":
            failfile = open(args.failfile, "w")
        if args.errfile != "":
            errfile = open(args.errfile, "w")

        if args.expression != "":
            with open(args.expression, "r") as tempfile:
                expression = tempfile.readline()
        else:
            expression = "True"

        columns = []
        if args.columns != "":
            with open(args.columns, "r") as tempfile:
                columns = [entry.strip() for entry in tempfile]

        bedRegions = None
        if args.bed != "":
            with open(args.bed, "r") as tempfile:
                bedRegions = [bedLine(entry.split()) for entry in tempfile]

        for line in infile:
            if len(line) <= 1:
                continue
            if line.startswith("#"):
                for handle in (outfile, failfile, errfile):
                    if handle is not None:
                        handle.write(line)
                continue

            line = vcfLine(line.strip().split("\t"))
            expArgs = []
            for c in columns:
                if c == "CHROM":
                    line.extractChrAndPos()
                    expArgs.append(line.chromosome)
                elif c == "POS":
                    line.extractChrAndPos()
                    expArgs.append(line.position)
                elif c == "ID":
                    line.extractChrAndPos()
                    expArgs.append(line.name)
                elif c == "QUAL":
                    line.extractQual()
                    expArgs.append(line.qual)
                elif c == "FILTER":
                    line.extractFilters()
                    expArgs.append(line.filters)
                else:
                    # Anything else is looked up as an INFO key.
                    line.extractInfo()
                    expArgs.append(line.info.get(c, "."))

            # first see if it fails the .bed regions
            if bedRegions is not None:
                line.extractChrAndPos()
                if not any(bed.contains(line.chromosome, line.position) for bed in bedRegions):
                    if failfile is not None:
                        failfile.write(str(line))
                    continue

            exp = expression % tuple(expArgs)
            # SECURITY: eval() runs arbitrary code taken from the expression
            # file and from values in the .vcf; only use trusted inputs.
            try:
                result = eval(exp)
            except Exception:
                # Not a bare except: Ctrl-C and SystemExit must still
                # propagate.  None routes the variant to the error file.
                result = None

            if result == True:
                outfile.write(str(line))
            elif result == False:
                if failfile is not None:
                    failfile.write(str(line))
            elif errfile is not None:
                errfile.write(str(line))
    finally:
        for handle in (infile, outfile, failfile, errfile):
            if handle is not None:
                handle.close()
示例#8
0
def run(args):
    """Annotate the variants of a .vcf file with columns from a .csv file.

    ``args.exact``, ``args.nearest`` and ``args.interpolate`` list the .csv
    columns to copy in each matching mode; when all three are None every
    column except CHROM and POS is copied in exact mode.  One ##INFO pragma
    is added per column (renamed with a numeric suffix if the ID is taken).
    Both files are then walked in step, which relies on both being ordered
    by chromosome (as in ``chromosomeOrder``) and position.

    Raises:
        Exception: if the .vcf has no column header line, or a requested
            column is not in the .csv header.
    """
    delimiter,headers,chromColumn,posColumn,idColumn = sniffCsv(args.csvfile)
    if args.exact is None and args.nearest is None and args.interpolate is None:
        # Work on a copy: removing from `headers` itself would shift the
        # indices that are later used to pick values out of each .csv row.
        temp = list(headers)
        temp.remove('CHROM')
        temp.remove('POS')
        args.exact = temp

    exact = {}
    nearest = {}
    interpolate = {}

    csvbasename = os.path.split(args.csvfile)[1]
    takenTags = set()

    with open(args.infile,'r') as vcffile, open(args.csvfile,'r') as csvfile, open(args.outfile,'w') as outfile:

        def nextVariant():
            # Next parsed variant, skipping blank lines; None at end of file.
            while True:
                raw = vcffile.readline()
                if not raw:
                    return None
                if raw.strip():
                    variant = vcfLine(raw.strip().split('\t'))
                    variant.extractChrAndPos()
                    variant.extractInfo()
                    return variant

        def nextCsvLine():
            # Next parsed .csv row, skipping blank lines; None at end of file.
            while True:
                raw = csvfile.readline()
                if not raw:
                    return None
                if raw.strip():
                    return csvLine(raw.strip().split(delimiter),chromColumn,posColumn,idColumn)

        vcfHeaderLine = ""
        while True:
            line = vcffile.readline()
            if not line:
                # readline() returns '' at end of file; without this check
                # the blank-line test below would loop forever.
                raise Exception("Missing a header line or something else is wrong...")
            if len(line) <= 1:
                continue
            elif line.startswith("##"):
                if line.startswith("##INFO"):
                    newTag = line[line.find("ID=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    takenTags.add(newTag)
                outfile.write(line)
            elif line.startswith("#"):
                vcfHeaderLine = line
                break
            else:
                raise Exception("Missing a header line or something else is wrong...")

        # (mode name, requested columns, tag -> column index map,
        #  file name shown in the "doesn't exist" error)
        modes = (
            ("exact", args.exact, exact, csvbasename),
            ("nearest", args.nearest, nearest, args.csvfile),
            ("interpolate", args.interpolate, interpolate, args.csvfile),
        )
        for mode, requested, target, shownName in modes:
            if requested is None:
                continue
            for column in requested:
                if column not in headers:
                    raise Exception('Column header "%s" doesn\'t exist in %s' % (column,shownName))
                tag = column
                dupNumber = 2
                while tag in takenTags:
                    tag = column + str(dupNumber)
                    dupNumber += 1
                takenTags.add(tag)
                outfile.write('##INFO=<ID=%s,Number=.,Type=String,Description=\"addCSVtoVCF.py: Column %s from %s in --%s match mode\">\n' % (tag,column,csvbasename,mode))
                # Look the column up by its .csv name; the (possibly renamed)
                # tag is not in `headers`.
                target[tag] = headers.index(column)

        outfile.write(vcfHeaderLine)

        # grab our first lines
        vLine = nextVariant()
        lastCline = None
        csvfile.readline()  # skip the header
        cLine = nextCsvLine()

        while vLine is not None:
            speedAhead = cLine is not None  # False once the .csv file has finished or is ahead by a chromosome; then we just spit out .vcf lines
            # ... Are we even on the same chromosome?
            while speedAhead and cLine.chrom != vLine.chromosome:
                if chromosomeOrder.index(cLine.chrom) > chromosomeOrder.index(vLine.chromosome):
                    # the .csv file is ahead by a whole chromosome at least... keep lastCline intact and just spew out .vcf lines until it catches up
                    speedAhead = False
                    break
                # okay, the .csv file is behind the .vcf file by at least a chromosome... speed ahead until we catch up or run out of .csv data
                lastCline = cLine
                cLine = nextCsvLine()
                if cLine is None:
                    # out of .csv data, and lastCline wasn't on the current chromosome either
                    lastCline = None
                    speedAhead = False
                    break
            # Okay, now we're on the same chromosome... zip ahead until cLine and lastCline are straddling vLine
            while speedAhead and cLine.pos < vLine.position:
                lastCline = cLine
                cLine = nextCsvLine()
                if cLine is None:
                    # out of .csv data; lastCline is still on the same chromosome, so keep it
                    break

            # Only rows on the variant's own chromosome may be matched by
            # position; a row left over from, or waiting on, another
            # chromosome is treated as absent.
            following = cLine if (cLine is not None and cLine.chrom == vLine.chromosome) else None
            preceding = lastCline if (lastCline is not None and lastCline.chrom == vLine.chromosome) else None

            # Check the super-special case first (exact match)
            if following is not None and following.pos == vLine.position:
                for mapping in (exact, nearest, interpolate):
                    for x, i in mapping.items():
                        vLine.info[x] = following.columns[i]
            elif following is not None:  # following.pos is > vLine.position
                if not args.omit_mismatches:
                    for x in exact:
                        vLine.info[x] = "."
                if preceding is None:
                    for mapping in (nearest, interpolate):
                        for x, i in mapping.items():
                            vLine.info[x] = following.columns[i]
                else:
                    # Ties go to the preceding row.
                    closestLine = preceding if vLine.position - preceding.pos <= following.pos - vLine.position else following
                    for x, i in nearest.items():
                        vLine.info[x] = closestLine.columns[i]
                    for x, i in interpolate.items():
                        try:
                            lastVal = float(preceding.columns[i])
                            nextVal = float(following.columns[i])
                            vLine.info[x] = str(lastVal + (nextVal - lastVal)*(vLine.position - preceding.pos)/(following.pos - preceding.pos))
                        except ValueError:
                            # Non-numeric column: fall back to nearest.
                            vLine.info[x] = closestLine.columns[i]
            elif preceding is None:
                # Nothing usable on this chromosome at all.
                if not args.omit_mismatches:
                    for mapping in (exact, nearest, interpolate):
                        for x in mapping:
                            vLine.info[x] = "."
            else:
                if not args.omit_mismatches:
                    for x in exact:
                        vLine.info[x] = "."
                for mapping in (nearest, interpolate):
                    for x, i in mapping.items():
                        vLine.info[x] = preceding.columns[i]

            # Okay, we've copied over everything; write the line
            outfile.write(str(vLine))
            # Grab the next one (None means no more variants - we're done!)
            vLine = nextVariant()
 # Walk the .vcf line by line: emit the sample IDs once (from the #CHROM
 # header), then the genotype of every sample for each variant.
 # NOTE(review): individualIDs must be initialised before this loop (not
 # visible here); otherwise the assert below raises NameError, not
 # AssertionError, when a data line precedes the header.
 for line in infile:
     if len(line) <= 1:
         # Skip blank lines
         continue
     elif line.startswith('#'):
         # Pragma lines are skipped; only the #CHROM column header is used
         if line.lower().startswith('#chrom'):
             # Get all the individual IDs from the header (columns 10 onward), write them to the file
             individualIDs = line.strip().split('\t')[9:]
             outfile.write(','.join(individualIDs))
             outfile.write('\n')
     else:
         assert individualIDs != None    # In a well-formed .vcf file, we'll have run across the header line before any data
         
         # Here is where my library comes in - we first build a vcfLine object with the columns in the .vcf file
         line = vcfLine(line.strip().split('\t'))
         
         # Ideally, these two steps would be performed automatically, but sometimes we might want
         # to skip them to save time in practice:
         
         # We have to first extract the alleles before I can reference line.alleles
         line.extractAlleles()
         # We have to first extract all genotypes before I can reference line.genotypes
         line.extractGenotypes()
         
         # One genotype per sample, in header order; nothing in this loop
         # writes a separator between samples or a newline after the variant.
         for i in xrange(len(individualIDs)):
             allele1,allele2,phased,attributes = line.genotypes[i]
             
             outfile.write(line.alleles[allele1])   # write the first allele letters - to write the number, just write allele1
             outfile.write("|" if phased else "/")
             outfile.write(line.alleles[allele2])   # write the second allele letters - to write the number, just write allele2
    # Walk the .vcf line by line: emit the sample IDs once (from the #CHROM
    # header), then genotype alleles for each variant.
    # NOTE(review): individualIDs must be initialised before this loop (not
    # visible here); otherwise the assert below raises NameError, not
    # AssertionError, when a data line precedes the header.
    for line in infile:
        if len(line) <= 1:
            # Skip blank lines
            continue
        elif line.startswith('#'):
            # Pragma lines are skipped; only the #CHROM column header is used
            if line.lower().startswith('#chrom'):
                # Get all the individual IDs from the header (columns 10 onward), write them to the file
                individualIDs = line.strip().split('\t')[9:]
                outfile.write(','.join(individualIDs))
                outfile.write('\n')
        else:
            assert individualIDs != None  # In a well-formed .vcf file, we'll have run across the header line before any data

            # Here is where my library comes in - we first build a vcfLine object with the columns in the .vcf file
            line = vcfLine(line.strip().split('\t'))

            # Ideally, these two steps would be performed automatically, but sometimes we might want
            # to skip them to save time in practice:

            # We have to first extract the alleles before I can reference line.alleles
            line.extractAlleles()
            # We have to first extract all genotypes before I can reference line.genotypes
            line.extractGenotypes()

            # NOTE(review): as visible here only the first allele of each
            # genotype is written (phased and allele2 are unpacked but unused);
            # the rest of the loop body appears to have been cut off.
            for i in xrange(len(individualIDs)):
                allele1, allele2, phased, attributes = line.genotypes[i]

                outfile.write(
                    line.alleles[allele1]
                )  # write the first allele letters - to write the number, just write allele1
def run(args):
    """Flatten a .vcf file into a tab-separated table.

    The first pass over ``args.infile`` collects the INFO and FORMAT tags,
    the sample names and, when fields are being separated, the largest number
    of values seen for alleles, filters and each tag.  The second pass writes
    ``args.outfile``: one header row, then one row per variant with
    chromosome, position, ID, alleles, QUAL, filters, INFO values and
    (optionally) per-sample genotypes and genotype attributes.  Tags listed
    in ``args.ignore_fields`` are left out.  The boolean options arrive as
    the strings "true"/"false".
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    numberAlleles = args.numbered_alleles.strip().lower() == "true"
    includeGenotypes = args.include_genotypes.strip().lower() == "true"
    includeGenotypeAttributes = args.include_genotype_attributes.strip().lower() == "true"
    if includeGenotypeAttributes:
        # Attributes are written next to the genotype columns.
        includeGenotypes = True

    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    numAltAlleles = 1
    numFilters = 1
    infoOrder = []      # INFO tags in first-seen order
    infoHeaders = {}    # INFO tag -> number of columns it needs
    peopleOrder = []    # sample names from the #CHROM line
    formatOrder = []    # FORMAT tags (other than GT) in first-seen order
    formatHeaders = {}  # FORMAT tag -> number of columns it needs

    with open(args.infile,'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            if line.startswith("#"):
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag not in ignoreFields:
                        if newTag not in infoHeaders:
                            infoOrder.append(newTag)
                        infoHeaders[newTag] = 1
                elif temp.startswith("##format"):
                    newTag = line[temp.find("id=")+3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag not in ignoreFields and newTag != 'GT':
                        if newTag not in formatHeaders:
                            formatOrder.append(newTag)
                            formatHeaders[newTag] = 1
                elif temp.startswith("#chrom"):
                    peopleOrder = line.split('\t')[9:]
            else:
                line = vcfLine(line.split('\t'))

                if separateInfoFields:
                    line.extractAlleles()
                    numAltAlleles = max(numAltAlleles,len(line.alleles)-1)  # don't include the REF allele

                    line.extractFilters()
                    numFilters = max(numFilters,len(line.filters))

                line.extractInfo()
                for k, v in line.info.items():
                    if k in ignoreFields:
                        # Ignored tags must not sneak back in through the data.
                        continue
                    if k not in infoHeaders:
                        # Undeclared tag: register the key that was just seen
                        # (not the last tag parsed from the header).
                        infoOrder.append(k)
                        infoHeaders[k] = 1
                    if separateInfoFields and isinstance(v,list):
                        infoHeaders[k] = max(infoHeaders[k],len(v))

                if includeGenotypeAttributes:
                    line.extractFormat()
                    line.extractGenotypes()
                    for i, p in enumerate(peopleOrder):
                        allele0,allele1,phased,attrs = line.genotypes[i]  # @UnusedVariable
                        for j, f in enumerate(line.format[1:]):
                            # Ignored or undeclared FORMAT tags get no columns.
                            if f in formatHeaders and len(attrs) > j:
                                formatHeaders[f] = max(formatHeaders[f],len(attrs[j].split(',')))

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write('Chromosome\tPosition\tID\tReference_Allele')
        if separateInfoFields and numAltAlleles > 1:
            for x in range(numAltAlleles):
                outfile.write('\tAlternate_Allele_%i' % (x+1))
        else:
            outfile.write('\tAlternate_Allele')

        outfile.write('\tQual')

        if separateInfoFields and numFilters > 1:
            for x in range(numFilters):
                outfile.write('\tFilter_%i' % (x+1))
        else:
            outfile.write('\tFilter')

        for i in infoOrder:
            if separateInfoFields and infoHeaders[i] > 1:
                for x in range(infoHeaders[i]):
                    outfile.write('\t%s_%i' % (i,x+1))
            else:
                outfile.write('\t%s' % i)

        if includeGenotypes:
            for p in peopleOrder:
                outfile.write('\t%s_Allele_1' % p)
                outfile.write('\t%s_Allele_2' % p)
                if includeGenotypeAttributes:
                    outfile.write('\t%s_Phased' % p)
                    for f in formatOrder:
                        if separateInfoFields and formatHeaders[f] > 1:
                            for x in range(formatHeaders[f]):
                                outfile.write('\t%s_%s_%i' % (p,f,x+1))
                        else:
                            outfile.write('\t%s_%s' % (p,f))
        outfile.write('\n')

        with open(args.infile,'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))

                line.extractChrAndPos()
                outfile.write("%s\t%i\t%s" % (line.chromosome,line.position,line.name))

                line.extractAlleles()
                outfile.write('\t%s' % line.alleles[0])
                if separateInfoFields:
                    for a in line.alleles[1:]:
                        outfile.write('\t%s' % a)
                    # Pad with empty cells up to the widest variant.
                    x = len(line.alleles)-1
                    while x < numAltAlleles:
                        outfile.write('\t')
                        x += 1
                else:
                    outfile.write('\t%s' % ','.join(line.alleles[1:]))

                line.extractQual()
                outfile.write("\t%f" % line.qual)

                line.extractFilters()
                if separateInfoFields:
                    for f in line.filters:
                        outfile.write('\t%s' % f)
                    x = len(line.filters)
                    while x < numFilters:
                        outfile.write('\t')
                        x += 1
                else:
                    outfile.write('\t%s' % ','.join(line.filters))

                line.extractInfo()
                for i in infoOrder:
                    if i not in line.info:
                        if separateInfoFields:
                            for x in range(infoHeaders[i]):
                                outfile.write('\t')
                        else:
                            outfile.write('\t')
                    else:
                        values = line.info[i]
                        if not isinstance(values,list):
                            values = [values]
                        # A value of None is written as the tag's own name.
                        for j, v in enumerate(values):
                            if v is None:
                                values[j] = i
                        if separateInfoFields:
                            for v in values:
                                outfile.write('\t%s' % v)
                            x = len(values)
                            while x < infoHeaders[i]:
                                outfile.write('\t')
                                x += 1
                        else:
                            outfile.write('\t%s' % ','.join(values))

                if includeGenotypes:
                    line.extractFormat()
                    line.extractGenotypes()
                    for i, p in enumerate(peopleOrder):
                        allele0,allele1,phased,attrs = line.genotypes[i]
                        if allele0 is None:
                            allele0 = '.'
                        elif not numberAlleles:
                            allele0 = line.alleles[allele0]
                        if allele1 is None:
                            allele1 = '.'
                        elif not numberAlleles:
                            allele1 = line.alleles[allele1]
                        outfile.write('\t%s\t%s' % (allele0,allele1))
                        if includeGenotypeAttributes:
                            outfile.write('\t%s' % ('Y' if phased else 'N'))
                            for f in formatOrder:
                                # Index into attrs; -1 because line.format
                                # starts with an entry that attrs lacks.
                                attrIndex = line.format.index(f)-1 if f in line.format else None
                                if attrIndex is None or attrIndex >= len(attrs):
                                    # Tag absent for this variant/sample:
                                    # emit empty cells to keep columns aligned.
                                    if separateInfoFields:
                                        for x in range(formatHeaders[f]):
                                            outfile.write('\t')
                                    else:
                                        outfile.write('\t')
                                else:
                                    values = attrs[attrIndex].split(',')
                                    if separateInfoFields:
                                        for v in values:
                                            outfile.write('\t%s' % v)
                                        x = len(values)
                                        while x < formatHeaders[f]:
                                            outfile.write('\t')
                                            x += 1
                                    else:
                                        outfile.write('\t%s' % ','.join(values))
                outfile.write("\n")
示例#12
0
def run(args):
    """Convert the .vcf file at args.infile into a tab-delimited file at args.outfile.

    The input is read twice.  The first pass registers every ##INFO, ##FILTER
    and ##contig pragma and feeds each variant's alleles, QUAL, FILTER and INFO
    values to the matching infoDetails object.  The second pass writes the
    header pragmas followed by one row per variant.

    Attributes read from args:
        infile, outfile       -- input and output paths.
        separate_info_fields  -- the string "true" (any case) splits
                                 comma-separated values into separate columns.
        count_separate        -- the string "true" (any case); forwarded to
                                 infoDetails for every ##INFO field.
        ignore_fields         -- iterable of INFO IDs to mark as maxed out, or None.
        max_strings           -- forwarded to infoDetails for FILTER and INFO fields.

    Raises:
        Exception -- if an ##INFO ID is duplicated or collides with one of the
                     reserved columns ("Ref/Alt", "QUAL", "FILTER"), or if a
                     variant uses an INFO key that has no ##INFO pragma.
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"

    ignoreFields = args.ignore_fields
    if ignoreFields is None:
        ignoreFields = []
    ignoreFields = set(ignoreFields)

    posLengthWarned = False
    allChrs = []
    positions = []
    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {
        "Ref/Alt": alleleColumn,
        "QUAL": qualColumn,
        "FILTER": filterColumn
    }
    # TODO: get the numeric ranges, all valid categorical values

    # First pass: collect pragmas and per-column statistics.
    with open(args.infile, 'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue
            elif line.startswith("#"):
                # Keys are matched case-insensitively, but values are sliced
                # from the original line so their case is preserved.
                temp = line.lower()
                if temp.startswith("##info"):
                    newTag = line[temp.find("id=") + 3:]
                    newTag = newTag[:newTag.find(',')]
                    if newTag in infoFields:
                        raise Exception(
                            "Duplicate INFO ID or use of reserved ID:\t%s" %
                            newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings,
                                                     countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif temp.startswith("##filter"):
                    newTag = line[temp.find("id=") + 3:]
                    newTag = newTag[:newTag.find(',')]
                    filterColumn.addCategory(newTag, 0)
                elif temp.startswith("##contig"):
                    chrom = line[temp.find("id=") + 3:]
                    # When no comma follows, find() returns -1 and the slice
                    # drops the closing '>' of the pragma instead.
                    chrom = chrom[:chrom.find(',')]
                    chrom = standardizeChromosome(chrom)

                    lengthKey = "length="
                    lengthStart = temp.find(lengthKey)
                    if lengthStart < 0:
                        # No declared length: start at 0 and let the observed
                        # variant positions extend it below.
                        chrLength = 0
                    else:
                        chrLength = line[lengthStart + len(lengthKey):]
                        chrLength = int(chrLength[:chrLength.find(',')])

                    allChrs.append(chrom)
                    positions.append((0, chrLength))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
            else:
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()

                # Register chromosomes that had no ##contig pragma.
                if line.chromosome not in allChrs:
                    allChrs.append(line.chromosome)
                    positions.append((0, 0))
                chrIndex = allChrs.index(line.chromosome)
                if line.position > positions[chrIndex][1]:
                    positions[chrIndex] = (0, line.position)
                    if not posLengthWarned:
                        sys.stderr.write(
                            'WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.'
                        )
                        sys.stderr.write(
                            ' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.'
                        )
                        posLengthWarned = True

                # Each column receives a list when fields are separated and a
                # single comma-joined string otherwise.
                line.extractAlleles()
                alleles = line.alleles
                if not separateInfoFields:
                    alleles = ",".join(alleles)
                alleleColumn.addArbitraryValue(alleles)

                line.extractQual()
                qualColumn.addArbitraryValue(line.qual)

                line.extractFilters()
                filters = line.filters
                if not separateInfoFields:
                    filters = ",".join(filters)
                filterColumn.addArbitraryValue(filters)

                line.extractInfo()
                for k, v in line.info.items():
                    if k not in infoFields:
                        raise Exception("Missing ##INFO pragma for: %s" % k)
                    if separateInfoFields:
                        v = v.split(",")
                    infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" %
                      (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" %
                      ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        # One pragma line per column; its second tab-separated token is the
        # column header.
        headers = []
        fieldOrder = sorted(infoFields)
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                headers.append(p.split("\t")[1])

        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        # Second pass: emit one row per variant, columns in fieldOrder.
        with open(args.infile, 'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                line = vcfLine(line.split('\t'))
                line.extractChrAndPos()
                line.extractInfo()
                line.extractAlleles()
                line.info["Ref/Alt"] = line.alleles
                line.extractQual()
                line.info["QUAL"] = str(line.qual)
                line.extractFilters()
                line.info["FILTER"] = line.filters
                outfile.write("%s\t%i\t%s" %
                              (line.chromosome, line.position, line.name))
                for f in fieldOrder:
                    # NOTE(review): this lookup raises KeyError when a variant
                    # lacks one of the declared INFO fields -- presumably
                    # extractInfo() only stores keys present on the line; confirm
                    # and decide how many empty columns to pad with.
                    values = line.info[f]
                    if isinstance(values, list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")