def run(args):
    """Write genotypes or allele frequencies for one KGP population as a table.

    Each line yielded by ``kgp.iterate()`` becomes one tab-delimited row that
    starts with the chromosome, position and name of the line.  What follows
    depends on ``args.frequencies_only``:

    * frequency mode: one ``<allele>:<TAB><frequency>`` pair per allele
      observed among the population's non-missing calls;
    * genotype mode: two allele columns per individual, with '.' standing
      in for a missing call.

    Args:
        args: Parsed options.  Reads ``data``, ``outfile``, ``pop`` and
            ``frequencies_only`` (a string; any value starting with "t" or
            "T" selects frequency mode).
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    freqOnly = args.frequencies_only.lower().startswith('t')

    # The with-block guarantees the output file is closed even when a line
    # raises part-way through the run.
    with open(args.outfile, 'wb') as outfile:
        wroteHeader = False
        for line in kgp.iterate():
            members = kgp.populations[args.pop]

            if not wroteHeader:
                # The header is written lazily, on the first line only, so
                # an empty iteration produces an empty file.
                header = ['CHROM\tPOS\tID']
                if not freqOnly:
                    header.extend('\t%s_1\t%s_2' % (p, p) for p in members)
                header.append('\n')
                outfile.write(''.join(header))
                wroteHeader = True

            line.extractChrAndPos()
            line.extractAlleles()
            line.extractGenotypes()

            # Collect the row pieces and emit them with a single write.
            row = ['\t'.join(
                [line.chromosome, str(line.position), line.name])]

            if freqOnly:
                # presumably countingDict defaults missing keys to 0 --
                # confirm against its definition
                counts = countingDict()
                total = 0.0
                for p in members:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    # Both calls of the genotype are counted; missing
                    # calls (None) are skipped.
                    for call in (genotype[0], genotype[1]):
                        if call is not None:
                            counts[call] += 1
                            total += 1.0
                # total is a float, so the division below is a true division.
                for i, c in counts.iteritems():
                    row.append('\t%s:\t%f' % (line.alleles[i], c / total))
            else:
                for p in members:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    a1, a2 = genotype[0], genotype[1]
                    # Genotype entries index into line.alleles.
                    a1 = '.' if a1 is None else line.alleles[a1]
                    a2 = '.' if a2 is None else line.alleles[a2]
                    row.append('\t%s\t%s' % (a1, a2))

            row.append('\n')
            outfile.write(''.join(row))
def run(args):
    """Write genotypes or allele frequencies for one KGP population as a table.

    Each line yielded by ``kgp.iterate()`` becomes one tab-delimited row that
    starts with the chromosome, position and name of the line.  What follows
    depends on ``args.frequencies_only``:

    * frequency mode: one ``<allele>:<TAB><frequency>`` pair per allele
      observed among the population's non-missing calls;
    * genotype mode: two allele columns per individual, with '.' standing
      in for a missing call.

    Args:
        args: Parsed options.  Reads ``data``, ``outfile``, ``pop`` and
            ``frequencies_only`` (a string; any value starting with "t" or
            "T" selects frequency mode).
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    freqOnly = args.frequencies_only.lower().startswith('t')

    # The with-block guarantees the output file is closed even when a line
    # raises part-way through the run.
    with open(args.outfile, 'wb') as outfile:
        wroteHeader = False
        for line in kgp.iterate():
            members = kgp.populations[args.pop]

            if not wroteHeader:
                # The header is written lazily, on the first line only, so
                # an empty iteration produces an empty file.
                header = ['CHROM\tPOS\tID']
                if not freqOnly:
                    header.extend('\t%s_1\t%s_2' % (p, p) for p in members)
                header.append('\n')
                outfile.write(''.join(header))
                wroteHeader = True

            line.extractChrAndPos()
            line.extractAlleles()
            line.extractGenotypes()

            # Collect the row pieces and emit them with a single write.
            row = ['\t'.join(
                [line.chromosome, str(line.position), line.name])]

            if freqOnly:
                # presumably countingDict defaults missing keys to 0 --
                # confirm against its definition
                counts = countingDict()
                total = 0.0
                for p in members:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    # Both calls of the genotype are counted; missing
                    # calls (None) are skipped.
                    for call in (genotype[0], genotype[1]):
                        if call is not None:
                            counts[call] += 1
                            total += 1.0
                # total is a float, so the division below is a true division.
                for i, c in counts.iteritems():
                    row.append('\t%s:\t%f' % (line.alleles[i], c / total))
            else:
                for p in members:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    a1, a2 = genotype[0], genotype[1]
                    # Genotype entries index into line.alleles.
                    a1 = '.' if a1 is None else line.alleles[a1]
                    a2 = '.' if a2 is None else line.alleles[a2]
                    row.append('\t%s\t%s' % (a1, a2))

            row.append('\n')
            outfile.write(''.join(row))
# Example no. 3
# 0
def run(args, tickFunction=tick, numTicks=100):
    """Add per-population statistics to a .vcf file as new INFO fields.

    The requested calculations come from ``args.calculate_AF``,
    ``args.calculate_Carriage`` and ``args.calculate_Samples_w_calls``.  Each
    entry is a sequence ``[target, background, direction, hack]`` in which
    everything after ``target`` is optional:

    * ``target`` -- population the statistic is calculated for.
    * ``background`` -- population whose allele frequencies define the
      allele order; when omitted, the line's own allele order is used and
      the tag is suffixed with "ALT".
    * ``direction`` -- 'ASC' (the default) for ascending frequency; any
      other value is treated as descending.
    * ``hack`` -- a string starting with "t" or "T" makes the calculation
      fall back to the line's own allele order when no allele order could
      be derived from the background population.

    One ##INFO header line is written per calculation and per allele
    ordering, followed by the original header line and every annotated line.

    Args:
        args: Parsed options.  Besides the calculation lists, reads
            ``data``, ``infile``, ``outfile`` and ``popFile``.
        tickFunction: Passed through to ``kgp.iterateVcf``.
        numTicks: Passed through to ``kgp.iterateVcf``.

    Raises:
        Exception: If a calculation names no target population, or a custom
            population lists a sample name that is not among the KGP
            header's sample columns.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    outfile, takenTags, headerline, myPopulations, myPopulationIndices = parseVcfHeader(
        args.infile, args.outfile, args.popFile)

    # {INFO tag: (allStats statistic, targetPop, backgroundPop, "ASC"/"DEC",
    #             REF/ALT hack: True/False, backTag)}
    statsToCalculate = {}
    # {allele-order INFO tag: (direction, backgroundPop)}
    alleleOrders = {}

    def orderName(direction):
        """Return the wording used in INFO descriptions for a direction."""
        return "ascending" if direction == 'ASC' else "descending"

    def storeCalcDetails(stat, calculation):
        """Register one calculation and return its ##INFO header line."""
        if not calculation:
            raise Exception('Must specify a target population!')
        target = calculation[0]
        background = calculation[1] if len(calculation) > 1 else None
        direction = calculation[2] if len(calculation) > 2 else 'ASC'
        hack = len(calculation) > 3 and calculation[3].strip().lower(
        ).startswith('t')
        tag = "%s_%s_" % (target, allStats.STAT_NAMES[stat])
        if background is None:
            backTag = "ALT"
            tag += backTag
            infoLine = "##INFO=<ID=%s,Number=A,Type=Float,Description=\"calcStats.py: %s for the %s population\">\n" % (
                tag, allStats.STAT_NAMES[stat], target)
        else:
            backTag = "%s_%s_AO" % (direction, background)
            temp = backTag
            dupNumber = 2
            # The allele-order tag must not collide with an existing INFO
            # tag or with a population name; append a number until it is
            # free.  Membership is tested on the mappings directly rather
            # than on a key iterator.
            while (backTag in takenTags or backTag in myPopulations
                   or backTag in kgp.populations):
                backTag = temp + str(dupNumber)
                dupNumber += 1
            # Calculations sharing direction and background share one
            # allele-order tag.
            alleleOrders[backTag] = (direction, background)
            tag += backTag
            if hack:
                tag += "_rHack"
            hackNote = (
                " When %s has no data, the REF/ALT allele order is used." %
                background) if hack else ""
            infoLine = "##INFO=<ID=%s,Number=.,Type=Float,Description=\"calcStats.py: %s for the %s population, with alleles ordered by %s AF in the %s population (%s).%s\">\n" % (
                tag, allStats.STAT_NAMES[stat], target, orderName(direction),
                background, backTag, hackNote)
        dupNumber = 2
        temp = tag
        while tag in takenTags:
            tag = temp + str(dupNumber)
            dupNumber += 1
        statsToCalculate[tag] = (stat, target, background, direction, hack,
                                 backTag)
        return infoLine

    def getPopIndices(pop):
        """Return (vcfIndices, kgpIndices) for the members of a population.

        Members of a custom population given as strings are looked up among
        the KGP sample columns; all other members are passed through as
        indices into the user's .vcf.  Populations that are not custom
        take their indices from ``kgp.populationIndices``.
        """
        if pop not in myPopulationIndices:
            return ([], kgp.populationIndices[pop])
        vcfIndices = []
        kgpIndices = []
        for i in myPopulationIndices[pop]:
            if isinstance(i, str):
                # Sample columns start after the nine fixed columns.
                # Searching only that slice keeps the index correct even
                # if a sample shares its name with a fixed column.
                kgpSamples = kgp.header[9:]
                try:
                    kgpIndices.append(kgpSamples.index(i))
                except ValueError:
                    raise Exception(
                        "Unknown sample (not in your .vcf or the KGP): %s"
                        % i)
            else:
                vcfIndices.append(i)
        return (vcfIndices, kgpIndices)

    # Close the output file even if writing or a calculation fails.
    try:
        if args.calculate_AF is not None:
            for calculation in args.calculate_AF:
                outfile.write(storeCalcDetails(allStats.AF, calculation))
        if args.calculate_Carriage is not None:
            for calculation in args.calculate_Carriage:
                outfile.write(
                    storeCalcDetails(allStats.Carriage, calculation))
        if args.calculate_Samples_w_calls is not None:
            for calculation in args.calculate_Samples_w_calls:
                assert len(calculation) == 1
                outfile.write(
                    storeCalcDetails(allStats.Samples_w_calls, calculation))

        for popTag, (direction, background) in alleleOrders.items():
            outfile.write(
                "##INFO=<ID=%s,Number=.,Type=String,Description=\"calcStats.py: All observed alleles for each locus, ordered by %s AF in the %s population.\">\n"
                % (popTag, orderName(direction), background))
        outfile.write(headerline)

        for vcfLine, kgpLine in kgp.iterateVcf(args.infile,
                                               tickFunction=tickFunction,
                                               numTicks=numTicks):
            # first get the allele orders we need, add them as INFO fields
            alleleLists = {}  # popTag : [] (None when no order is available)
            vcfLine.extractAlleles()
            vcfLine.extractInfo()
            if kgpLine is not None:
                kgpLine.extractAlleles()

            for popTag, (direction, background) in alleleOrders.items():
                vcfIndices, kgpIndices = getPopIndices(background)
                tempAlleles = set(vcfLine.alleles)
                if kgpLine is not None:
                    tempAlleles.update(kgpLine.alleles)
                tempAlleles = list(tempAlleles)
                tempFreqs = allStats.calculate(allStats.AF, vcfLine,
                                               vcfIndices, kgpLine,
                                               kgpIndices, tempAlleles)
                if len(tempFreqs) < 1 or math.isinf(tempFreqs[0]):
                    vcfLine.info[popTag] = "."
                    alleleLists[popTag] = None
                    continue
                # NOTE: the sort is stable, so alleles with equal frequency
                # keep the iteration order of the set built above.
                alleleLists[popTag] = sorted(
                    tempAlleles,
                    key=lambda a: tempFreqs[tempAlleles.index(a)],
                    reverse=(direction != 'ASC'))
                vcfLine.info[popTag] = ",".join(alleleLists[popTag])

            # now calculate based on those allele orders
            for tag, (stat, target, background, direction, hack,
                      backTag) in statsToCalculate.items():
                vcfIndices, kgpIndices = getPopIndices(target)
                if backTag == 'ALT':
                    alleles = vcfLine.alleles
                else:
                    alleles = alleleLists[backTag]
                if alleles is None:
                    if hack:
                        alleles = vcfLine.alleles
                    else:
                        vcfLine.info[tag] = "."
                        continue
                result = allStats.calculate(stat, vcfLine, vcfIndices,
                                            kgpLine, kgpIndices, alleles)
                if isinstance(result, list):
                    result = ",".join([str(r) for r in result])
                else:
                    result = str(result)
                vcfLine.info[tag] = result

            outfile.write(str(vcfLine))
    finally:
        outfile.close()
def run(args, tickFunction=tick, numTicks=100):
    """Add per-population statistics to a .vcf file as new INFO fields.

    The requested calculations come from ``args.calculate_AF``,
    ``args.calculate_Carriage`` and ``args.calculate_Samples_w_calls``.  Each
    entry is a sequence ``[target, background, direction, hack]`` in which
    everything after ``target`` is optional:

    * ``target`` -- population the statistic is calculated for.
    * ``background`` -- population whose allele frequencies define the
      allele order; when omitted, the line's own allele order is used and
      the tag is suffixed with "ALT".
    * ``direction`` -- 'ASC' (the default) for ascending frequency; any
      other value is treated as descending.
    * ``hack`` -- a string starting with "t" or "T" makes the calculation
      fall back to the line's own allele order when no allele order could
      be derived from the background population.

    One ##INFO header line is written per calculation and per allele
    ordering, followed by the original header line and every annotated line.

    Args:
        args: Parsed options.  Besides the calculation lists, reads
            ``data``, ``infile``, ``outfile`` and ``popFile``.
        tickFunction: Passed through to ``kgp.iterateVcf``.
        numTicks: Passed through to ``kgp.iterateVcf``.

    Raises:
        Exception: If a calculation names no target population, or a custom
            population lists a sample name that is not among the KGP
            header's sample columns.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    outfile, takenTags, headerline, myPopulations, myPopulationIndices = parseVcfHeader(
        args.infile, args.outfile, args.popFile)

    # {INFO tag: (allStats statistic, targetPop, backgroundPop, "ASC"/"DEC",
    #             REF/ALT hack: True/False, backTag)}
    statsToCalculate = {}
    # {allele-order INFO tag: (direction, backgroundPop)}
    alleleOrders = {}

    def orderName(direction):
        """Return the wording used in INFO descriptions for a direction."""
        return "ascending" if direction == 'ASC' else "descending"

    def storeCalcDetails(stat, calculation):
        """Register one calculation and return its ##INFO header line."""
        if not calculation:
            raise Exception('Must specify a target population!')
        target = calculation[0]
        background = calculation[1] if len(calculation) > 1 else None
        direction = calculation[2] if len(calculation) > 2 else 'ASC'
        hack = len(calculation) > 3 and calculation[3].strip().lower(
        ).startswith('t')
        tag = "%s_%s_" % (target, allStats.STAT_NAMES[stat])
        if background is None:
            backTag = "ALT"
            tag += backTag
            infoLine = "##INFO=<ID=%s,Number=A,Type=Float,Description=\"calcStats.py: %s for the %s population\">\n" % (
                tag, allStats.STAT_NAMES[stat], target)
        else:
            backTag = "%s_%s_AO" % (direction, background)
            temp = backTag
            dupNumber = 2
            # The allele-order tag must not collide with an existing INFO
            # tag or with a population name; append a number until it is
            # free.  Membership is tested on the mappings directly rather
            # than on a key iterator.
            while (backTag in takenTags or backTag in myPopulations
                   or backTag in kgp.populations):
                backTag = temp + str(dupNumber)
                dupNumber += 1
            # Calculations sharing direction and background share one
            # allele-order tag.
            alleleOrders[backTag] = (direction, background)
            tag += backTag
            if hack:
                tag += "_rHack"
            hackNote = (
                " When %s has no data, the REF/ALT allele order is used." %
                background) if hack else ""
            infoLine = "##INFO=<ID=%s,Number=.,Type=Float,Description=\"calcStats.py: %s for the %s population, with alleles ordered by %s AF in the %s population (%s).%s\">\n" % (
                tag, allStats.STAT_NAMES[stat], target, orderName(direction),
                background, backTag, hackNote)
        dupNumber = 2
        temp = tag
        while tag in takenTags:
            tag = temp + str(dupNumber)
            dupNumber += 1
        statsToCalculate[tag] = (stat, target, background, direction, hack,
                                 backTag)
        return infoLine

    def getPopIndices(pop):
        """Return (vcfIndices, kgpIndices) for the members of a population.

        Members of a custom population given as strings are looked up among
        the KGP sample columns; all other members are passed through as
        indices into the user's .vcf.  Populations that are not custom
        take their indices from ``kgp.populationIndices``.
        """
        if pop not in myPopulationIndices:
            return ([], kgp.populationIndices[pop])
        vcfIndices = []
        kgpIndices = []
        for i in myPopulationIndices[pop]:
            if isinstance(i, str):
                # Sample columns start after the nine fixed columns.
                # Searching only that slice keeps the index correct even
                # if a sample shares its name with a fixed column.
                kgpSamples = kgp.header[9:]
                try:
                    kgpIndices.append(kgpSamples.index(i))
                except ValueError:
                    raise Exception(
                        "Unknown sample (not in your .vcf or the KGP): %s"
                        % i)
            else:
                vcfIndices.append(i)
        return (vcfIndices, kgpIndices)

    # Close the output file even if writing or a calculation fails.
    try:
        if args.calculate_AF is not None:
            for calculation in args.calculate_AF:
                outfile.write(storeCalcDetails(allStats.AF, calculation))
        if args.calculate_Carriage is not None:
            for calculation in args.calculate_Carriage:
                outfile.write(
                    storeCalcDetails(allStats.Carriage, calculation))
        if args.calculate_Samples_w_calls is not None:
            for calculation in args.calculate_Samples_w_calls:
                assert len(calculation) == 1
                outfile.write(
                    storeCalcDetails(allStats.Samples_w_calls, calculation))

        for popTag, (direction, background) in alleleOrders.items():
            outfile.write(
                "##INFO=<ID=%s,Number=.,Type=String,Description=\"calcStats.py: All observed alleles for each locus, ordered by %s AF in the %s population.\">\n"
                % (popTag, orderName(direction), background))
        outfile.write(headerline)

        for vcfLine, kgpLine in kgp.iterateVcf(args.infile,
                                               tickFunction=tickFunction,
                                               numTicks=numTicks):
            # first get the allele orders we need, add them as INFO fields
            alleleLists = {}  # popTag : [] (None when no order is available)
            vcfLine.extractAlleles()
            vcfLine.extractInfo()
            if kgpLine is not None:
                kgpLine.extractAlleles()

            for popTag, (direction, background) in alleleOrders.items():
                vcfIndices, kgpIndices = getPopIndices(background)
                tempAlleles = set(vcfLine.alleles)
                if kgpLine is not None:
                    tempAlleles.update(kgpLine.alleles)
                tempAlleles = list(tempAlleles)
                tempFreqs = allStats.calculate(allStats.AF, vcfLine,
                                               vcfIndices, kgpLine,
                                               kgpIndices, tempAlleles)
                if len(tempFreqs) < 1 or math.isinf(tempFreqs[0]):
                    vcfLine.info[popTag] = "."
                    alleleLists[popTag] = None
                    continue
                # NOTE: the sort is stable, so alleles with equal frequency
                # keep the iteration order of the set built above.
                alleleLists[popTag] = sorted(
                    tempAlleles,
                    key=lambda a: tempFreqs[tempAlleles.index(a)],
                    reverse=(direction != 'ASC'))
                vcfLine.info[popTag] = ",".join(alleleLists[popTag])

            # now calculate based on those allele orders
            for tag, (stat, target, background, direction, hack,
                      backTag) in statsToCalculate.items():
                vcfIndices, kgpIndices = getPopIndices(target)
                if backTag == 'ALT':
                    alleles = vcfLine.alleles
                else:
                    alleles = alleleLists[backTag]
                if alleles is None:
                    if hack:
                        alleles = vcfLine.alleles
                    else:
                        vcfLine.info[tag] = "."
                        continue
                result = allStats.calculate(stat, vcfLine, vcfIndices,
                                            kgpLine, kgpIndices, alleles)
                if isinstance(result, list):
                    result = ",".join([str(r) for r in result])
                else:
                    result = str(result)
                vcfLine.info[tag] = result

            outfile.write(str(vcfLine))
    finally:
        outfile.close()