def run(args):
    """Write one population's alleles, or allele frequencies, as tab-separated text.

    Every line yielded by ``kgp.iterate()`` becomes one output row starting
    with chromosome, position and name.  What follows depends on the mode:

    * per-individual mode (default): two columns per member of
      ``kgp.populations[args.pop]`` holding that member's two alleles, with
      ``'.'`` standing in for a missing (``None``) call;
    * frequency mode: one ``<allele>:`` / ``<fraction>`` column pair per
      allele counted among the members' non-missing calls.

    The header row (``CHROM POS ID`` plus, in per-individual mode, a
    ``<member>_1`` / ``<member>_2`` pair per member) is emitted when the
    first line arrives, so nothing at all is written if ``kgp.iterate()``
    yields no lines.

    Args:
        args: object providing
            ``data`` - passed to ``kgpInterface``;
            ``outfile`` - path of the file to write;
            ``frequencies_only`` - string; any value starting with ``t``
            (case-insensitive) selects frequency mode;
            ``pop`` - key into ``kgp.populations``.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    freqOnly = args.frequencies_only.lower().startswith('t')
    wroteHeader = False

    # The context manager closes the output file even if a line fails to
    # parse part-way through the run.
    with open(args.outfile, 'wb') as outfile:
        for line in kgp.iterate():
            if not wroteHeader:
                # NOTE(review): the header is written inside the loop,
                # presumably because kgp.populations is only usable once
                # iteration has started - confirm before hoisting it.
                outfile.write('CHROM\tPOS\tID')
                if not freqOnly:
                    for p in kgp.populations[args.pop]:
                        outfile.write('\t%s_1\t%s_2' % (p, p))
                outfile.write('\n')
                wroteHeader = True

            line.extractChrAndPos()
            line.extractAlleles()
            line.extractGenotypes()
            outfile.write('\t'.join(
                [line.chromosome, str(line.position), line.name]))

            if freqOnly:
                counts = countingDict()
                total = 0.0
                for p in kgp.populations[args.pop]:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    # Only the first two entries of a genotype are counted;
                    # missing calls contribute to neither count nor total.
                    for allele in (genotype[0], genotype[1]):
                        if allele is not None:
                            counts[allele] += 1
                            total += 1.0
                for i, c in counts.iteritems():
                    outfile.write('\t%s:\t%f' % (line.alleles[i], c / total))
            else:
                for p in kgp.populations[args.pop]:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    a1 = '.' if genotype[0] is None else line.alleles[genotype[0]]
                    a2 = '.' if genotype[1] is None else line.alleles[genotype[1]]
                    outfile.write('\t%s\t%s' % (a1, a2))
            outfile.write('\n')
def run(args):
    """Write one population's alleles, or allele frequencies, as tab-separated text.

    Every line yielded by ``kgp.iterate()`` becomes one output row starting
    with chromosome, position and name.  What follows depends on the mode:

    * per-individual mode (default): two columns per member of
      ``kgp.populations[args.pop]`` holding that member's two alleles, with
      ``'.'`` standing in for a missing (``None``) call;
    * frequency mode: one ``<allele>:`` / ``<fraction>`` column pair per
      allele counted among the members' non-missing calls.

    The header row (``CHROM POS ID`` plus, in per-individual mode, a
    ``<member>_1`` / ``<member>_2`` pair per member) is emitted when the
    first line arrives, so nothing at all is written if ``kgp.iterate()``
    yields no lines.

    Args:
        args: object providing
            ``data`` - passed to ``kgpInterface``;
            ``outfile`` - path of the file to write;
            ``frequencies_only`` - string; any value starting with ``t``
            (case-insensitive) selects frequency mode;
            ``pop`` - key into ``kgp.populations``.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    freqOnly = args.frequencies_only.lower().startswith('t')
    wroteHeader = False

    # The context manager closes the output file even if a line fails to
    # parse part-way through the run.
    with open(args.outfile, 'wb') as outfile:
        for line in kgp.iterate():
            if not wroteHeader:
                # NOTE(review): the header is written inside the loop,
                # presumably because kgp.populations is only usable once
                # iteration has started - confirm before hoisting it.
                outfile.write('CHROM\tPOS\tID')
                if not freqOnly:
                    for p in kgp.populations[args.pop]:
                        outfile.write('\t%s_1\t%s_2' % (p, p))
                outfile.write('\n')
                wroteHeader = True

            line.extractChrAndPos()
            line.extractAlleles()
            line.extractGenotypes()
            outfile.write('\t'.join(
                [line.chromosome, str(line.position), line.name]))

            if freqOnly:
                counts = countingDict()
                total = 0.0
                for p in kgp.populations[args.pop]:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    # Only the first two entries of a genotype are counted;
                    # missing calls contribute to neither count nor total.
                    for allele in (genotype[0], genotype[1]):
                        if allele is not None:
                            counts[allele] += 1
                            total += 1.0
                for i, c in counts.iteritems():
                    outfile.write('\t%s:\t%f' % (line.alleles[i], c / total))
            else:
                for p in kgp.populations[args.pop]:
                    genotype = line.genotypes[kgp.individualIndices[p]]
                    a1 = '.' if genotype[0] is None else line.alleles[genotype[0]]
                    a2 = '.' if genotype[1] is None else line.alleles[genotype[1]]
                    outfile.write('\t%s\t%s' % (a1, a2))
            outfile.write('\n')
def run(args, tickFunction=tick, numTicks=100):
    """Annotate a .vcf file with per-population statistics as INFO fields.

    The output handle and header details come from ``parseVcfHeader``.  One
    ``##INFO`` header line is written per requested statistic and per allele
    ordering, followed by the original header line.  Then, for every
    ``(vcfLine, kgpLine)`` pair from ``kgp.iterateVcf``, the requested
    values are stored in ``vcfLine.info`` and the line is written out.

    Each requested calculation is a sequence
    ``[target, background, direction, hack]`` where only ``target`` is
    required:

    * ``background`` - population whose allele frequencies define the allele
      order; when omitted the line's own allele order is used (tag suffix
      ``ALT``);
    * ``direction`` - ``'ASC'`` (default) for ascending frequency; any other
      value is treated as descending;
    * ``hack`` - a string starting with ``t`` (case-insensitive) makes the
      statistic fall back to the line's own allele order when no ordering
      could be computed for the background population; otherwise ``"."`` is
      written.

    Args:
        args: object providing ``data``, ``infile``, ``outfile``,
            ``popFile`` and the optional request lists ``calculate_AF``,
            ``calculate_Carriage`` and ``calculate_Samples_w_calls``
            (each ``None`` or a list of calculations as described above).
        tickFunction: forwarded unchanged to ``kgp.iterateVcf``.
        numTicks: forwarded unchanged to ``kgp.iterateVcf``.

    Raises:
        Exception: if a calculation names no target population, or a custom
            population lists a sample name not found in ``kgp.header[9:]``.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    outfile, takenTags, headerline, myPopulations, myPopulationIndices = parseVcfHeader(
        args.infile, args.outfile, args.popFile)

    # {INFO tag: (allStats statistic, targetPop, backgroundPop, "ASC"/"DEC",
    #             REF/ALT hack: True/False, backTag)}
    statsToCalculate = {}
    # {backTag: (direction, backgroundPop)}
    alleleOrders = {}

    def firstFreeName(base, isTaken):
        """Return base, or base + '2', '3', ... - the first name not taken."""
        name = base
        dupNumber = 2
        while isTaken(name):
            name = base + str(dupNumber)
            dupNumber += 1
        return name

    def backTagTaken(name):
        """True if name clashes with an existing tag or a population name."""
        return (name in takenTags or name in myPopulations
                or name in kgp.populations)

    def storeCalcDetails(stat, calculation):
        """Register one requested calculation and return its ##INFO line."""
        if len(calculation) == 0:
            raise Exception('Must specify a target population!')
        target = calculation[0]
        background = calculation[1] if len(calculation) > 1 else None
        direction = calculation[2] if len(calculation) > 2 else 'ASC'
        hack = len(calculation) > 3 and calculation[3].strip().lower().startswith('t')
        statName = allStats.STAT_NAMES[stat]

        tag = "%s_%s_" % (target, statName)
        if background is None:
            backTag = "ALT"
            tag += backTag
        else:
            backTag = firstFreeName("%s_%s_AO" % (direction, background),
                                    backTagTaken)
            alleleOrders[backTag] = (direction, background)
            tag += backTag
            if hack:
                tag += "_rHack"

        # Resolve collisions BEFORE formatting the header line, so the ID
        # declared in the header is the same key later written to each
        # record (previously the header kept the colliding, un-renamed ID).
        tag = firstFreeName(tag, lambda name: name in takenTags)

        if background is None:
            infoLine = '##INFO=<ID=%s,Number=A,Type=Float,Description="calcStats.py: %s for the %s population">\n' % (
                tag, statName, target)
        else:
            infoLine = '##INFO=<ID=%s,Number=.,Type=Float,Description="calcStats.py: %s for the %s population, with alleles ordered by %s AF in the %s population (%s).%s">\n' % (
                tag, statName, target,
                "ascending" if direction == 'ASC' else "descending",
                background, backTag,
                " When %s has no data, the REF/ALT allele order is used." % background if hack else "")

        statsToCalculate[tag] = (stat, target, background, direction, hack, backTag)
        return infoLine

    def getPopIndices(pop):
        """Return (vcfIndices, kgpIndices) for a population name.

        For populations in myPopulationIndices, string members are looked
        up as sample names among the KGP header columns (offset by the 9
        leading columns) and any other member is used as-is as a .vcf
        index.  Other populations come from kgp.populationIndices.
        """
        if pop in myPopulationIndices:
            vcfIndices = []
            kgpIndices = []
            for i in myPopulationIndices[pop]:
                if isinstance(i, str):
                    if i not in kgp.header[9:]:
                        raise Exception(
                            "Unknown sample (not in your .vcf or the KGP): %s" % i)
                    kgpIndices.append(kgp.header.index(i) - 9)
                else:
                    vcfIndices.append(i)
        else:
            vcfIndices = []
            kgpIndices = kgp.populationIndices[pop]
        return (vcfIndices, kgpIndices)

    # From here on the output handle is in use; make sure it is closed even
    # when a request is malformed or a line fails to process.
    try:
        if args.calculate_AF is not None:
            for calculation in args.calculate_AF:
                outfile.write(storeCalcDetails(allStats.AF, calculation))
        if args.calculate_Carriage is not None:
            for calculation in args.calculate_Carriage:
                outfile.write(storeCalcDetails(allStats.Carriage, calculation))
        if args.calculate_Samples_w_calls is not None:
            for calculation in args.calculate_Samples_w_calls:
                # only a target population is accepted for this statistic
                assert len(calculation) == 1
                outfile.write(
                    storeCalcDetails(allStats.Samples_w_calls, calculation))

        for popTag, (direction, background) in alleleOrders.iteritems():
            outfile.write(
                '##INFO=<ID=%s,Number=.,Type=String,Description="calcStats.py: All observed alleles for each locus, ordered by %s AF in the %s population.">\n' % (
                    popTag,
                    "ascending" if direction == 'ASC' else "descending",
                    background))
        outfile.write(headerline)

        for vcfLine, kgpLine in kgp.iterateVcf(args.infile,
                                               tickFunction=tickFunction,
                                               numTicks=numTicks):
            # first get the allele orders we need, add them as INFO fields
            alleleLists = {}  # popTag : [] (None when no order is available)
            vcfLine.extractAlleles()
            vcfLine.extractInfo()
            if kgpLine is not None:
                kgpLine.extractAlleles()

            for popTag, (direction, background) in alleleOrders.iteritems():
                vcfIndices, kgpIndices = getPopIndices(background)
                # union of the alleles seen in either source, de-duplicated
                tempAlleles = set(vcfLine.alleles)
                if kgpLine is not None:
                    tempAlleles.update(kgpLine.alleles)
                tempAlleles = list(tempAlleles)
                tempFreqs = allStats.calculate(allStats.AF, vcfLine, vcfIndices,
                                               kgpLine, kgpIndices, tempAlleles)
                if len(tempFreqs) < 1 or math.isinf(tempFreqs[0]):
                    vcfLine.info[popTag] = "."
                    alleleLists[popTag] = None
                else:
                    # Rank positions by their frequency; tempFreqs[n] is
                    # paired with tempAlleles[n].  Same ordering as sorting
                    # the alleles by tempFreqs[tempAlleles.index(allele)],
                    # without the per-element list search.
                    ranked = sorted(range(len(tempAlleles)),
                                    key=lambda n: tempFreqs[n],
                                    reverse=(direction != 'ASC'))
                    alleleLists[popTag] = [tempAlleles[n] for n in ranked]
                    vcfLine.info[popTag] = ",".join(alleleLists[popTag])

            # now calculate based on those allele orders
            for tag, (stat, target, background, direction, hack, backTag) in statsToCalculate.iteritems():
                vcfIndices, kgpIndices = getPopIndices(target)
                if backTag == 'ALT':
                    alleles = vcfLine.alleles
                else:
                    alleles = alleleLists[backTag]
                if alleles is None:
                    if hack:
                        # no background ordering: use the line's own order
                        alleles = vcfLine.alleles
                    else:
                        vcfLine.info[tag] = "."
                        continue
                result = allStats.calculate(stat, vcfLine, vcfIndices,
                                            kgpLine, kgpIndices, alleles)
                if isinstance(result, list):
                    result = ",".join([str(r) for r in result])
                else:
                    result = str(result)
                vcfLine.info[tag] = result
            outfile.write(str(vcfLine))
    finally:
        outfile.close()
def run(args, tickFunction=tick, numTicks=100):
    """Annotate a .vcf file with per-population statistics as INFO fields.

    The output handle and header details come from ``parseVcfHeader``.  One
    ``##INFO`` header line is written per requested statistic and per allele
    ordering, followed by the original header line.  Then, for every
    ``(vcfLine, kgpLine)`` pair from ``kgp.iterateVcf``, the requested
    values are stored in ``vcfLine.info`` and the line is written out.

    Each requested calculation is a sequence
    ``[target, background, direction, hack]`` where only ``target`` is
    required:

    * ``background`` - population whose allele frequencies define the allele
      order; when omitted the line's own allele order is used (tag suffix
      ``ALT``);
    * ``direction`` - ``'ASC'`` (default) for ascending frequency; any other
      value is treated as descending;
    * ``hack`` - a string starting with ``t`` (case-insensitive) makes the
      statistic fall back to the line's own allele order when no ordering
      could be computed for the background population; otherwise ``"."`` is
      written.

    Args:
        args: object providing ``data``, ``infile``, ``outfile``,
            ``popFile`` and the optional request lists ``calculate_AF``,
            ``calculate_Carriage`` and ``calculate_Samples_w_calls``
            (each ``None`` or a list of calculations as described above).
        tickFunction: forwarded unchanged to ``kgp.iterateVcf``.
        numTicks: forwarded unchanged to ``kgp.iterateVcf``.

    Raises:
        Exception: if a calculation names no target population, or a custom
            population lists a sample name not found in ``kgp.header[9:]``.
    """
    kgp = kgpInterface(args.data, sys.path[0] + "/KGP_populations.txt")
    outfile, takenTags, headerline, myPopulations, myPopulationIndices = parseVcfHeader(
        args.infile, args.outfile, args.popFile)

    # {INFO tag: (allStats statistic, targetPop, backgroundPop, "ASC"/"DEC",
    #             REF/ALT hack: True/False, backTag)}
    statsToCalculate = {}
    # {backTag: (direction, backgroundPop)}
    alleleOrders = {}

    def firstFreeName(base, isTaken):
        """Return base, or base + '2', '3', ... - the first name not taken."""
        name = base
        dupNumber = 2
        while isTaken(name):
            name = base + str(dupNumber)
            dupNumber += 1
        return name

    def backTagTaken(name):
        """True if name clashes with an existing tag or a population name."""
        return (name in takenTags or name in myPopulations
                or name in kgp.populations)

    def storeCalcDetails(stat, calculation):
        """Register one requested calculation and return its ##INFO line."""
        if len(calculation) == 0:
            raise Exception('Must specify a target population!')
        target = calculation[0]
        background = calculation[1] if len(calculation) > 1 else None
        direction = calculation[2] if len(calculation) > 2 else 'ASC'
        hack = len(calculation) > 3 and calculation[3].strip().lower().startswith('t')
        statName = allStats.STAT_NAMES[stat]

        tag = "%s_%s_" % (target, statName)
        if background is None:
            backTag = "ALT"
            tag += backTag
        else:
            backTag = firstFreeName("%s_%s_AO" % (direction, background),
                                    backTagTaken)
            alleleOrders[backTag] = (direction, background)
            tag += backTag
            if hack:
                tag += "_rHack"

        # Resolve collisions BEFORE formatting the header line, so the ID
        # declared in the header is the same key later written to each
        # record (previously the header kept the colliding, un-renamed ID).
        tag = firstFreeName(tag, lambda name: name in takenTags)

        if background is None:
            infoLine = '##INFO=<ID=%s,Number=A,Type=Float,Description="calcStats.py: %s for the %s population">\n' % (
                tag, statName, target)
        else:
            infoLine = '##INFO=<ID=%s,Number=.,Type=Float,Description="calcStats.py: %s for the %s population, with alleles ordered by %s AF in the %s population (%s).%s">\n' % (
                tag, statName, target,
                "ascending" if direction == 'ASC' else "descending",
                background, backTag,
                " When %s has no data, the REF/ALT allele order is used." % background if hack else "")

        statsToCalculate[tag] = (stat, target, background, direction, hack, backTag)
        return infoLine

    def getPopIndices(pop):
        """Return (vcfIndices, kgpIndices) for a population name.

        For populations in myPopulationIndices, string members are looked
        up as sample names among the KGP header columns (offset by the 9
        leading columns) and any other member is used as-is as a .vcf
        index.  Other populations come from kgp.populationIndices.
        """
        if pop in myPopulationIndices:
            vcfIndices = []
            kgpIndices = []
            for i in myPopulationIndices[pop]:
                if isinstance(i, str):
                    if i not in kgp.header[9:]:
                        raise Exception(
                            "Unknown sample (not in your .vcf or the KGP): %s" % i)
                    kgpIndices.append(kgp.header.index(i) - 9)
                else:
                    vcfIndices.append(i)
        else:
            vcfIndices = []
            kgpIndices = kgp.populationIndices[pop]
        return (vcfIndices, kgpIndices)

    # From here on the output handle is in use; make sure it is closed even
    # when a request is malformed or a line fails to process.
    try:
        if args.calculate_AF is not None:
            for calculation in args.calculate_AF:
                outfile.write(storeCalcDetails(allStats.AF, calculation))
        if args.calculate_Carriage is not None:
            for calculation in args.calculate_Carriage:
                outfile.write(storeCalcDetails(allStats.Carriage, calculation))
        if args.calculate_Samples_w_calls is not None:
            for calculation in args.calculate_Samples_w_calls:
                # only a target population is accepted for this statistic
                assert len(calculation) == 1
                outfile.write(
                    storeCalcDetails(allStats.Samples_w_calls, calculation))

        for popTag, (direction, background) in alleleOrders.iteritems():
            outfile.write(
                '##INFO=<ID=%s,Number=.,Type=String,Description="calcStats.py: All observed alleles for each locus, ordered by %s AF in the %s population.">\n' % (
                    popTag,
                    "ascending" if direction == 'ASC' else "descending",
                    background))
        outfile.write(headerline)

        for vcfLine, kgpLine in kgp.iterateVcf(args.infile,
                                               tickFunction=tickFunction,
                                               numTicks=numTicks):
            # first get the allele orders we need, add them as INFO fields
            alleleLists = {}  # popTag : [] (None when no order is available)
            vcfLine.extractAlleles()
            vcfLine.extractInfo()
            if kgpLine is not None:
                kgpLine.extractAlleles()

            for popTag, (direction, background) in alleleOrders.iteritems():
                vcfIndices, kgpIndices = getPopIndices(background)
                # union of the alleles seen in either source, de-duplicated
                tempAlleles = set(vcfLine.alleles)
                if kgpLine is not None:
                    tempAlleles.update(kgpLine.alleles)
                tempAlleles = list(tempAlleles)
                tempFreqs = allStats.calculate(allStats.AF, vcfLine, vcfIndices,
                                               kgpLine, kgpIndices, tempAlleles)
                if len(tempFreqs) < 1 or math.isinf(tempFreqs[0]):
                    vcfLine.info[popTag] = "."
                    alleleLists[popTag] = None
                else:
                    # Rank positions by their frequency; tempFreqs[n] is
                    # paired with tempAlleles[n].  Same ordering as sorting
                    # the alleles by tempFreqs[tempAlleles.index(allele)],
                    # without the per-element list search.
                    ranked = sorted(range(len(tempAlleles)),
                                    key=lambda n: tempFreqs[n],
                                    reverse=(direction != 'ASC'))
                    alleleLists[popTag] = [tempAlleles[n] for n in ranked]
                    vcfLine.info[popTag] = ",".join(alleleLists[popTag])

            # now calculate based on those allele orders
            for tag, (stat, target, background, direction, hack, backTag) in statsToCalculate.iteritems():
                vcfIndices, kgpIndices = getPopIndices(target)
                if backTag == 'ALT':
                    alleles = vcfLine.alleles
                else:
                    alleles = alleleLists[backTag]
                if alleles is None:
                    if hack:
                        # no background ordering: use the line's own order
                        alleles = vcfLine.alleles
                    else:
                        vcfLine.info[tag] = "."
                        continue
                result = allStats.calculate(stat, vcfLine, vcfIndices,
                                            kgpLine, kgpIndices, alleles)
                if isinstance(result, list):
                    result = ",".join([str(r) for r in result])
                else:
                    result = str(result)
                vcfLine.info[tag] = result
            outfile.write(str(vcfLine))
    finally:
        outfile.close()