def makeVCFline(scaffold, position, GTdict, names, refDict=None, genoFormat=None):
    """Return one tab-delimited VCF-style data line for a single site.

    Args:
        scaffold: Scaffold/chromosome name, written as the first column.
        position: Site position; written via ``str()`` and, when ``refDict``
            is given, used as a 1-based index into the reference sequence.
        GTdict: Genotypes for the site, passed to ``genomics.GenomeSite`` as
            ``genoDict``.
        names: Sample names, passed to ``genomics.GenomeSite`` as
            ``sampleNames``.
        refDict: Optional mapping of scaffold name to an indexable reference
            sequence. When truthy, the reference base is looked up and used
            as REF.
        genoFormat: Genotype format forwarded to ``genomics.GenomeSite``.

    Returns:
        A string with the columns scaffold, position, ".", REF, ALT
        (comma-joined, or "." if there is none), ".", ".", ".", "GT",
        followed by the coded genotypes, all joined by tabs.
    """
    siteObj = genomics.GenomeSite(genoDict=GTdict,
                                  sampleNames=names,
                                  genoFormat=genoFormat)

    # Alleles are requested with byFreq=True (presumably most frequent first
    # -- confirm against genomics.GenomeSite.alleles).
    orderedAlleles = siteObj.alleles(byFreq=True)
    if orderedAlleles == []:
        # No alleles reported for the site: use "N" as a placeholder.
        orderedAlleles = ["N"]

    if not refDict:
        # Without a reference, the first allele in the list serves as REF.
        refBase = orderedAlleles[0]
    else:
        refBase = refDict[scaffold][int(position) - 1]
        # The reference base always goes first, whether or not it was
        # among the site's alleles; avoid listing it twice.
        if refBase in orderedAlleles:
            orderedAlleles.remove(refBase)
        orderedAlleles = [refBase] + orderedAlleles

    altAlleles = orderedAlleles[1:]
    if altAlleles == []:
        altAlleles = ["."]

    # Genotypes are coded against the final allele order (REF first).
    genotypeFields = siteObj.asList(mode="coded", alleles=orderedAlleles)
    fixedFields = [
        scaffold,
        str(position),
        ".",
        refBase,
        ",".join(altAlleles),
        ".",
        ".",
        ".",
        "GT",
    ]
    return "\t".join(fixedFields + genotypeFields)
def getPopIndBaseCounts(siteData, genoFormat, allSamples, popDict, ploidyDict):
    """Return per-population arrays of per-individual base counts for a site.

    Args:
        siteData: Mapping with a "GTs" entry that maps sample name to
            genotype.
        genoFormat: Genotype format forwarded to ``genomics.GenomeSite``.
            The previous implementation ignored this argument and read the
            module-level ``args.genoFormat`` instead.
        allSamples: Sample names, in the order genotypes are extracted.
        popDict: Mapping of population name to the sample names it contains.
        ploidyDict: Ploidy information forwarded to ``genomics.GenomeSite``.

    Returns:
        A dict mapping each population name in ``popNames`` to a numpy array
        built from ``asBaseCounts()`` of each individual in that population.
    """
    site = genomics.GenomeSite(
        genotypes=[siteData["GTs"][name] for name in allSamples],
        sampleNames=allSamples,
        popDict=popDict,
        ploidyDict=ploidyDict,
        genoFormat=genoFormat)

    # NOTE(review): popNames is a module-level name rather than a parameter;
    # kept as-is because it may be a deliberate subset/ordering of popDict's
    # keys -- confirm, and consider passing it in explicitly.
    return {
        popName: np.array([
            site.genotypes[indName].asBaseCounts()
            for indName in popDict[popName]
        ])
        for popName in popNames
    }
def analysisWrapper(inQueue, outQueue, inputGenoFormat, outputGenoFormat, headers, include, exclude, samples, minCalls, minPopCalls, minAlleles, maxAlleles, minVarCount, maxHet, minFreq, maxFreq, HWE_P, HWE_side, popDict, ploidyDict, fixed, nearlyFixedDiff, forcePloidy, thinDist, noTest):
    """Worker loop: filter pods of genotype lines and pass on those that survive.

    Repeatedly takes ``(podNumber, inPod)`` from ``inQueue``, where ``inPod``
    is an iterable of ``(lineNumber, line)`` pairs, and puts
    ``(podNumber, outPod)`` on ``outQueue``, where ``outPod`` lists
    ``(lineNumber, outLine)`` for every retained site. The loop never
    returns.

    Args:
        inQueue, outQueue: Queues used to receive and emit pods.
        inputGenoFormat: Genotype format of the incoming lines.
        outputGenoFormat: ``mode`` passed to ``site.asList`` for output.
        headers: Column names of the input; used to locate sample columns.
        include, exclude: Optional containers of scaffold names (first
            column) to keep / drop.
        samples: Sample names to analyse and output.
        minCalls, minPopCalls, minAlleles, maxAlleles, minVarCount, maxHet,
        minFreq, maxFreq, HWE_P, HWE_side, fixed, nearlyFixedDiff:
            Thresholds forwarded unchanged to ``genomics.siteTest``.
        popDict, ploidyDict, forcePloidy: Forwarded to
            ``genomics.GenomeSite``.
        thinDist: If truthy, minimum distance between retained sites on the
            same scaffold.
        noTest: If truthy, skip ``genomics.siteTest`` entirely.

    Progress messages go to stderr when the module-level ``verbose`` is
    truthy. They are written with ``sys.stderr.write`` so that the function
    runs on both Python 2 and Python 3 (the former ``print >> sys.stderr``
    form raises ``TypeError`` on Python 3).
    """
    sampleIndices = [headers.index(s) for s in samples]

    while True:
        podNumber, inPod = inQueue.get()
        if verbose:
            sys.stderr.write(
                "Pod {} received for analysis.\n".format(podNumber))

        outPod = []
        # Thinning state is reset for every pod.
        lastScaf = None

        for lineNumber, line in inPod:
            objects = line.split()
            scaffold = objects[0]
            if (include and scaffold not in include) or (exclude and scaffold in exclude):
                continue

            site = genomics.GenomeSite(
                genotypes=[objects[i] for i in sampleIndices],
                sampleNames=samples,
                popDict=popDict,
                ploidyDict=ploidyDict,
                genoFormat=inputGenoFormat,
                forcePloidy=forcePloidy)

            goodSite = True

            if thinDist:
                pos = int(objects[1])
                if lastScaf != scaffold:
                    # First site seen on a scaffold (within this pod) only
                    # seeds the thinning position and is itself rejected.
                    # NOTE(review): confirm that dropping it is intended.
                    lastPos = pos
                    lastScaf = scaffold
                    goodSite = False
                elif pos - lastPos < thinDist:
                    goodSite = False

            if goodSite and not noTest:
                goodSite = genomics.siteTest(
                    site,
                    samples=samples,
                    minCalls=minCalls,
                    minPopCalls=minPopCalls,
                    minAlleles=minAlleles,
                    maxAlleles=maxAlleles,
                    minVarCount=minVarCount,
                    maxHet=maxHet,
                    minFreq=minFreq,
                    maxFreq=maxFreq,
                    HWE_P=HWE_P,
                    HWE_side=HWE_side,
                    fixed=fixed,
                    nearlyFixedDiff=nearlyFixedDiff)

            if goodSite:
                genotypes = [
                    str(g) for g in site.asList(samples, mode=outputGenoFormat)
                ]
                outLine = "\t".join(objects[:2] + genotypes) + "\n"
                outPod.append((lineNumber, outLine))
                if thinDist:
                    # Only retained sites advance the thinning position.
                    lastPos = pos

        outQueue.put((podNumber, outPod))
        if verbose:
            sys.stderr.write(
                "Pod {} analysed, sent to sorter.\n".format(podNumber))
continue #if there are intervals, check whether the site matches any if intervalsFile: siteIntervals = whichInterval(siteData["scaffold"], siteData["position"], scafIntervals, intervalPosDict) else: siteIntervals = [0] if not siteIntervals: continue site = genomics.GenomeSite( genotypes=[siteData["GTs"][name] for name in allSamples], sampleNames=allSamples, popDict=popDict, ploidyDict=ploidyDict, genoFormat=args.genoFormat) popIndBaseCounts = dict([( popName, np.array([ site.genotypes[indName].asBaseCounts() for indName in popDict[popName] ]), ) for popName in popNames]) # get population basec counts, and do the subsampling if necessary # This is currently conservative. If any one of the populations lacks sufficient good genotypes it will break # in theory we could modify this part to use info for the pops it can - might be necessary when sites are limited
and objects[0] in exclude): continue #if there are intervals, check whether the site matches any if intervalsFile: siteIntervals = whichInterval(objects[0], int(objects[1]), scafIntervals, intervalPosDict) else: siteIntervals = [0] if not siteIntervals: continue site = genomics.GenomeSite(genotypes=[objects[i] for i in sampleIndices], sampleNames=allSamples, popDict=popDict, ploidyDict=ploidyDict, genoFormat=args.genoFormat) popIndBaseCounts = dict([( popName, np.array([ site.genotypes[indName].asBaseCounts() for indName in popDict[popName] ]), ) for popName in popNames]) # get population basec counts, and do the subsampling if necessary # This is currently conservative. If any one of the populations lacks sufficient good genotypes it will break # in theory we could modify this part to use info for the pops it can - might be necessary when sites are limited
linesDone = 0 scaf = None chrom = None pos = 0 if args.cumulativePos: #dict giving the last known position of each chrom from the previous scaffold chromOffset = dict(zip(chromDict.keys(), [0] * len(chromDict))) chromOffset[str(args.nullChrom)] = 0 for line in genoFile: site = genomics.parseGenoLine(line) genomeSite = genomics.GenomeSite(genotypes=site.GTs, sampleNames=allNames, genoFormat=args.genoFormat) if len(genomeSite.alleles()) == 2: counts = genomeSite.asList(mode="count", samples=samples, missing=9) genoOut.write("".join([str(c) for c in counts]) + "\n") if site.scaffold != scaf: #different scaffold from the last site #if using cumulative positions, change the offset for the last chrom if chrom is not None and args.cumulativePos: chromOffset[chrom] = pos #now get new scaf, chrom and pos scaf = site.scaffold try:
linesDone = 0 scaf = None chrom = None pos = 0 if args.cumulativePos: #dict giving the last known position of each chrom from the previous scaffold chromOffset = dict(zip(chromDict.keys(), [0]*len(chromDict))) chromOffset[str(args.nullChrom)] = 0 for siteData in reader.siteBySite(): genomeSite = genomics.GenomeSite(genoDict = siteData["GTs"], genoFormat=args.genoFormat) alleles = genomeSite.alleles() if len(alleles) == 2: counts = genomeSite.asList(mode="count", samples=samples, missing = 9) genoOut.write("".join([str(c) for c in counts]) + "\n") if siteData["scaffold"] != scaf: #different scaffold from the last site #if using cumulative positions, change the offset for the last chrom if chrom is not None and args.cumulativePos: chromOffset[chrom] = pos #now get new scaf, chrom and pos scaf = siteData["scaffold"] try: chrom = chromDict[scaf]
def _presenceTable(present, inGroup1):
    """Return the 2x2 table of allele presence/absence by group.

    Rows are group 1 then group 2 (the complement of ``inGroup1``); columns
    are counts of individuals with the allele present, then absent.
    """
    absent = ~present
    inGroup2 = ~inGroup1
    return np.array([[(present & inGroup1).sum(), (absent & inGroup1).sum()],
                     [(present & inGroup2).sum(), (absent & inGroup2).sum()]])


def analysisWrapper(inQueue, outQueue, inputGenoFormat, headers, include, exclude, group1inds, group2inds, permutations, permutationMaxP):
    """Worker loop: test each biallelic site for allele presence differing between two groups.

    Repeatedly takes ``(podNumber, inPod)`` from ``inQueue``, where ``inPod``
    is an iterable of ``(lineNumber, line)`` pairs, and puts
    ``(podNumber, outPod)`` on ``outQueue``. Each output line holds the first
    two input columns followed by the result values rounded to 5 decimals.
    The loop never returns.

    For sites with exactly two alleles, Fisher's exact test is run on the
    presence/absence table of each allele and the smaller p-value is
    reported. If ``permutations >= 1`` an empirical p-value is appended: it
    is computed by permuting group membership when ``permutationMaxP`` is
    None or the Fisher p-value does not exceed it, and is NaN otherwise.
    Sites without exactly two alleles yield NaN for every reported value.

    Args:
        inQueue, outQueue: Queues used to receive and emit pods.
        inputGenoFormat: Genotype format of the incoming lines.
        headers: Column names of the input; used to locate sample columns.
        include, exclude: Optional containers of scaffold names (first
            column) to keep / drop.
        group1inds, group2inds: Sample names of the two groups.
        permutations: Number of permutations for the empirical p-value.
        permutationMaxP: Optional Fisher p-value ceiling above which the
            permutation step is skipped.

    Progress messages go to stderr when the module-level ``verbose`` is
    truthy.
    """
    samples = group1inds + group2inds
    sampleIndices = [headers.index(s) for s in samples]
    group1 = np.array([True] * len(group1inds) + [False] * len(group2inds))

    while True:
        podNumber, inPod = inQueue.get()
        if verbose:
            sys.stderr.write(
                "Pod {} received for analysis...\n".format(podNumber))

        outPod = []
        for lineNumber, line in inPod:
            objects = line.split()
            if (include and objects[0] not in include) or (exclude and objects[0] in exclude):
                continue

            site = genomics.GenomeSite(
                genotypes=[objects[i] for i in sampleIndices],
                sampleNames=samples,
                genoFormat=inputGenoFormat)
            alleles = site.alleles()

            if len(alleles) == 2:
                minorCount = np.array(
                    site.asList(mode="count", countAllele=alleles[1], missing=-1))
                majorCount = np.array(
                    site.asList(mode="count", countAllele=alleles[0], missing=-1))

                # Keep only individuals with a non-missing genotype
                # (missing genotypes are coded as -1).
                idx = np.where(minorCount >= 0)[0]
                inGroup1 = group1[idx]
                minorPresent = minorCount[idx] >= 1
                majorPresent = majorCount[idx] >= 1

                minorTable = _presenceTable(minorPresent, inGroup1)
                majorTable = _presenceTable(majorPresent, inGroup1)

                p_values = (
                    fisher_exact(minorTable)[1],
                    fisher_exact(majorTable)[1],
                )
                result = [min(p_values)]

                if permutations >= 1:
                    if permutationMaxP is None or result[0] <= permutationMaxP:
                        # Permute the same allele whose table gave the
                        # reported p-value, so that the observed statistic
                        # and its null distribution describe the same trait.
                        # (Previously the null was always built from the
                        # minor allele, even when the major table was used.)
                        if p_values[0] <= p_values[1]:
                            table, present = minorTable, minorPresent
                        else:
                            table, present = majorTable, majorPresent

                        # Permuting group labels leaves the total unchanged.
                        total = table.sum()
                        # NOTE(review): chisquare() compares against uniform
                        # expected counts, not the independence expectation
                        # of a contingency test -- confirm this is intended.
                        phi = chisquare(table, axis=None)[0] / total

                        nAsExtreme = 0
                        for _ in range(permutations):
                            permTable = _presenceTable(
                                present, np.random.permutation(inGroup1))
                            if chisquare(permTable, axis=None)[0] / total >= phi:
                                nAsExtreme += 1
                        p_emp = (nAsExtreme + 1.) / (permutations + 1.)
                    else:
                        p_emp = np.nan
                    result.append(p_emp)
            elif permutations >= 1:
                result = [np.nan] * 2
            else:
                result = [np.nan]

            outLine = "\t".join(objects[:2] + [str(round(x, 5)) for x in result]) + "\n"
            outPod.append((lineNumber, outLine))

        outQueue.put((podNumber, outPod))
        if verbose:
            sys.stderr.write(
                "Pod {} analysed, sent to sorter.\n".format(podNumber))