def objListUtility_mafBlockCounts(data, mb, featLen, numBins):
    """ Utility function for the MafBlock instance version of
    libMafGffPlot.objListToBinnedWiggle()
    This is by far the most costly routine in the objList creation process

    Adds the per-bin counts of one maf block to data['maf'] and to every
    size-thresholded category (1e2 .. 1e7) the block qualifies for.

    data     dict of per-bin accumulators, modified in place. Must hold
             'maf' plus 'maf1eN', 'mafSpl1eN', 'mafCtg1eN' and 'mafCpl1eN'
             for N in 2..7.
    mb       block providing refStart, refEnd, spl, pairTotalLength and hpl.
    featLen  passed through to objListUtility_rangeToPos().
    numBins  passed through to objListUtility_rangeToPos().

    Returns None.
    """
    from libMafGffPlot import objListUtility_rangeToPos
    import numpy

    length = mb.refEnd - (mb.refStart + 1)
    # tough to follow index hack to get around the fact that numpy will not use
    # += 1 for a list of indices that contain repeats.
    pos = objListUtility_rangeToPos(mb.refStart, mb.refEnd, featLen, numBins)
    plo, phi = pos.min(), pos.max()
    pbins = numpy.bincount(pos - plo)
    binSlice = slice(plo, phi + 1)
    data['maf'][binSlice] += pbins

    # (key prefix, size compared against each power-of-ten threshold)
    categories = (
        ('maf', length),
        ('mafSpl', mb.spl),
        ('mafCtg', mb.pairTotalLength),
        ('mafCpl', mb.hpl),
    )
    # range() instead of xrange() so this runs on both Python 2 and Python 3.
    for i in range(2, 8):
        threshold = 10 ** i
        for prefix, size in categories:
            if size >= threshold:
                data['%s1e%d' % (prefix, i)][binSlice] += pbins

def objListUtility_mafBlockCounts(data, mb, featLen, numBins):
    """ Utility function for the MafBlock instance version of
    libMafGffPlot.objListToBinnedWiggle()
    This is by far the most costly routine in the objList creation process

    Adds the per-bin counts of one maf block to data["maf"] and to every
    size-thresholded category (1e2 .. 1e7) the block qualifies for.

    data     dict of per-bin accumulators, modified in place. Must hold
             "maf" plus "maf1eN", "mafSpl1eN", "mafCtg1eN" and "mafCpl1eN"
             for N in 2..7.
    mb       block providing refStart, refEnd, spl, pairTotalLength and hpl.
    featLen  passed through to objListUtility_rangeToPos().
    numBins  passed through to objListUtility_rangeToPos().

    Returns None.
    """
    from libMafGffPlot import objListUtility_rangeToPos
    import numpy

    length = mb.refEnd - (mb.refStart + 1)
    # tough to follow index hack to get around the fact that numpy will not use
    # += 1 for a list of indices that contain repeats.
    pos = objListUtility_rangeToPos(mb.refStart, mb.refEnd, featLen, numBins)
    plo, phi = pos.min(), pos.max()
    pbins = numpy.bincount(pos - plo)
    binSlice = slice(plo, phi + 1)
    data["maf"][binSlice] += pbins

    # (key prefix, size compared against each power-of-ten threshold)
    categories = (
        ("maf", length),
        ("mafSpl", mb.spl),
        ("mafCtg", mb.pairTotalLength),
        ("mafCpl", mb.hpl),
    )
    # range() instead of xrange() so this runs on both Python 2 and Python 3.
    for i in range(2, 8):
        threshold = 10 ** i
        for prefix, size in categories:
            if size >= threshold:
                data["%s1e%d" % (prefix, i)][binSlice] += pbins

def objListToBinnedWiggle(objList, featLen, numBins, filename):
    """ obj can be either a GffRecord object or a MafBlock object.
    featLen is the length of the chromosome.
    numBins is the number of bins the chromosome is divided into.
    filename is only used in the out-of-bounds error message.

    The type of objList[0] decides how the whole list is processed.

    Returns a dictionary of binned data (the keys are described in each
    branch below), or None when objList is None, empty, or its first
    element is neither a GffRecord nor a MafBlock.

    Writes to stderr and calls sys.exit(1) when a Gff annotation has a
    start or end beyond featLen.
    """
    # Checked before the imports so trivial input returns without having
    # to load libMafGffPlot.
    if objList is None or len(objList) < 1:
        return None

    from libMafGffPlot import GffRecord
    from libMafGffPlot import MafBlock
    from libMafGffPlot import newMafWigDict
    from libMafGffPlot import objListUtility_rangeToPos
    from libMafGffPlot import objListUtility_xAxis
    import numpy
    import sys

    if isinstance(objList[0], GffRecord):
        # the Gff return is a dictionary holding 'xAxis' and, for every
        # annotation type T, 'TCount' (numpy vector of numBins length)
        # and 'TMax' (largest TCount value seen in the bins touched so far)
        data = {}
        # populate xAxis
        data['xAxis'] = objListUtility_xAxis(featLen, numBins)
        annotTypes = {'CDS', 'UTR', 'NXE', 'NGE', 'island', 'tandem', 'repeat'}
        for t in annotTypes:
            data[t + 'Count'] = numpy.zeros(shape=(numBins))
            data[t + 'Max'] = 0

        for a in objList:
            if a.type not in annotTypes:
                continue
            # verify input
            if a.start > featLen or a.end > featLen:
                sys.stderr.write(
                    'libMafGffPlot.py: file %s has annotation on chr %s '
                    'with bounds [%d - %d] which are beyond featLen (%d)\n'
                    % (filename, a.chr, a.start, a.end, featLen))
                sys.exit(1)
            # index position in a 'numBins' length array.
            pos = objListUtility_rangeToPos(a.start, a.end, featLen, numBins)

            # tough to follow index hack to get around the fact that numpy will not use
            # += 1 for a list of indices that contain repeats.
            plo, phi = pos.min(), pos.max()
            pbins = numpy.bincount(pos - plo)
            counts = data[a.type + 'Count']
            counts[plo:phi + 1] += pbins

            # Largest count among exactly the bins this annotation touched;
            # a single indexed lookup replaces a Python loop over pos.
            touchedMax = counts[pos].max()
            if data[a.type + 'Max'] < touchedMax:
                data[a.type + 'Max'] = touchedMax
        return data

    elif isinstance(objList[0], MafBlock):
        # the Maf return is a dictionary with the following keys
        #   maf               all maf block bases
        #   maf1e2            maf blocks 100 or greater
        #   maf1e3            maf blocks 1,000 or greater
        #   maf1e4            maf blocks 10,000 or greater
        #   maf1e5            maf blocks 100,000 or greater
        #   maf1e6            maf blocks 1,000,000 or greater
        #   maf1e7            maf blocks 10,000,000 or greater
        #   xAxis             x Values
        #
        #   mafCpl1eX         maf contig paths of X or greater
        #   mafCtg1eX         maf contigs of X or greater. taken from
        #                     totalLength field of maf.
        #   mafSpl1eX         maf scaffold paths of X or greater
        #
        #   mafCpEdgeCounts   each contig path has two edges, a left and a right
        #   mafCpEdgeMax      max count
        #   mafCpErrorCounts  contig paths are made up of segments, segments
        #                     may have errors at junctions.
        #   mafCpErrorMax     max count
        #   mafSpEdgeCounts   Same as above, but for scaffold paths
        #   mafSpEdgeMax
        #   mafSpErrorCounts
        #   mafSpErrorMax
        #   blockEdgeCounts   each block has two edges, a left and a right
        #   blockEdgeMax      max count
        from libMafGffPlot import objListUtility_addContigPathEdgeErrors
        from libMafGffPlot import objListUtility_addBlockEdges
        from libMafGffPlot import objListUtility_normalizeCategories

        data = newMafWigDict(numBins)

        # populate xAxis
        data['xAxis'] = objListUtility_xAxis(featLen, numBins)
        for mb in objList:
            # do block edges
            objListUtility_addBlockEdges(data, mb, featLen, numBins)

            # do contig path edges and errors
            objListUtility_addContigPathEdgeErrors(data, mb, featLen, numBins)

            # do all of the different maf block flavors
            objListUtility_mafBlockCounts(data, mb, featLen, numBins)

        # normalize all categories
        objListUtility_normalizeCategories(data, featLen, numBins)
        return data

    # closing the elif isinstance() checks
    else:
        return None

def objListToBinnedWiggle(objList, featLen, numBins, filename):
    """ obj can be either a GffRecord object or a MafBlock object.
    featLen is the length of the chromosome.
    numBins is the number of bins the chromosome is divided into.
    filename is only used in the out-of-bounds error message.

    The type of objList[0] decides how the whole list is processed.

    Returns a dictionary of binned data (the keys are described in each
    branch below), or None when objList is None, empty, or its first
    element is neither a GffRecord nor a MafBlock.

    Writes to stderr and calls sys.exit(1) when a Gff annotation has a
    start or end beyond featLen.
    """
    # Checked before the imports so trivial input returns without having
    # to load libMafGffPlot.
    if objList is None or len(objList) < 1:
        return None

    from libMafGffPlot import GffRecord
    from libMafGffPlot import MafBlock
    from libMafGffPlot import newMafWigDict
    from libMafGffPlot import objListUtility_rangeToPos
    from libMafGffPlot import objListUtility_xAxis
    import numpy
    import sys

    if isinstance(objList[0], GffRecord):
        # the Gff return is a dictionary holding "xAxis" and, for every
        # annotation type T, "TCount" (numpy vector of numBins length)
        # and "TMax" (largest TCount value seen in the bins touched so far)
        data = {}
        # populate xAxis
        data["xAxis"] = objListUtility_xAxis(featLen, numBins)
        annotTypes = {"CDS", "UTR", "NXE", "NGE", "island", "tandem", "repeat"}
        for t in annotTypes:
            data[t + "Count"] = numpy.zeros(shape=(numBins))
            data[t + "Max"] = 0

        for a in objList:
            if a.type not in annotTypes:
                continue
            # verify input
            if a.start > featLen or a.end > featLen:
                sys.stderr.write(
                    "libMafGffPlot.py: file %s has annotation on chr %s "
                    "with bounds [%d - %d] which are beyond featLen (%d)\n"
                    % (filename, a.chr, a.start, a.end, featLen)
                )
                sys.exit(1)
            # index position in a 'numBins' length array.
            pos = objListUtility_rangeToPos(a.start, a.end, featLen, numBins)

            # tough to follow index hack to get around the fact that numpy will not use
            # += 1 for a list of indices that contain repeats.
            plo, phi = pos.min(), pos.max()
            pbins = numpy.bincount(pos - plo)
            counts = data[a.type + "Count"]
            counts[plo : phi + 1] += pbins

            # Largest count among exactly the bins this annotation touched;
            # a single indexed lookup replaces a Python loop over pos.
            touchedMax = counts[pos].max()
            if data[a.type + "Max"] < touchedMax:
                data[a.type + "Max"] = touchedMax
        return data

    elif isinstance(objList[0], MafBlock):
        # the Maf return is a dictionary with the following keys
        #   maf               all maf block bases
        #   maf1e2            maf blocks 100 or greater
        #   maf1e3            maf blocks 1,000 or greater
        #   maf1e4            maf blocks 10,000 or greater
        #   maf1e5            maf blocks 100,000 or greater
        #   maf1e6            maf blocks 1,000,000 or greater
        #   maf1e7            maf blocks 10,000,000 or greater
        #   xAxis             x Values
        #
        #   mafCpl1eX         maf contig paths of X or greater
        #   mafCtg1eX         maf contigs of X or greater. taken from
        #                     totalLength field of maf.
        #   mafSpl1eX         maf scaffold paths of X or greater
        #
        #   mafCpEdgeCounts   each contig path has two edges, a left and a right
        #   mafCpEdgeMax      max count
        #   mafCpErrorCounts  contig paths are made up of segments, segments
        #                     may have errors at junctions.
        #   mafCpErrorMax     max count
        #   mafSpEdgeCounts   Same as above, but for scaffold paths
        #   mafSpEdgeMax
        #   mafSpErrorCounts
        #   mafSpErrorMax
        #   blockEdgeCounts   each block has two edges, a left and a right
        #   blockEdgeMax      max count
        from libMafGffPlot import objListUtility_addContigPathEdgeErrors
        from libMafGffPlot import objListUtility_addBlockEdges
        from libMafGffPlot import objListUtility_normalizeCategories

        data = newMafWigDict(numBins)

        # populate xAxis
        data["xAxis"] = objListUtility_xAxis(featLen, numBins)
        for mb in objList:
            # do block edges
            objListUtility_addBlockEdges(data, mb, featLen, numBins)

            # do contig path edges and errors
            objListUtility_addContigPathEdgeErrors(data, mb, featLen, numBins)

            # do all of the different maf block flavors
            objListUtility_mafBlockCounts(data, mb, featLen, numBins)

        # normalize all categories
        objListUtility_normalizeCategories(data, featLen, numBins)
        return data

    # closing the elif isinstance() checks
    else:
        return None