예제 #1
0
def objListUtility_mafBlockCounts(data, mb, featLen, numBins):
    """ Add one MafBlock's per-bin counts to the maf wiggle arrays in data.

    Utility function for the MafBlock instance version of
    libMafGffPlot.objListToBinnedWiggle()
    This is by far the most costly routine in the objList creation process

    data is modified in place.  The block's bin counts are always added to
    data['maf'], and for every exponent i in 2..7 they are also added to
       maf1e<i>      when the block length  >= 10**i
       mafSpl1e<i>   when mb.spl            >= 10**i
       mafCtg1e<i>   when mb.pairTotalLength >= 10**i
       mafCpl1e<i>   when mb.hpl            >= 10**i

    mb must provide refStart, refEnd, spl, pairTotalLength and hpl.
    featLen and numBins are handed unchanged to objListUtility_rangeToPos().
    Nothing is returned.
    """
    from libMafGffPlot import objListUtility_rangeToPos
    import numpy

    # NOTE(review): this evaluates to refEnd - refStart - 1; confirm against
    # the MafBlock coordinate convention that this is the intended length.
    length = mb.refEnd - (mb.refStart + 1)

    # numpy applies `arr[idx] += 1` only once per distinct index when idx
    # contains repeats, so instead count how often each bin occurs and add
    # those counts to the contiguous run of bins [plo, phi].
    pos = objListUtility_rangeToPos(mb.refStart, mb.refEnd, featLen, numBins)
    plo, phi = pos.min(), pos.max()
    pbins = numpy.bincount(pos - plo)
    window = slice(plo, phi + 1)
    data['maf'][window] += pbins

    # (key template, value compared against each power-of-ten cutoff)
    thresholded = (
        ('maf1e%d', length),
        ('mafSpl1e%d', mb.spl),
        ('mafCtg1e%d', mb.pairTotalLength),
        ('mafCpl1e%d', mb.hpl),
    )
    # range() rather than xrange() so the loop also runs on Python 3; with
    # only six values the list built on Python 2 is negligible.
    for i in range(2, 8):
        cutoff = 10 ** i
        for keyTemplate, value in thresholded:
            if value >= cutoff:
                data[keyTemplate % i][window] += pbins
예제 #2
0
def objListUtility_mafBlockCounts(data, mb, featLen, numBins):
    """ Add one MafBlock's per-bin counts to the maf wiggle arrays in data.

    Utility function for the MafBlock instance version of
    libMafGffPlot.objListToBinnedWiggle()
    This is by far the most costly routine in the objList creation process

    data is modified in place.  The block's bin counts are always added to
    data["maf"], and for every exponent i in 2..7 they are also added to
       maf1e<i>      when the block length  >= 10**i
       mafSpl1e<i>   when mb.spl            >= 10**i
       mafCtg1e<i>   when mb.pairTotalLength >= 10**i
       mafCpl1e<i>   when mb.hpl            >= 10**i

    mb must provide refStart, refEnd, spl, pairTotalLength and hpl.
    featLen and numBins are handed unchanged to objListUtility_rangeToPos().
    Nothing is returned.
    """
    from libMafGffPlot import objListUtility_rangeToPos
    import numpy

    # NOTE(review): this evaluates to refEnd - refStart - 1; confirm against
    # the MafBlock coordinate convention that this is the intended length.
    length = mb.refEnd - (mb.refStart + 1)

    # numpy applies `arr[idx] += 1` only once per distinct index when idx
    # contains repeats, so instead count how often each bin occurs and add
    # those counts to the contiguous run of bins [plo, phi].
    pos = objListUtility_rangeToPos(mb.refStart, mb.refEnd, featLen, numBins)
    plo, phi = pos.min(), pos.max()
    pbins = numpy.bincount(pos - plo)
    window = slice(plo, phi + 1)
    data["maf"][window] += pbins

    # (key template, value compared against each power-of-ten cutoff)
    thresholded = (
        ("maf1e%d", length),
        ("mafSpl1e%d", mb.spl),
        ("mafCtg1e%d", mb.pairTotalLength),
        ("mafCpl1e%d", mb.hpl),
    )
    # range() rather than xrange() so the loop also runs on Python 3; with
    # only six values the list built on Python 2 is negligible.
    for i in range(2, 8):
        cutoff = 10 ** i
        for keyTemplate, value in thresholded:
            if value >= cutoff:
                data[keyTemplate % i][window] += pbins
예제 #3
0
def objListToBinnedWiggle(objList, featLen, numBins, filename):
    """ Bin a list of GffRecord or MafBlock objects into per-bin count arrays.

    objList   list of objects; the type of objList[0] (GffRecord or MafBlock)
              selects how the whole list is processed.
    featLen   the length of the chromosome.
    numBins   number of bins; passed to the binning helpers and used as the
              length of the Gff count vectors.
    filename  only used in the out-of-bounds error message.

    Returns None when objList is None, empty, or its first element is
    neither a GffRecord nor a MafBlock.  Otherwise returns a dict:

    GffRecord list:
       xAxis            result of objListUtility_xAxis(featLen, numBins)
       <type>Count      numpy vector of length numBins, for each <type> in
                        CDS, UTR, NXE, NGE, island, tandem, repeat
       <type>Max        largest <type>Count value seen in any touched bin
    Records of any other type are skipped.  If a record's start or end is
    greater than featLen a message is written to stderr and sys.exit(1) is
    called.

    MafBlock list:
       the dict created by newMafWigDict(numBins), with xAxis added, filled
       in by the objListUtility_* helpers for every block and finally passed
       to objListUtility_normalizeCategories().
    """
    # Answer the trivial cases before paying for any imports.
    if objList is None or len(objList) < 1:
        return None

    from libMafGffPlot import GffRecord
    from libMafGffPlot import MafBlock
    from libMafGffPlot import newMafWigDict
    from libMafGffPlot import objListUtility_xAxis
    import numpy
    import sys

    if isinstance(objList[0], GffRecord):
        # the Gff return is a dict of numBins-length numpy vectors plus maxima
        data = {}
        # populate xAxis
        data['xAxis'] = objListUtility_xAxis(featLen, numBins)
        annotTypes = set(
            ['CDS', 'UTR', 'NXE', 'NGE', 'island', 'tandem', 'repeat'])
        for t in annotTypes:
            data[t + 'Count'] = numpy.zeros(shape=(numBins))
            data[t + 'Max'] = 0

        for a in objList:
            if a.type not in annotTypes:
                continue
            # verify input
            if a.start > featLen or a.end > featLen:
                sys.stderr.write(
                    'libMafGffPlot.py: file %s has annotation on chr %s '
                    'with bounds [%d - %d] which are beyond featLen (%d)\n' %
                    (filename, a.chr, a.start, a.end, featLen))
                sys.exit(1)
            # index position in a 'numBins' length array.
            # objListUtility_rangeToPos is not imported in this function; it
            # is looked up in the enclosing module's globals.
            pos = objListUtility_rangeToPos(a.start, a.end, featLen, numBins)

            # numpy applies `arr[idx] += 1` only once per distinct index when
            # idx contains repeats, so count how often each bin occurs and
            # add those counts to the contiguous run of bins [plo, phi].
            plo, phi = pos.min(), pos.max()
            counts = data[a.type + 'Count']
            counts[plo:phi + 1] += numpy.bincount(pos - plo)

            # Track the running maximum over the bins this record touched;
            # one vectorized max replaces a Python loop over every position.
            maxKey = a.type + 'Max'
            peak = counts[pos].max()
            if data[maxKey] < peak:
                data[maxKey] = peak
        return data

    if isinstance(objList[0], MafBlock):
        # the Maf return is a dictionary with the following keys
        #   maf               all maf block bases
        #   maf1e2            maf blocks 100 or greater
        #   maf1e3            maf blocks 1,000 or greater
        #   maf1e4            maf blocks 10,000 or greater
        #   maf1e5            maf blocks 100,000 or greater
        #   maf1e6            maf blocks 1,000,000 or greater
        #   maf1e7            maf blocks 10,000,000 or greater
        #   xAxis             x Values
        #
        #   mafCpl1eX         maf contig paths of X or greater
        #
        #   mafCtg1eX         maf contigs of X or greater. taken from
        #                     totalLength field of maf.
        #
        #   mafSpl1eX         maf scaffold paths of X or greater
        #
        #   mafCpEdgeCounts   each contig path has two edges, a left and a right
        #   mafCpEdgeMax      max count
        #   mafCpErrorCounts  contig paths are made up of segments, segments
        #                     may have errors at junctions.
        #   mafCpErrorMax     max count
        #   mafSpEdgeCounts   Same as above, but for scaffold paths
        #   mafSpEdgeMax
        #   mafSpErrorCounts
        #   mafSpErrorMax
        #   blockEdgeCounts   each block has two edges, a left and a right
        #   blockEdgeMax      max count
        from libMafGffPlot import objListUtility_addContigPathEdgeErrors
        from libMafGffPlot import objListUtility_addBlockEdges
        from libMafGffPlot import objListUtility_normalizeCategories
        data = newMafWigDict(numBins)

        # populate xAxis
        data['xAxis'] = objListUtility_xAxis(featLen, numBins)
        for mb in objList:
            # do block edges
            objListUtility_addBlockEdges(data, mb, featLen, numBins)

            # do contig path edges and errors
            objListUtility_addContigPathEdgeErrors(data, mb, featLen, numBins)

            # do all of the different maf block flavors
            objListUtility_mafBlockCounts(data, mb, featLen, numBins)

        # normalize all categories
        objListUtility_normalizeCategories(data, featLen, numBins)

        return data

    # neither a GffRecord list nor a MafBlock list
    return None
예제 #4
0
def objListToBinnedWiggle(objList, featLen, numBins, filename):
    """ Bin a list of GffRecord or MafBlock objects into per-bin count arrays.

    objList   list of objects; the type of objList[0] (GffRecord or MafBlock)
              selects how the whole list is processed.
    featLen   the length of the chromosome.
    numBins   number of bins; passed to the binning helpers and used as the
              length of the Gff count vectors.
    filename  only used in the out-of-bounds error message.

    Returns None when objList is None, empty, or its first element is
    neither a GffRecord nor a MafBlock.  Otherwise returns a dict:

    GffRecord list:
       xAxis            result of objListUtility_xAxis(featLen, numBins)
       <type>Count      numpy vector of length numBins, for each <type> in
                        CDS, UTR, NXE, NGE, island, tandem, repeat
       <type>Max        largest <type>Count value seen in any touched bin
    Records of any other type are skipped.  If a record's start or end is
    greater than featLen a message is written to stderr and sys.exit(1) is
    called.

    MafBlock list:
       the dict created by newMafWigDict(numBins), with xAxis added, filled
       in by the objListUtility_* helpers for every block and finally passed
       to objListUtility_normalizeCategories().
    """
    # Answer the trivial cases before paying for any imports.
    if objList is None or len(objList) < 1:
        return None

    from libMafGffPlot import GffRecord
    from libMafGffPlot import MafBlock
    from libMafGffPlot import newMafWigDict
    from libMafGffPlot import objListUtility_xAxis
    import numpy
    import sys

    if isinstance(objList[0], GffRecord):
        # the Gff return is a dict of numBins-length numpy vectors plus maxima
        data = {}
        # populate xAxis
        data["xAxis"] = objListUtility_xAxis(featLen, numBins)
        annotTypes = set(["CDS", "UTR", "NXE", "NGE", "island", "tandem", "repeat"])
        for t in annotTypes:
            data[t + "Count"] = numpy.zeros(shape=(numBins))
            data[t + "Max"] = 0

        for a in objList:
            if a.type not in annotTypes:
                continue
            # verify input
            if a.start > featLen or a.end > featLen:
                sys.stderr.write(
                    "libMafGffPlot.py: file %s has annotation on chr %s "
                    "with bounds [%d - %d] which are beyond featLen (%d)\n" % (filename, a.chr, a.start, a.end, featLen)
                )
                sys.exit(1)
            # index position in a 'numBins' length array.
            # objListUtility_rangeToPos is not imported in this function; it
            # is looked up in the enclosing module's globals.
            pos = objListUtility_rangeToPos(a.start, a.end, featLen, numBins)

            # numpy applies `arr[idx] += 1` only once per distinct index when
            # idx contains repeats, so count how often each bin occurs and
            # add those counts to the contiguous run of bins [plo, phi].
            plo, phi = pos.min(), pos.max()
            counts = data[a.type + "Count"]
            counts[plo : phi + 1] += numpy.bincount(pos - plo)

            # Track the running maximum over the bins this record touched;
            # one vectorized max replaces a Python loop over every position.
            maxKey = a.type + "Max"
            peak = counts[pos].max()
            if data[maxKey] < peak:
                data[maxKey] = peak
        return data

    if isinstance(objList[0], MafBlock):
        # the Maf return is a dictionary with the following keys
        #   maf               all maf block bases
        #   maf1e2            maf blocks 100 or greater
        #   maf1e3            maf blocks 1,000 or greater
        #   maf1e4            maf blocks 10,000 or greater
        #   maf1e5            maf blocks 100,000 or greater
        #   maf1e6            maf blocks 1,000,000 or greater
        #   maf1e7            maf blocks 10,000,000 or greater
        #   xAxis             x Values
        #
        #   mafCpl1eX         maf contig paths of X or greater
        #
        #   mafCtg1eX         maf contigs of X or greater. taken from
        #                     totalLength field of maf.
        #
        #   mafSpl1eX         maf scaffold paths of X or greater
        #
        #   mafCpEdgeCounts   each contig path has two edges, a left and a right
        #   mafCpEdgeMax      max count
        #   mafCpErrorCounts  contig paths are made up of segments, segments
        #                     may have errors at junctions.
        #   mafCpErrorMax     max count
        #   mafSpEdgeCounts   Same as above, but for scaffold paths
        #   mafSpEdgeMax
        #   mafSpErrorCounts
        #   mafSpErrorMax
        #   blockEdgeCounts   each block has two edges, a left and a right
        #   blockEdgeMax      max count
        from libMafGffPlot import objListUtility_addContigPathEdgeErrors
        from libMafGffPlot import objListUtility_addBlockEdges
        from libMafGffPlot import objListUtility_normalizeCategories

        data = newMafWigDict(numBins)

        # populate xAxis
        data["xAxis"] = objListUtility_xAxis(featLen, numBins)
        for mb in objList:
            # do block edges
            objListUtility_addBlockEdges(data, mb, featLen, numBins)

            # do contig path edges and errors
            objListUtility_addContigPathEdgeErrors(data, mb, featLen, numBins)

            # do all of the different maf block flavors
            objListUtility_mafBlockCounts(data, mb, featLen, numBins)

        # normalize all categories
        objListUtility_normalizeCategories(data, featLen, numBins)

        return data

    # neither a GffRecord list nor a MafBlock list
    return None