Пример #1
0
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Initialise every attribute to an empty default; no data is loaded.

        dbFileName  -- db containing all the data we'd like to use
        force       -- kept as forceWriting: overwrite existing values silently?
        scaleFactor -- kept as scaleFactor: everything in the transformed
                       data is scaled to this dimension
        """
        # data: most of it is saved to hdf and reached through the manager
        self.dataManager = GMDataManager()
        self.dbFileName = dbFileName
        # the condition will be supplied at loading time
        self.condition = ""

        # --> NOTE: ALL of the arrays in this section are in sync
        # --> each one holds information for an individual contig
        emptyArrayAttrs = (
            "indices",           # indices into the data structure based on condition
            "covProfiles",       # coverage based coordinates
            "transformedCP",     # the munged data points
            "corners",           # the corners of the transformed space
            "averageCoverages",  # average coverage across all stoits
            "normCoverages",     # norm of the raw coverage vectors
            "kmerSigs",          # raw kmer signatures
            "kmerNormPC1",       # first PC of kmer sigs normalized to [0, 1]
            "kmerPCs",           # PCs of kmer sigs capturing specified variance
            "kmerVarPC",         # variance of each PC
            "stoitColNames",
            "contigNames",
            "contigLengths",
            "contigGCs",
            "binIds",            # list of bin IDs
        )
        for attrName in emptyArrayAttrs:
            # one fresh empty array per attribute; nothing is shared
            setattr(self, attrName, np_array([]))
        self.TCentre = 0.0      # the centre of the coverage space
        self.transRadius = 0.0  # distance from corner to centre of transformed space
        self.colorMapGC = None
        # --> end section

        # meta (plus the contig links); each starts as its own empty dict
        emptyDictAttrs = (
            "validBinIds",           # valid bin ids -> numMembers
            "isLikelyChimeric",      # indicates if a bin is likely to be chimeric
            "binnedRowIndices",      # those indices which belong to some bin
            "restrictedRowIndices",  # those indices which can not be binned yet
            "links",                 # contig links
        )
        for attrName in emptyDictAttrs:
            setattr(self, attrName, dict())
        self.numContigs = 0  # this depends on the condition given
        self.numStoits = 0   # this depends on the data which was parsed

        # misc
        self.forceWriting = force
        self.scaleFactor = scaleFactor
Пример #2
0
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Create the instance with empty containers; nothing is loaded here.

        dbFileName  -- db containing all the data we'd like to use
        force       -- stored as forceWriting: overwrite existing values silently?
        scaleFactor -- stored as scaleFactor: everything in the transformed
                       data is scaled to this dimension
        """
        # data: most of it is saved to hdf and reached through the manager
        self.dataManager = GMDataManager()
        self.dbFileName = dbFileName
        # the condition will be supplied at loading time
        self.condition = ""

        # --> NOTE: ALL of the arrays in this section are in sync
        # --> each one holds information for an individual contig
        emptyArrayAttrs = (
            "indices",           # indices into the data structure based on condition
            "covProfiles",       # coverage based coordinates
            "transformedCP",     # the munged data points
            "averageCoverages",  # average coverage across all stoits
            "kmerSigs",          # raw kmer signatures
            "kmerVals",          # PCA'd kmer sigs
            "contigNames",
            "contigLengths",
            "contigColours",     # calculated from kmerVals
            "binIds",            # list of bin IDs
        )
        for attrName in emptyArrayAttrs:
            # one fresh empty array per attribute; nothing is shared
            setattr(self, attrName, np_array([]))
        # --> end section

        # meta (plus the contig links); each starts as its own empty dict.
        # The "Indicies" spelling is the attribute name used by this class.
        emptyDictAttrs = (
            "validBinIds",            # valid bin ids -> numMembers
            "binnedRowIndicies",      # those indices which belong to some bin
            "restrictedRowIndicies",  # those indices which can not be binned yet
            "links",                  # contig links
        )
        for attrName in emptyDictAttrs:
            setattr(self, attrName, dict())
        self.numContigs = 0  # this depends on the condition given
        self.numStoits = 0   # this depends on the data which was parsed

        # misc
        self.forceWriting = force
        self.scaleFactor = scaleFactor
Пример #3
0
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Give every attribute its empty starting value; no data is read.

        dbFileName  -- db containing all the data we'd like to use
        force       -- saved as forceWriting: overwrite existing values silently?
        scaleFactor -- saved as scaleFactor: everything in the transformed
                       data is scaled to this dimension
        """
        # data: most of it is saved to hdf and reached through the manager
        self.dataManager = GMDataManager()
        self.dbFileName = dbFileName
        # the condition will be supplied at loading time
        self.condition = ""

        # --> NOTE: ALL of the arrays in this section are in sync
        # --> each one holds information for an individual contig
        arrayFields = (
            "indices",           # indices into the data structure based on condition
            "covProfiles",       # coverage based coordinates
            "transformedCP",     # the munged data points
            "corners",           # the corners of the transformed space
            "averageCoverages",  # average coverage across all stoits
            "normCoverages",     # norm of the raw coverage vectors
            "kmerSigs",          # raw kmer signatures
            "kmerNormPC1",       # first PC of kmer sigs normalized to [0, 1]
            "kmerPCs",           # PCs of kmer sigs capturing specified variance
            "kmerVarPC",         # variance of each PC
            "stoitColNames",
            "contigNames",
            "contigLengths",
            "contigGCs",
            "binIds",            # list of bin IDs
        )
        for fieldName in arrayFields:
            # each attribute receives its own new empty array
            setattr(self, fieldName, np_array([]))
        self.TCentre = 0.0      # the centre of the coverage space
        self.transRadius = 0.0  # distance from corner to centre of transformed space
        self.colorMapGC = None
        # --> end section

        # meta (plus the contig links); each starts as its own empty dict
        dictFields = (
            "validBinIds",           # valid bin ids -> numMembers
            "isLikelyChimeric",      # indicates if a bin is likely to be chimeric
            "binnedRowIndices",      # those indices which belong to some bin
            "restrictedRowIndices",  # those indices which can not be binned yet
            "links",                 # contig links
        )
        for fieldName in dictFields:
            setattr(self, fieldName, dict())
        self.numContigs = 0  # this depends on the condition given
        self.numStoits = 0   # this depends on the data which was parsed

        # misc
        self.forceWriting = force
        self.scaleFactor = scaleFactor
Пример #4
0
    def parseOptions(self, options ):
        """Run the sub-command selected by options.subparser_name.

        A single TimeKeeper is created up front and handed to whichever
        engine the chosen branch constructs. Recognised sub-command names are:
        parse, core, refine, recruit, extract, merge, split, delete, plot,
        explore, flyover, highlight, print and dump.

        options -- object exposing subparser_name plus the attributes read by
                   the selected branch (e.g. dbname, cutoff, bids, mode, ...)

        Returns 0 once the if/elif chain has been left normally; this includes
        an unrecognised subparser_name and an unknown 'explore' mode (which
        only prints an error). Returns None on the two early exits: fewer than
        3 bam files in 'parse' and an unrecognised field in 'dump'.

        Raises ExtractModeNotAppropriateException when the 'extract' mode is
        neither 'contigs' nor 'reads'.
        """
        timer = gtime.TimeKeeper()
        if(options.subparser_name == 'parse'):
            # parse raw input
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in data parsing mode..." % self.GMVersion
            print "*******************************************************************************"
            # check this here:
            if len(options.bamfiles) < 3:
                print "Sorry, You must supply at least 3 bamFiles to use GroopM. (You supplied %d)\n Exiting..." % len(options.bamfiles)
                # NOTE: bare return, so the caller gets None here rather than 0
                return
            GMdata = mstore.GMDataManager()
            success = GMdata.createDB(options.bamfiles,
                                      options.reference,
                                      options.dbname,
                                      options.cutoff,
                                      timer,
                                      force=options.force,
                                      threads=options.threads)
            if not success:
                # failure is only reported; execution still reaches 'return 0'
                print options.dbname,"not updated"

        elif(options.subparser_name == 'core'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in core creation mode..." % self.GMVersion
            print "*******************************************************************************"
            CE = cluster.ClusterEngine(options.dbname,
                                       timer,
                                       force=options.force,
                                       finalPlot=options.plot,
                                       plot=options.multiplot,
                                       minSize=options.size,
                                       minVol=options.bp)
            # makeCores is given an empty string when no graph file was supplied
            if options.graphfile is None:
                gf = ""
            else:
                gf=options.graphfile
            CE.makeCores(coreCut=options.cutoff,
                         gf=gf)

        elif(options.subparser_name == 'refine'):
            # refine bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in core refining mode..." % self.GMVersion
            print "*******************************************************************************"
            # bids is always the empty list here; the options.bids override
            # below is commented out
            bids = []
            #if options.bids is not None:
            #    bids = options.bids
            auto = options.auto
            # XOR with True inverts the flag: transform unless no_transform is set
            transform=True^options.no_transform

            RE = refine.RefineEngine(timer,
                                     dbFileName=options.dbname,
                                     transform=transform,
                                     bids=bids,
                                     loadContigNames=True)

            # plotFinal receives the prefix "REFINED" when plotting was
            # requested, otherwise an empty string
            if options.plot:
                pfx="REFINED"
            else:
                pfx=""
            print "Refine bins"

            RE.refineBins(timer,
                          auto=auto,
                          saveBins=True,
                          plotFinal=pfx)

        elif(options.subparser_name == 'recruit'):
            # bin expansion (recruitment)
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin expansion mode..." % self.GMVersion
            print "*******************************************************************************"
            RE = refine.RefineEngine(timer,
                                     dbFileName=options.dbname,
                                     getUnbinned=True,
                                     loadContigNames=False,
                                     cutOff=options.cutoff)

            RE.recruitWrapper(timer,
                              inclusivity=options.inclusivity,
                              step=options.step,
                              saveBins=True)

        elif(options.subparser_name == 'extract'):
            # Extract data
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in '%s' extraction mode..." % (self.GMVersion, options.mode)
            print "*******************************************************************************"
            # an absent bids option is passed on as an empty list
            bids = []
            if options.bids is not None:
                bids = options.bids
            BX = groopmUtils.GMExtractor(options.dbname,
                                          bids=bids,
                                          folder=options.out_folder
                                          )
            # options.data is handed over as the fasta input in 'contigs' mode
            # and as the bam input in 'reads' mode
            if(options.mode=='contigs'):
                BX.extractContigs(timer,
                                  fasta=options.data,
                                  prefix=options.prefix,
                                  cutoff=options.cutoff)

            elif(options.mode=='reads'):
                BX.extractReads(timer,
                                bams=options.data,
                                prefix=options.prefix,
                                mixBams=options.mix_bams,
                                mixGroups=options.mix_groups,
                                mixReads=options.mix_reads,
                                interleaved=options.interleave,
                                bigFile=options.no_gzip,
                                headersOnly=options.headers_only,
                                minMapQual=options.mapping_quality,
                                maxMisMatches=options.max_distance,
                                useSuppAlignments=options.use_supplementary,
                                useSecondaryAlignments=options.use_secondary,
                                verbose=options.verbose,
                                threads=options.threads)

            else:
                # unlike 'explore', an unknown mode here is a hard error
                raise ExtractModeNotAppropriateException("mode: "+ options.mode + " is unknown")
        elif(options.subparser_name == 'merge'):
            # merge bins
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin merging mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            BM.loadBins(timer, makeBins=True, silent=False)
            BM.merge(options.bids, options.force, saveBins=True)

        elif(options.subparser_name == 'split'):
            # split a bin
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin splitting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            BM.loadBins(timer, makeBins=True, silent=False)
            # the force option is what gets passed as split's 'auto' argument
            BM.split(options.bid, options.parts, mode=options.mode, saveBins=True, auto=options.force)

        elif(options.subparser_name == 'delete'):
            # delete bins
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin deleting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            # loadBins is called without a bids filter (the argument is commented out)
            BM.loadBins(timer, makeBins=True, silent=True)#, bids=options.bids)
            BM.deleteBins(options.bids, force=options.force, saveBins=True, freeBinnedRowIndices=True)

        elif(options.subparser_name == 'plot'):
            # plot bins
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin plotting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)

            # an absent bids option is passed on as an empty list
            if options.bids is None:
                bids = []
            else:
                bids = options.bids
            BM.loadBins(timer, makeBins=True, silent=False, bids=bids, loadContigNames=False)

            BM.setColorMap(options.cm)

            BM.plotBins(FNPrefix=options.tag,
                        plotEllipsoid=True,
                        ignoreContigLengths=options.points,
                        folder=options.folder)

        elif(options.subparser_name == 'explore'):
            # explore bins: options.mode selects which BinExplorer plot is run
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin '%s' explorer mode..." % (self.GMVersion, options.mode)
            print "*******************************************************************************"
            # XOR with True inverts the flag: transform unless no_transform is set
            transform=True^options.no_transform
            bids = []
            if options.bids is not None:
                bids = options.bids
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         transform=transform,
                                         cmstring=options.cm,
                                         ignoreContigLengths=options.points)
            if(options.mode == 'binpoints'):
                BE.plotPoints(timer)
            elif(options.mode == 'binids'):
                BE.plotIds(timer)
            elif(options.mode == 'allcontigs'):
                # same call as 'binnedcontigs' below, but with all=True
                BE.plotContigs(timer, coreCut=options.cutoff, all=True)
            elif(options.mode == 'unbinnedcontigs'):
                BE.plotUnbinned(timer, coreCut=options.cutoff)
            elif(options.mode == 'binnedcontigs'):
                BE.plotContigs(timer, coreCut=options.cutoff)
            elif(options.mode == 'binassignments'):
                # NOTE(review): method name is spelled 'plotBinAssignents' --
                # confirm it matches the definition on BinExplorer
                BE.plotBinAssignents(timer, coreCut=options.cutoff)
            elif(options.mode == 'compare'):
                BE.plotCompare(timer, coreCut=options.cutoff)
            elif (options.mode == 'together'):
                BE.plotTogether(timer, coreCut=options.cutoff, doMers=options.kmers)
            elif (options.mode == 'sidebyside'):
                BE.plotSideBySide(timer, coreCut=options.cutoff)
            else:
                # unknown mode is only reported; the method still returns 0
                print "**Error: unknown mode:",options.mode

        elif(options.subparser_name == 'flyover'):
            # make a flyover
            print "*******************************************************************************"
            print " [[GroopM %s]] Making a flyover..." % self.GMVersion
            print "*******************************************************************************"
            bids = []
            if options.bids is not None:
                bids = options.bids
            # transform is hard-wired to True for this sub-command
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         transform=True,
                                         ignoreContigLengths=options.points)
            BE.plotFlyOver(timer,
                           fps=options.fps,
                           totalTime=options.totalTime,
                           percentFade=options.firstFade,
                           prefix=options.prefix,
                           showColorbar=options.colorbar,
                           title=options.title,
                           coreCut=options.cutoff,
                           format=options.format)

        elif(options.subparser_name == 'highlight'):
            # highlighter mode
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in highlighter mode..." % self.GMVersion
            print "*******************************************************************************"
            bids = []
            if options.bids is not None:
                bids = options.bids
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         binLabelsFile = options.binlabels,
                                         contigColorsFile = options.contigcolors,
                                         ignoreContigLengths=options.points)
            # elevation, azimuth, file, filetype and dpi are passed positionally,
            # so their order must match plotHighlights' signature
            BE.plotHighlights(timer,
                              options.elevation,
                              options.azimuth,
                              options.file,
                              options.filetype,
                              options.dpi,
                              drawRadius=options.radius,
                              show=options.show,
                              coreCut=options.cutoff,
                              testing=options.place
                              )

        elif(options.subparser_name == 'print'):
            # print bins; this is the only sub-command without a banner
            BM = binManager.BinManager(dbFileName=options.dbname)
            bids = []
            if options.bids is not None:
                bids = options.bids
            BM.loadBins(timer, getUnbinned=options.unbinned, makeBins=True, silent=True, bids=bids)
            BM.printBins(options.format, fileName=options.outfile)

        elif(options.subparser_name == 'dump'):
            # dump data
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in data dumping mode..." % self.GMVersion
            print "*******************************************************************************"

            # prep fields. Do this first because users are most likely to
            # mess this part up!
            allowable_fields = ['names', 'mers', 'gc', 'coverage', 'tcoverage', 'ncoverage', 'lengths', 'bins', 'all']
            fields = options.fields.split(',')
            for field in fields:
                if field not in allowable_fields:
                    print "ERROR: field '%s' not recognised. Allowable fields are:" % field
                    print '\t',",".join(allowable_fields)
                    # NOTE: bare return, so the caller gets None here rather than 0
                    return
            # a literal backslash-t (two characters) is translated to a real tab
            if options.separator == '\\t':
                separator = '\t'
            else:
                separator = options.separator

            # NOTE(review): the 'parse' branch uses mstore.GMDataManager while
            # this uses the bare name -- confirm GMDataManager is imported directly
            DM = GMDataManager()
            DM.dumpData(options.dbname,
                        fields,
                        options.outfile,
                        separator,
                        not options.no_headers)

        return 0
Пример #5
0
    def parseOptions(self, options):
        timer = gtime.TimeKeeper()
        if (options.subparser_name == 'parse'):
            # parse raw input
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in data parsing mode..." % self.GMVersion
            print "*******************************************************************************"
            # check this here:
            if len(options.bamfiles) < 3:
                print "Sorry, You must supply at least 3 bamFiles to use GroopM. (You supplied %d)\n Exiting..." % len(
                    options.bamfiles)
                return
            GMdata = mstore.GMDataManager()
            success = GMdata.createDB(options.bamfiles,
                                      options.reference,
                                      options.dbname,
                                      options.cutoff,
                                      timer,
                                      force=options.force,
                                      threads=options.threads)
            if not success:
                print options.dbname, "not updated"

        elif (options.subparser_name == 'core'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in core creation mode..." % self.GMVersion
            print "*******************************************************************************"
            CE = cluster.ClusterEngine(options.dbname,
                                       timer,
                                       force=options.force,
                                       finalPlot=options.plot,
                                       plot=options.multiplot,
                                       minSize=options.size,
                                       minVol=options.bp)
            if options.graphfile is None:
                gf = ""
            else:
                gf = options.graphfile
            CE.makeCores(coreCut=options.cutoff, gf=gf)

        elif (options.subparser_name == 'refine'):
            # refine bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in core refining mode..." % self.GMVersion
            print "*******************************************************************************"
            bids = []
            #if options.bids is not None:
            #    bids = options.bids
            auto = options.auto
            transform = True ^ options.no_transform

            RE = refine.RefineEngine(timer,
                                     dbFileName=options.dbname,
                                     transform=transform,
                                     bids=bids,
                                     loadContigNames=True)

            if options.plot:
                pfx = "REFINED"
            else:
                pfx = ""
            print "Refine bins"

            RE.refineBins(timer, auto=auto, saveBins=True, plotFinal=pfx)

        elif (options.subparser_name == 'recruit'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin expansion mode..." % self.GMVersion
            print "*******************************************************************************"
            RE = refine.RefineEngine(timer,
                                     dbFileName=options.dbname,
                                     getUnbinned=True,
                                     loadContigNames=False,
                                     cutOff=options.cutoff)

            RE.recruitWrapper(timer,
                              inclusivity=options.inclusivity,
                              step=options.step,
                              saveBins=True)

        elif (options.subparser_name == 'extract'):
            # Extract data
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in '%s' extraction mode..." % (
                self.GMVersion, options.mode)
            print "*******************************************************************************"
            bids = []
            if options.bids is not None:
                bids = options.bids
            BX = groopmUtils.GMExtractor(options.dbname,
                                         bids=bids,
                                         folder=options.out_folder)
            if (options.mode == 'contigs'):
                BX.extractContigs(timer,
                                  fasta=options.data,
                                  prefix=options.prefix,
                                  cutoff=options.cutoff)

            elif (options.mode == 'reads'):
                BX.extractReads(timer,
                                bams=options.data,
                                prefix=options.prefix,
                                mixBams=options.mix_bams,
                                mixGroups=options.mix_groups,
                                mixReads=options.mix_reads,
                                interleaved=options.interleave,
                                bigFile=options.no_gzip,
                                headersOnly=options.headers_only,
                                minMapQual=options.mapping_quality,
                                maxMisMatches=options.max_distance,
                                useSuppAlignments=options.use_supplementary,
                                useSecondaryAlignments=options.use_secondary,
                                verbose=options.verbose,
                                threads=options.threads)

            else:
                raise ExtractModeNotAppropriateException("mode: " +
                                                         options.mode +
                                                         " is unknown")
        elif (options.subparser_name == 'merge'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin merging mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            BM.loadBins(timer, makeBins=True, silent=False)
            BM.merge(options.bids, options.force, saveBins=True)

        elif (options.subparser_name == 'split'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin splitting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            BM.loadBins(timer, makeBins=True, silent=False)
            BM.split(options.bid,
                     options.parts,
                     mode=options.mode,
                     saveBins=True,
                     auto=options.force)

        elif (options.subparser_name == 'delete'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin deleting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)
            BM.loadBins(timer, makeBins=True,
                        silent=True)  #, bids=options.bids)
            BM.deleteBins(options.bids,
                          force=options.force,
                          saveBins=True,
                          freeBinnedRowIndices=True)

        elif (options.subparser_name == 'plot'):
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin plotting mode..." % self.GMVersion
            print "*******************************************************************************"
            BM = binManager.BinManager(dbFileName=options.dbname)

            if options.bids is None:
                bids = []
            else:
                bids = options.bids
            BM.loadBins(timer,
                        makeBins=True,
                        silent=False,
                        bids=bids,
                        loadContigNames=False)

            BM.setColorMap(options.cm)

            BM.plotBins(FNPrefix=options.tag,
                        plotEllipsoid=True,
                        ignoreContigLengths=options.points,
                        folder=options.folder)

        elif (options.subparser_name == 'explore'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in bin '%s' explorer mode..." % (
                self.GMVersion, options.mode)
            print "*******************************************************************************"
            transform = True ^ options.no_transform
            bids = []
            if options.bids is not None:
                bids = options.bids
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         transform=transform,
                                         cmstring=options.cm,
                                         ignoreContigLengths=options.points)
            if (options.mode == 'binpoints'):
                BE.plotPoints(timer)
            elif (options.mode == 'binids'):
                BE.plotIds(timer)
            elif (options.mode == 'allcontigs'):
                BE.plotContigs(timer, coreCut=options.cutoff, all=True)
            elif (options.mode == 'unbinnedcontigs'):
                BE.plotUnbinned(timer, coreCut=options.cutoff)
            elif (options.mode == 'binnedcontigs'):
                BE.plotContigs(timer, coreCut=options.cutoff)
            elif (options.mode == 'binassignments'):
                BE.plotBinAssignents(timer, coreCut=options.cutoff)
            elif (options.mode == 'compare'):
                BE.plotCompare(timer, coreCut=options.cutoff)
            elif (options.mode == 'together'):
                BE.plotTogether(timer,
                                coreCut=options.cutoff,
                                doMers=options.kmers)
            elif (options.mode == 'sidebyside'):
                BE.plotSideBySide(timer, coreCut=options.cutoff)
            else:
                print "**Error: unknown mode:", options.mode

        elif (options.subparser_name == 'flyover'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Making a flyover..." % self.GMVersion
            print "*******************************************************************************"
            bids = []
            if options.bids is not None:
                bids = options.bids
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         transform=True,
                                         ignoreContigLengths=options.points)
            BE.plotFlyOver(timer,
                           fps=options.fps,
                           totalTime=options.totalTime,
                           percentFade=options.firstFade,
                           prefix=options.prefix,
                           showColorbar=options.colorbar,
                           title=options.title,
                           coreCut=options.cutoff,
                           format=options.format)

        elif (options.subparser_name == 'highlight'):
            # make bin cores
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in highlighter mode..." % self.GMVersion
            print "*******************************************************************************"
            bids = []
            if options.bids is not None:
                bids = options.bids
            BE = groopmUtils.BinExplorer(options.dbname,
                                         bids=bids,
                                         binLabelsFile=options.binlabels,
                                         contigColorsFile=options.contigcolors,
                                         ignoreContigLengths=options.points)
            BE.plotHighlights(timer,
                              options.elevation,
                              options.azimuth,
                              options.file,
                              options.filetype,
                              options.dpi,
                              drawRadius=options.radius,
                              show=options.show,
                              coreCut=options.cutoff,
                              testing=options.place)

        elif (options.subparser_name == 'print'):
            BM = binManager.BinManager(dbFileName=options.dbname)
            bids = []
            if options.bids is not None:
                bids = options.bids
            BM.loadBins(timer,
                        getUnbinned=options.unbinned,
                        makeBins=True,
                        silent=True,
                        bids=bids)
            BM.printBins(options.format, fileName=options.outfile)

        elif (options.subparser_name == 'dump'):
            print "*******************************************************************************"
            print " [[GroopM %s]] Running in data dumping mode..." % self.GMVersion
            print "*******************************************************************************"

            # prep fields. Do this first cause users are mot likely to
            # mess this part up!
            allowable_fields = [
                'names', 'mers', 'gc', 'coverage', 'tcoverage', 'ncoverage',
                'lengths', 'bins', 'all'
            ]
            fields = options.fields.split(',')
            for field in fields:
                if field not in allowable_fields:
                    print "ERROR: field '%s' not recognised. Allowable fields are:" % field
                    print '\t', ",".join(allowable_fields)
                    return
            if options.separator == '\\t':
                separator = '\t'
            else:
                separator = options.separator

            DM = GMDataManager()
            DM.dumpData(options.dbname, fields, options.outfile, separator,
                        not options.no_headers)

        return 0
Пример #6
0
class ProfileManager:
    """Interacts with the groopm DataManager and local data fields

    Mostly a wrapper around a group of numpy arrays and a pytables quagmire
    """
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Create an empty manager bound to one database file

        dbFileName  - db containing all the data we'd like to use
        force       - kept as forceWriting: overwrite existing values silently?
        scaleFactor - kept as scaleFactor: dimension the transformed data is
                      scaled to
        """
        # storage layer (most data is saved to hdf) and what we ask it for
        self.dataManager = GMDataManager()
        self.dbFileName = dbFileName
        # the condition is supplied at loading time
        self.condition = ""

        # ---------------------------------------------------------------
        # NOTE: ALL of the arrays in this section are in sync;
        # each one holds information for an individual contig
        # ---------------------------------------------------------------
        # indices into the data structure based on condition
        self.indices = np_array([])
        # coverage based coordinates
        self.covProfiles = np_array([])
        # the munged data points
        self.transformedCP = np_array([])
        # the corners of the transformed space
        self.corners = np_array([])
        # the centre of the coverage space
        self.TCentre = 0.
        # distance from corner to centre of transformed space
        self.transRadius = 0.
        # average coverage across all stoits
        self.averageCoverages = np_array([])
        # norm of the raw coverage vectors
        self.normCoverages = np_array([])
        # raw kmer signatures
        self.kmerSigs = np_array([])
        # first PC of kmer sigs normalized to [0, 1]
        self.kmerNormPC1 = np_array([])
        # PCs of kmer sigs capturing specified variance
        self.kmerPCs = np_array([])
        # variance of each PC
        self.kmerVarPC = np_array([])
        self.stoitColNames = np_array([])
        self.contigNames = np_array([])
        self.contigLengths = np_array([])
        self.contigGCs = np_array([])
        self.colorMapGC = None
        # list of bin IDs
        self.binIds = np_array([])
        # ---------------------------------------------------------------
        # end of the in-sync section
        # ---------------------------------------------------------------

        # meta
        # valid bin ids -> numMembers
        self.validBinIds = {}
        # indicates if a bin is likely to be chimeric
        self.isLikelyChimeric = {}
        # dictionary of those indices which belong to some bin
        self.binnedRowIndices = {}
        # dictionary of those indices which can not be binned yet
        self.restrictedRowIndices = {}
        # this depends on the condition given
        self.numContigs = 0
        # this depends on the data which was parsed
        self.numStoits = 0

        # contig links
        self.links = {}

        # misc
        self.forceWriting = force
        self.scaleFactor = scaleFactor

    def loadData(
            self,
            timer,
            condition,  # condition as set by another function
            bids=[],  # if this is set then only load those contigs with these bin ids
            verbose=True,  # many to some output messages
            silent=False,  # some to no output messages
            loadCovProfiles=True,
            loadKmerPCs=True,
            loadKmerVarPC=True,
            loadRawKmers=False,
            makeColors=True,
            loadContigNames=True,
            loadContigLengths=True,
            loadContigGCs=True,
            loadBins=False,
            loadLinks=False):
        """Load pre-parsed data"""

        timer.getTimeStamp()
        if (silent):
            verbose = False
        if verbose:
            print("Loading data from:", self.dbFileName)

        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            self.indices = self.dataManager.getConditionalIndices(
                self.dbFileName, condition=condition, silent=silent)
            if (verbose):
                print("    Loaded indices with condition:", condition)
            self.numContigs = len(self.indices)

            if self.numContigs == 0:
                print("    ERROR: No contigs loaded using condition:",
                      condition)
                return

            if (not silent):
                print("    Working with: %d contigs" % self.numContigs)

            if (loadCovProfiles):
                if (verbose):
                    print("    Loading coverage profiles")
                self.covProfiles = self.dataManager.getCoverageProfiles(
                    self.dbFileName, indices=self.indices)
                self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(
                    self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array(
                    [sum(i) / self.numStoits for i in self.covProfiles])

            if loadRawKmers:
                if (verbose):
                    print("    Loading RAW kmer sigs")
                self.kmerSigs = self.dataManager.getKmerSigs(
                    self.dbFileName, indices=self.indices)

            if (loadKmerPCs):
                self.kmerPCs = self.dataManager.getKmerPCAs(
                    self.dbFileName, indices=self.indices)

                if (verbose):
                    print("    Loading PCA kmer sigs (" +
                          str(len(self.kmerPCs[0])) + " dimensional space)")

                self.kmerNormPC1 = np_copy(self.kmerPCs[:, 0])
                self.kmerNormPC1 -= np_min(self.kmerNormPC1)
                self.kmerNormPC1 /= np_max(self.kmerNormPC1)

            if (loadKmerVarPC):
                self.kmerVarPC = self.dataManager.getKmerVarPC(
                    self.dbFileName, indices=self.indices)

                if (verbose):
                    print(
                        "    Loading PCA kmer variance (total variance: %.2f" %
                        np_sum(self.kmerVarPC) + ")")

            if (loadContigNames):
                if (verbose):
                    print("    Loading contig names")
                self.contigNames = self.dataManager.getContigNames(
                    self.dbFileName, indices=self.indices)

            if (loadContigLengths):
                self.contigLengths = self.dataManager.getContigLengths(
                    self.dbFileName, indices=self.indices)
                if (verbose):
                    print("    Loading contig lengths (Total: %d BP)" %
                          (sum(self.contigLengths)))

            if (loadContigGCs):
                self.contigGCs = self.dataManager.getContigGCs(
                    self.dbFileName, indices=self.indices)
                if (verbose):
                    print("    Loading contig GC ratios (Average GC: %0.3f)" %
                          (np_mean(self.contigGCs)))

            if (makeColors):
                if (verbose):
                    print("    Creating color map")

                # use HSV to RGB to generate colors
                S = 1  # SAT and VAL remain fixed at 1. Reduce to make
                V = 1  # Pastels if that's your preference...
                self.colorMapGC = self.createColorMapHSV()

            if (loadBins):
                if (verbose):
                    print("    Loading bin assignments")

                self.binIds = self.dataManager.getBins(self.dbFileName,
                                                       indices=self.indices)

                if len(
                        bids
                ) != 0:  # need to make sure we're not restricted in terms of bins
                    bin_stats = self.getBinStats()
                    for bid in bids:
                        try:
                            self.validBinIds[bid] = bin_stats[bid][0]
                            self.isLikelyChimeric[bid] = bin_stats[bid][1]
                        except KeyError:
                            self.validBinIds[bid] = 0
                            self.isLikelyChimeric[bid] = False

                else:
                    bin_stats = self.getBinStats()
                    for bid in bin_stats:
                        self.validBinIds[bid] = bin_stats[bid][0]
                        self.isLikelyChimeric[bid] = bin_stats[bid][1]

                # fix the binned indices
                self.binnedRowIndices = {}
                for i in range(len(self.indices)):
                    if (self.binIds[i] != 0):
                        self.binnedRowIndices[i] = True
            else:
                # we need zeros as bin indicies then...
                self.binIds = np_zeros(len(self.indices))

            if (loadLinks):
                self.loadLinks()

            self.stoitColNames = self.getStoitColNames()

        except:
            print("Error loading DB:", self.dbFileName, exc_info()[0])
            raise

    def reduceIndices(self, deadRowIndices):
        """purge indices from the data structures

        Be sure that deadRowIndices are sorted ascending
        """
        # strip out the other values
        self.indices = np_delete(self.indices, deadRowIndices, axis=0)
        self.covProfiles = np_delete(self.covProfiles, deadRowIndices, axis=0)
        self.transformedCP = np_delete(self.transformedCP,
                                       deadRowIndices,
                                       axis=0)
        self.contigNames = np_delete(self.contigNames, deadRowIndices, axis=0)
        self.contigLengths = np_delete(self.contigLengths,
                                       deadRowIndices,
                                       axis=0)
        self.contigGCs = np_delete(self.contigGCs, deadRowIndices, axis=0)
        #self.kmerSigs = np_delete(self.kmerSigs, deadRowIndices, axis=0)
        self.kmerPCs = np_delete(self.kmerPCs, deadRowIndices, axis=0)
        self.binIds = np_delete(self.binIds, deadRowIndices, axis=0)

#------------------------------------------------------------------------------
# GET / SET

    def getNumStoits(self):
        """return the value of numStoits in the metadata tables"""
        return self.dataManager.getNumStoits(self.dbFileName)

    def getMerColNames(self):
        """return the value of merColNames in the metadata tables"""
        return self.dataManager.getMerColNames(self.dbFileName)

    def getMerSize(self):
        """return the value of merSize in the metadata tables"""
        return self.dataManager.getMerSize(self.dbFileName)

    def getNumMers(self):
        """return the value of numMers in the metadata tables"""
        return self.dataManager.getNumMers(self.dbFileName)

### USE the member vars instead!
#    def getNumCons(self):
#        """return the value of numCons in the metadata tables"""
#        return self.dataManager.getNumCons(self.dbFileName)

    def getNumBins(self):
        """return the value of numBins in the metadata tables"""
        return self.dataManager.getNumBins(self.dbFileName)

    def setNumBins(self, numBins):
        """set the number of bins"""
        self.dataManager.setNumBins(self.dbFileName, numBins)

    def getStoitColNames(self):
        """return the value of stoitColNames in the metadata tables"""
        return np_array(
            self.dataManager.getStoitColNames(self.dbFileName).split(","))

    def isClustered(self):
        """Has the data been clustered already"""
        return self.dataManager.isClustered(self.dbFileName)

    def setClustered(self):
        """Save that the db has been clustered"""
        self.dataManager.setClustered(self.dbFileName, True)

    def isComplete(self):
        """Has the data been *completely* clustered already"""
        return self.dataManager.isComplete(self.dbFileName)

    def setComplete(self):
        """Save that the db has been completely clustered"""
        self.dataManager.setComplete(self.dbFileName, True)

    def getBinStats(self):
        """Go through all the "bins" array and make a list of unique bin ids vs number of contigs"""
        return self.dataManager.getBinStats(self.dbFileName)

    def setBinStats(self, binStats):
        """Store the valid bin Ids and number of members

        binStats is a list of tuples which looks like:
        [ (bid, numMembers, isLikelyChimeric) ]
        Note that this call effectively nukes the existing table
        """
        self.dataManager.setBinStats(self.dbFileName, binStats)
        self.setNumBins(len(binStats))

    def setBinAssignments(self, assignments, nuke=False):
        """Save our bins into the DB"""
        self.dataManager.setBinAssignments(self.dbFileName,
                                           assignments,
                                           nuke=nuke)

    def loadLinks(self):
        """Extra wrapper 'cause I am dumb"""
        self.links = self.getLinks()

    def getLinks(self):
        """Get contig links"""
        # first we get the absolute links
        absolute_links = self.dataManager.restoreLinks(self.dbFileName,
                                                       self.indices)
        # now convert this into plain old row_indices
        reverse_index_lookup = {}
        for i in range(len(self.indices)):
            reverse_index_lookup[self.indices[i]] = i

        # now convert the absolute links to local ones
        relative_links = {}
        for cid in self.indices:
            local_cid = reverse_index_lookup[cid]
            relative_links[local_cid] = []
            try:
                for link in absolute_links[cid]:
                    relative_links[local_cid].append([
                        reverse_index_lookup[link[0]], link[1], link[2],
                        link[3]
                    ])
            except KeyError:  # not everyone is linked
                pass

        return relative_links

#------------------------------------------------------------------------------
# DATA TRANSFORMATIONS

    def getAverageCoverage(self, rowIndex):
        """Return the average coverage for this contig across all stoits"""
        return sum(self.transformedCP[rowIndex]) / self.numStoits

    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams

        Works in place: the columns of self.covProfiles and the entries of
        self.stoitColNames are permuted together; nothing is returned.

        Reads self.indices, self.normCoverages, self.kmerNormPC1,
        self.covProfiles and self.numStoits, so these must be populated
        before calling. Also relies on self.small2indices, which is defined
        outside this block.
        """
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        # each pass keeps every second contig (positions 0, 2, 4, ... of the
        # sorted order), alternating the sort key between normCoverages and
        # kmerNormPC1, until at most ideal_contig_num remain
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([
                sub_cons[cov_sorted[i * 2]]
                for i in np_arange(int(len(sub_cons) / 2))
            ])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([
                    sub_cons[mer_sorted[i * 2]]
                    for i in np_arange(int(len(sub_cons) / 2))
                ])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        # NOTE(review): num_sc is not used again in this method
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        # each selected profile is rescaled by log10(norm) / norm; after the
        # transpose, rows of sub_covs correspond to columns of covProfiles
        sub_covs = np_transpose([
            self.covProfiles[i] *
            (np_log10(self.normCoverages[i]) / self.normCoverages[i])
            for i in sub_cons
        ])
        # pairwise 'cityblock' distances between the rows of sub_covs
        sq_dists = cdist(sub_covs, sub_covs, 'cityblock')
        # presumably scipy's squareform: square matrix -> condensed vector
        # -- TODO confirm against the file's imports
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        # sentinel distance used to retire pairs / rows that are done
        too_big = 10000
        while True:
            # greedily take the closest remaining pair; stop once only
            # retired (sentinel valued) entries are left
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                break
            # map the condensed index back to a (row, column) pair
            (i, j) = self.small2indices(closest, self.numStoits - 1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                # retire the whole row / column, keeping the diagonal at 0
                sq_dists[i, :] = too_big
                sq_dists[:, i] = too_big
                sq_dists[i, i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j, :] = too_big
                sq_dists[:, j] = too_big
                sq_dists[j, j] = 0.0

            # fix the dist matrix
            # this pair is used up; rebuild the condensed view for next pass
            sq_dists[j, i] = too_big
            sq_dists[i, j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
        # start at column 0 and walk the neighbour lists, always stepping to
        # the neighbour we did not just come from
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done - 1]
            # NOTE(review): the assignments to `last` inside the branches
            # have no effect, it is re-read at the top of each iteration
            if lr_dict[last][0] == ordering[done - 2]:
                ordering.append(lr_dict[last][1])
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done += 1

        # reshuffle the column order in place
        # `working` tracks where each original column currently sits so the
        # swaps can be applied one position at a time
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                self.covProfiles[:, [i, loc]] = self.covProfiles[:, [loc, i]]
                self.stoitColNames[[i, loc]] = self.stoitColNames[[loc, i]]
                working[[i, loc]] = working[[loc, i]]

    def transformCP(self, timer, silent=False, nolog=False):
        """Do the main transformation on the coverage profile data"""
        if (not silent):
            print("    Reticulating splines")
        self.transformedCP = self.dataManager.getTransformedCoverageProfiles(
            self.dbFileName, indices=self.indices)
        self.corners = self.dataManager.getTransformedCoverageCorners(
            self.dbFileName)
        self.TCentre = np_mean(self.corners, axis=0)
        self.transRadius = np_norm(self.corners[0] - self.TCentre)

#------------------------------------------------------------------------------
# DEBUG CRUFT

    def rewriteBins(self):
        """rewrite the bins table in hdf5 based on numbers in meta-contigs"""
        bins = self.dataManager.getBins(self.dbFileName)
        bin_store = {}
        for c in bins:
            if c != 0:
                try:
                    bin_store[c] += 1
                except KeyError:
                    bin_store[c] = 1

        bin_stats = []
        for bid in bin_store:
            # [(bid, size, likelyChimeric)]
            bin_stats.append((bid, bin_store[bid], False))

        self.setBinStats(bin_stats)

#------------------------------------------------------------------------------
# IO and IMAGE RENDERING

    def createColorMapHSV(self):
        """Build the 1000 step 'GC' colour map and return it

        For val in 0..999 the first argument given to htr is
        (1 + sin(pi * val / 1000 - pi / 2)) / 2, with the other two fixed at
        1.0. htr is presumably colorsys.hsv_to_rgb, which would make these
        hue, saturation and value -- confirm against the file's imports.
        """
        # SAT and VAL remain fixed at 1. Reduce to make
        # pastels if that's your preference...
        S = 1.0
        V = 1.0
        # range() rather than xrange(): xrange does not exist on Python 3,
        # and a 1000 element range is harmless on Python 2
        colors = [
            htr((1.0 + np_sin(np_pi * (val / 1000.0) - np_pi / 2)) / 2., S, V)
            for val in range(0, 1000)
        ]
        return LinearSegmentedColormap.from_list('GC', colors, N=1000)

    def setColorMap(self, colorMapStr):
        if colorMapStr == 'HSV':
            S = 1
            V = 1
            self.colorMapGC = self.createColorMapHSV()
        elif colorMapStr == 'Accent':
            self.colorMapGC = get_cmap('Accent')
        elif colorMapStr == 'Blues':
            self.colorMapGC = get_cmap('Blues')
        elif colorMapStr == 'Spectral':
            self.colorMapGC = get_cmap('spectral')
        elif colorMapStr == 'Grayscale':
            self.colorMapGC = get_cmap('gist_yarg')
        elif colorMapStr == 'Discrete':
            discrete_map = [(0, 0, 0)]
            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))

            discrete_map.append((0, 0, 0))
            discrete_map.append((141 / 255.0, 211 / 255.0, 199 / 255.0))
            discrete_map.append((255 / 255.0, 255 / 255.0, 179 / 255.0))
            discrete_map.append((190 / 255.0, 186 / 255.0, 218 / 255.0))
            discrete_map.append((251 / 255.0, 128 / 255.0, 114 / 255.0))
            discrete_map.append((128 / 255.0, 177 / 255.0, 211 / 255.0))
            discrete_map.append((253 / 255.0, 180 / 255.0, 98 / 255.0))
            discrete_map.append((179 / 255.0, 222 / 255.0, 105 / 255.0))
            discrete_map.append((252 / 255.0, 205 / 255.0, 229 / 255.0))
            discrete_map.append((217 / 255.0, 217 / 255.0, 217 / 255.0))
            discrete_map.append((188 / 255.0, 128 / 255.0, 189 / 255.0))
            discrete_map.append((204 / 255.0, 235 / 255.0, 197 / 255.0))
            discrete_map.append((255 / 255.0, 237 / 255.0, 111 / 255.0))
            discrete_map.append((1, 1, 1))

            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))
            self.colorMapGC = LinearSegmentedColormap.from_list('GC_DISCRETE',
                                                                discrete_map,
                                                                N=20)

        elif colorMapStr == 'DiscretePaired':
            discrete_map = [(0, 0, 0)]
            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))

            discrete_map.append((0, 0, 0))
            discrete_map.append((166 / 255.0, 206 / 255.0, 227 / 255.0))
            discrete_map.append((31 / 255.0, 120 / 255.0, 180 / 255.0))
            discrete_map.append((178 / 255.0, 223 / 255.0, 138 / 255.0))
            discrete_map.append((51 / 255.0, 160 / 255.0, 44 / 255.0))
            discrete_map.append((251 / 255.0, 154 / 255.0, 153 / 255.0))
            discrete_map.append((227 / 255.0, 26 / 255.0, 28 / 255.0))
            discrete_map.append((253 / 255.0, 191 / 255.0, 111 / 255.0))
            discrete_map.append((255 / 255.0, 127 / 255.0, 0 / 255.0))
            discrete_map.append((202 / 255.0, 178 / 255.0, 214 / 255.0))
            discrete_map.append((106 / 255.0, 61 / 255.0, 154 / 255.0))
            discrete_map.append((255 / 255.0, 255 / 255.0, 179 / 255.0))
            discrete_map.append((217 / 255.0, 95 / 255.0, 2 / 255.0))
            discrete_map.append((1, 1, 1))

            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))
            discrete_map.append((0, 0, 0))
            self.colorMapGC = LinearSegmentedColormap.from_list('GC_DISCRETE',
                                                                discrete_map,
                                                                N=20)

    def plotStoitNames(self, ax):
        """Plot stoit names on an existing axes"""
        outer_index = 0
        for corner in self.corners:
            ax.text(corner[0],
                    corner[1],
                    corner[2],
                    self.stoitColNames[outer_index],
                    color='#000000')
            outer_index += 1

    def plotUnbinned(self,
                     timer,
                     coreCut,
                     transform=True,
                     ignoreContigLengths=False):
        """Show a 3D scatter of the unbinned contigs of length >= coreCut

        Data is (re)loaded with the condition
        "((length >= coreCut) & (bid == 0))".

        transform           - use the stored transformed coverage; when
                              False the raw coverage profiles are plotted
                              directly, which only happens if there are
                              exactly 3 stoits
        ignoreContigLengths - draw every contig at a fixed size without
                              edges instead of sizing by sqrt(length)
        """
        min_length = str(coreCut)
        self.loadData(timer, "((length >= " + min_length + ") & (bid == 0))")

        if (not transform) and self.numStoits == 3:
            # raw profiles are already three dimensional
            self.transformedCP = self.covProfiles
        else:
            if not transform:
                print("Number of stoits != 3. You need to transform")
            self.transformCP(timer)

        fig = plt.figure()
        ax1 = fig.add_subplot(111, projection='3d')

        # the two rendering modes only differ in edge colour and marker size
        if ignoreContigLengths:
            edge_color = 'none'
            marker_sizes = 10
        else:
            edge_color = 'k'
            marker_sizes = np_sqrt(self.contigLengths)

        points = self.transformedCP
        sc = ax1.scatter(points[:, 0],
                         points[:, 1],
                         points[:, 2],
                         edgecolors=edge_color,
                         c=self.contigGCs,
                         cmap=self.colorMapGC,
                         vmin=0.0,
                         vmax=1.0,
                         s=marker_sizes,
                         marker='.')
        # disable depth transparency effect
        sc.set_edgecolors = sc.set_facecolors = lambda *args: None
        self.plotStoitNames(ax1)

        try:
            plt.show()
            plt.close(fig)
        except:
            print("Error showing image", exc_info()[0])
            raise
        del fig

    def plotAll(self,
                timer,
                coreCut,
                transform=True,
                ignoreContigLengths=False):
        """Show a 3D scatter of every contig of length >= coreCut

        Data is (re)loaded with the condition "((length >= coreCut))", so
        binned and unbinned contigs are both included. A "% GC" colour bar
        is added next to the plot.

        transform           - use the stored transformed coverage; when
                              False the raw coverage profiles are plotted
                              directly, which only happens if there are
                              exactly 3 stoits
        ignoreContigLengths - draw every contig at a fixed size without
                              edges instead of sizing by sqrt(length)
        """
        min_length = str(coreCut)
        self.loadData(timer, "((length >= " + min_length + "))")

        if (not transform) and self.numStoits == 3:
            # raw profiles are already three dimensional
            self.transformedCP = self.covProfiles
        else:
            if not transform:
                print("Number of stoits != 3. You need to transform")
            self.transformCP(timer)

        fig = plt.figure()
        ax1 = fig.add_subplot(111, projection='3d')

        # the two rendering modes only differ in edge colour and marker size
        if ignoreContigLengths:
            edge_color = 'none'
            marker_sizes = 10.
        else:
            edge_color = 'k'
            marker_sizes = np_sqrt(self.contigLengths)

        points = self.transformedCP
        sc = ax1.scatter(points[:, 0],
                         points[:, 1],
                         points[:, 2],
                         edgecolors=edge_color,
                         c=self.contigGCs,
                         cmap=self.colorMapGC,
                         vmin=0.0,
                         vmax=1.0,
                         marker='.',
                         s=marker_sizes)

        # disable depth transparency effect
        sc.set_edgecolors = sc.set_facecolors = lambda *args: None
        self.plotStoitNames(ax1)

        # GC colour bar, ticked from 0.2 to 0.8 and clipped to 0.15 - 0.85
        cbar = plt.colorbar(sc, shrink=0.5)
        cbar.ax.tick_params()
        cbar.ax.set_title("% GC", size=10)
        gc_ticks = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
        cbar.set_ticks(gc_ticks)
        cbar.ax.set_ylim([0.15, 0.85])
        mungeCbar(cbar)

        try:
            plt.show()
            plt.close(fig)
        except:
            print("Error showing image", exc_info()[0])
            raise
        del fig

    def plotTransViews(self, tag="fordens"):
        """Request top, front and side renderings of the transformed data.

        Three calls are made to self.renderTransData, one per view, with
        file names tag + "_top.png", tag + "_front.png" and tag + "_side.png".
        """
        # NOTE(review): renderTransData is not defined in the visible part of
        # this class -- confirm it exists (or that a differently named
        # renderer was intended) before relying on this method.
        camera_angles = (
            ("_top.png", 0, 90),
            ("_front.png", 0, 0),
            ("_side.png", 90, 0),
        )
        for suffix, azimuth, elevation in camera_angles:
            self.renderTransData(tag + suffix, azim=azimuth, elev=elevation)

    def renderTransCPData(self,
                          fileName="",
                          show=True,
                          elev=45,
                          azim=45,
                          all=False,
                          showAxis=False,
                          primaryWidth=12,
                          primarySpace=3,
                          dpi=300,
                          format='png',
                          fig=None,
                          highlight=None,
                          restrictedBids=[],
                          alpha=1,
                          ignoreContigLengths=False):
        """Plot transformed data in 3D

        Draws self.transformedCP as a scatter plot coloured by self.contigGCs,
        either as a single panel seen from (azim, elev) or, when `all` is
        set, as three panels with fixed viewpoints.

        fileName            - when non-empty the figure is saved to this path
                              and not shown
        show                - when no fileName is given, display the figure
        elev, azim          - camera angles of the single panel
        all                 - draw the three fixed views instead; in this mode
                              no colour bar is drawn and showAxis, highlight,
                              restrictedBids, alpha and ignoreContigLengths
                              are not consulted
        showAxis            - single panel only: keep the axes visible
        primaryWidth        - edge length (inches) of one panel when saving
        primarySpace        - extra width (inches) added twice to the saved
                              image when `all` is set
        dpi, format         - handed to savefig
        fig                 - figure to draw on (cleared first). When None a
                              new figure is created and closed before returning
        highlight           - iterable of objects exposing rowIndices; those
                              rows are drawn as small dots over the remaining
                              rows, which are drawn as large squares with
                              transparency `alpha`
        restrictedBids      - bin ids whose contigs are left out; when
                              non-empty, highlight is ignored. Only read,
                              never modified
        ignoreContigLengths - plain view only: fixed marker size instead of
                              sqrt(contig length)

        Exceptions raised while saving or showing are reported on stdout and
        re-raised.
        """
        del_fig = fig is None
        if del_fig:
            fig = plt.figure()
        else:
            # wipe the figure we were handed rather than whichever figure
            # pyplot currently regards as active
            fig.clf()

        def scatter_gc(axes, coords, gcs, **style):
            """Scatter the rows of coords on axes, coloured by gcs"""
            points = axes.scatter(coords[:, 0],
                                  coords[:, 1],
                                  coords[:, 2],
                                  c=gcs,
                                  cmap=self.colorMapGC,
                                  vmin=0.0,
                                  vmax=1.0,
                                  **style)
            points.set_edgecolors = points.set_facecolors = lambda *args: None  # disable depth transparency effect
            return points

        def frame_axes(axes, view_azim, view_elev):
            """Aim the camera, pin the limits to scaleFactor and drop all ticks"""
            axes.azim = view_azim
            axes.elev = view_elev
            axes.set_xlim3d(0, self.scaleFactor)
            axes.set_ylim3d(0, self.scaleFactor)
            axes.set_zlim3d(0, self.scaleFactor)
            axes.set_xticklabels([])
            axes.set_yticklabels([])
            axes.set_zticklabels([])
            axes.set_xticks([])
            axes.set_yticks([])
            axes.set_zticks([])

        if all:
            # NOTE(review): _AXINFO is a private Matplotlib attribute; whether
            # assigning it after the axes exist changes the rendering depends
            # on the Matplotlib version -- confirm it is still needed.
            myAXINFO = {
                'x': {
                    'i': 0,
                    'tickdir': 1,
                    'juggled': (1, 0, 2),
                    'color': (0, 0, 0, 0, 0)
                },
                'y': {
                    'i': 1,
                    'tickdir': 0,
                    'juggled': (0, 1, 2),
                    'color': (0, 0, 0, 0, 0)
                },
                'z': {
                    'i': 2,
                    'tickdir': 0,
                    'juggled': (0, 2, 1),
                    'color': (0, 0, 0, 0, 0)
                },
            }

            # (subplot position, azim, elev) for each of the three panels
            panels = ((131, 0, 0), (132, 90, 0), (133, 0, 90))
            for position, panel_azim, panel_elev in panels:
                ax = fig.add_subplot(position, projection='3d')
                scatter_gc(ax,
                           self.transformedCP,
                           self.contigGCs,
                           edgecolors='k',
                           marker='.')
                frame_axes(ax, panel_azim, panel_elev)
                # xaxis / yaxis / zaxis are the 3D axis objects; the older
                # w_xaxis / w_yaxis / w_zaxis aliases are gone from recent
                # Matplotlib releases
                for axis in (ax.xaxis, ax.yaxis, ax.zaxis):
                    for elt in list(axis.get_ticklines()) + list(axis.get_ticklabels()):
                        elt.set_visible(False)
                    axis._AXINFO = myAXINFO
        else:
            ax = fig.add_subplot(111, projection='3d')
            if len(restrictedBids) == 0:
                if highlight is None:
                    print("BF:", np_shape(self.transformedCP))
                    if ignoreContigLengths:
                        sizes = 10.
                    else:
                        sizes = np_sqrt(self.contigLengths)
                    sc = scatter_gc(ax,
                                    self.transformedCP,
                                    self.contigGCs,
                                    edgecolors='none',
                                    s=sizes,
                                    marker='.')
                else:
                    # rows belonging to the highlighted bins, in the order
                    # the bins list them
                    shown_rows = [row_index
                                  for group in highlight
                                  for row_index in group.rowIndices]
                    shown_lookup = set(shown_rows)
                    hidden_rows = [i for i in range(len(self.indices))
                                   if i not in shown_lookup]

                    # index in one go instead of growing arrays row by row
                    shown_rows = np_array(shown_rows, dtype=int)
                    hidden_rows = np_array(hidden_rows, dtype=int)
                    disp_vals = self.transformedCP[shown_rows]
                    disp_GCs = self.contigGCs[shown_rows]
                    hide_vals = self.transformedCP[hidden_rows]
                    hide_GCs = self.contigGCs[hidden_rows]

                    # background first ...
                    scatter_gc(ax,
                               hide_vals,
                               hide_GCs,
                               edgecolors='none',
                               s=100.,
                               marker='s',
                               alpha=alpha)
                    # ... then the highlighted rows on top
                    sc = scatter_gc(ax,
                                    disp_vals,
                                    disp_GCs,
                                    edgecolors='none',
                                    s=10.,
                                    marker='.')

                    print(np_shape(disp_vals), np_shape(hide_vals),
                          np_shape(self.transformedCP))
            else:
                kept_rows = np_array(
                    [i for i in range(len(self.indices))
                     if self.binIds[i] not in restrictedBids],
                    dtype=int)
                r_trans = self.transformedCP[kept_rows]
                r_cols = self.contigGCs[kept_rows]
                print(np_shape(r_trans))
                sc = scatter_gc(ax,
                                r_trans,
                                r_cols,
                                edgecolors='none',
                                s=10.,
                                marker='.')

            # render color bar on the figure we are drawing, next to ax
            cbar = fig.colorbar(sc, ax=ax, shrink=0.5)
            cbar.ax.tick_params()
            cbar.ax.set_title("% GC", size=10)
            cbar.set_ticks([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            cbar.ax.set_ylim([0.15, 0.85])
            mungeCbar(cbar)

            frame_axes(ax, azim, elev)
            if not showAxis:
                ax.set_axis_off()

        if fileName != "":
            try:
                if all:
                    fig.set_size_inches(3 * primaryWidth + 2 * primarySpace,
                                        primaryWidth)
                else:
                    fig.set_size_inches(primaryWidth, primaryWidth)
                # save this figure, not pyplot's notion of the current one
                fig.savefig(fileName, dpi=dpi, format=format)
            except Exception:
                print("Error saving image", fileName, exc_info()[0])
                raise
        elif show:
            try:
                plt.show()
            except Exception:
                print("Error showing image", exc_info()[0])
                raise
        if del_fig:
            # only close figures we created ourselves
            plt.close(fig)

###############################################################################
###############################################################################
###############################################################################
###############################################################################

    def r2nderTransCPData(
        self,
        fig,
        alphaIndices=[],
        visibleIndices=[],
        alpha=1,
        ignoreContigLengths=False,
        elev=45,
        azim=45,
        fileName="",
        dpi=300,
        format='png',
        primaryWidth=6,
        title="",
        showAxis=False,
        showColorbar=True,
    ):
        """Plot transformed data in 3D

        Clears `fig` and draws two groups of rows of self.transformedCP,
        coloured by self.contigGCs, on a single 3D axes.

        fig                 - figure to draw on; its previous content is removed
        alphaIndices        - rows drawn first, with transparency `alpha`
        visibleIndices      - rows drawn second, without an alpha setting
        ignoreContigLengths - use marker size 10 instead of the contig lengths
        elev, azim          - camera angles
        fileName            - when non-empty the figure is saved here,
                              otherwise it is shown
        dpi, format         - handed to savefig
        primaryWidth        - edge length (inches) of the saved image
        title               - optional axes title
        showAxis            - keep the axes visible
        showColorbar        - add the "% GC" colour bar; skipped when neither
                              index list selected any rows

        Exceptions raised while saving or showing are reported on stdout and
        re-raised.
        """
        # clear any existing plot on the figure we were handed (not on
        # whichever figure pyplot currently regards as active)
        fig.clf()
        ax = fig.add_subplot(111, projection='3d')

        # work out the coords and colours based on indices
        alpha_coords = self.transformedCP[alphaIndices]
        alpha_GCs = self.contigGCs[alphaIndices]
        visible_coords = self.transformedCP[visibleIndices]
        visible_GCs = self.contigGCs[visibleIndices]

        # lengths if needed
        # NOTE(review): the raw lengths are used as marker sizes here while
        # renderTransCPData uses np_sqrt(self.contigLengths) -- confirm the
        # difference is intended.
        if not ignoreContigLengths:
            alpha_lengths = self.contigLengths[alphaIndices]
            visible_lengths = self.contigLengths[visibleIndices]
        else:
            alpha_lengths = 10.
            visible_lengths = 10.

        # last scatter drawn; stays None when there was nothing to plot
        sc = None

        # first plot alpha points
        if len(alpha_GCs) > 0:
            sc = ax.scatter(alpha_coords[:, 0],
                            alpha_coords[:, 1],
                            alpha_coords[:, 2],
                            edgecolors='none',
                            c=alpha_GCs,
                            cmap=self.colorMapGC,
                            vmin=0.0,
                            vmax=1.0,
                            s=alpha_lengths,
                            marker='.',
                            alpha=alpha)
            sc.set_edgecolors = sc.set_facecolors = lambda *args: None  # disable depth transparency effect

        # then plot full visible points
        if len(visible_GCs) > 0:
            sc = ax.scatter(visible_coords[:, 0],
                            visible_coords[:, 1],
                            visible_coords[:, 2],
                            edgecolors='none',
                            c=visible_GCs,
                            cmap=self.colorMapGC,
                            s=visible_lengths,
                            vmin=0.0,
                            vmax=1.0,
                            marker='.')
            sc.set_edgecolors = sc.set_facecolors = lambda *args: None  # disable depth transparency effect

        # render color bar -- needs a scatter to take its colours from
        if showColorbar and sc is not None:
            cbar = fig.colorbar(sc, ax=ax, shrink=0.5)
            cbar.ax.tick_params()
            cbar.ax.set_title("% GC", size=10)
            cbar.set_ticks([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            cbar.ax.set_ylim([0.15, 0.85])
            mungeCbar(cbar)

        # set aspect
        ax.azim = azim
        ax.elev = elev

        # make it purdy
        ax.set_xlim3d(0, self.scaleFactor)
        ax.set_ylim3d(0, self.scaleFactor)
        ax.set_zlim3d(0, self.scaleFactor)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_zticks([])

        fig.tight_layout()

        if title != "":
            # title the 3D axes explicitly instead of pyplot's current axes
            ax.set_title(title)

        if not showAxis:
            ax.set_axis_off()

        if fileName != "":
            try:
                fig.set_size_inches(primaryWidth, primaryWidth)
                fig.savefig(fileName, dpi=dpi, format=format)
            except Exception:
                print("Error saving image", fileName, exc_info()[0])
                raise
        else:
            try:
                plt.show()
            except Exception:
                print("Error showing image", exc_info()[0])
                raise
Пример #7
0
class ProfileManager:
    """Interacts with the groopm DataManager and local data fields

    Mostly a wrapper around a group of numpy arrays and a pytables quagmire
    """
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Create an empty manager bound to one database file

        dbFileName  - db containing all the data we'd like to use
        force       - stored as forceWriting (overwrite existing values silently?)
        scaleFactor - scale everything in the transformed data to this dimension

        Every data field starts out empty here; see loadData for filling them.
        """
        # data
        self.dataManager = GMDataManager()  # most data is saved to hdf
        self.dbFileName = dbFileName
        self.condition = ""  # condition will be supplied at loading time

        # --> NOTE: ALL of the arrays in this section are in sync
        # --> each one holds information for an individual contig
        empty_array_fields = (
            "indices",           # indices into the data structure based on condition
            "covProfiles",       # coverage based coordinates
            "transformedCP",     # the munged data points
            "corners",           # the corners of the transformed space
            "averageCoverages",  # average coverage across all stoits
            "normCoverages",     # norm of the raw coverage vectors
            "kmerSigs",          # raw kmer signatures
            "kmerNormPC1",       # first PC of kmer sigs normalized to [0, 1]
            "kmerPCs",           # PCs of kmer sigs capturing specified variance
            "kmerVarPC",         # variance of each PC
            "stoitColNames",
            "contigNames",
            "contigLengths",
            "contigGCs",
            "binIds",            # list of bin IDs
        )
        for field_name in empty_array_fields:
            # a fresh empty array per field, nothing is shared
            setattr(self, field_name, np_array([]))

        self.TCentre = 0.       # the centre of the coverage space
        self.transRadius = 0.   # distance from corner to centre of transformed space
        self.colorMapGC = None
        # --> end section

        # meta
        self.validBinIds = {}           # valid bin ids -> numMembers
        self.isLikelyChimeric = {}      # indicates if a bin is likely to be chimeric
        self.binnedRowIndices = {}      # those indices which belong to some bin
        self.restrictedRowIndices = {}  # those indices which can not be binned yet
        self.numContigs = 0             # this depends on the condition given
        self.numStoits = 0              # this depends on the data which was parsed

        # contig links
        self.links = {}

        # misc
        self.forceWriting = force       # overwrite existing values silently?
        self.scaleFactor = scaleFactor  # target dimension of the transformed data

    def loadData(self,
                 timer,
                 condition,                 # condition as set by another function
                 bids=[],                   # if this is set then only load those contigs with these bin ids
                 verbose=True,              # many to some output messages
                 silent=False,              # some to no output messages
                 loadCovProfiles=True,
                 loadKmerPCs=True,
                 loadKmerVarPC=True,
                 loadRawKmers=False,
                 makeColors=True,
                 loadContigNames=True,
                 loadContigLengths=True,
                 loadContigGCs=True,
                 loadBins=False,
                 loadLinks=False):
        """Load pre-parsed data"""

        timer.getTimeStamp()
        if(silent):
            verbose=False
        if verbose:
            print "Loading data from:", self.dbFileName

        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            self.indices = self.dataManager.getConditionalIndices(self.dbFileName,
                                                                  condition=condition,
                                                                  silent=silent)
            if(verbose):
                print "    Loaded indices with condition:", condition
            self.numContigs = len(self.indices)

            if self.numContigs == 0:
                print "    ERROR: No contigs loaded using condition:", condition
                return

            if(not silent):
                print "    Working with: %d contigs" % self.numContigs

            if(loadCovProfiles):
                if(verbose):
                    print "    Loading coverage profiles"
                self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices)
                self.normCoverages = self.dataManager.getNormalisedCoverageProfiles(self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles])

            if loadRawKmers:
                if(verbose):
                    print "    Loading RAW kmer sigs"
                self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices)

            if(loadKmerPCs):
                self.kmerPCs = self.dataManager.getKmerPCAs(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer sigs (" + str(len(self.kmerPCs[0])) + " dimensional space)"

                self.kmerNormPC1 = np_copy(self.kmerPCs[:,0])
                self.kmerNormPC1 -= np_min(self.kmerNormPC1)
                self.kmerNormPC1 /= np_max(self.kmerNormPC1)

            if(loadKmerVarPC):
                self.kmerVarPC = self.dataManager.getKmerVarPC(self.dbFileName, indices=self.indices)

                if(verbose):
                    print "    Loading PCA kmer variance (total variance: %.2f" % np_sum(self.kmerVarPC) + ")"

            if(loadContigNames):
                if(verbose):
                    print "    Loading contig names"
                self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices)

            if(loadContigLengths):
                self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig lengths (Total: %d BP)" % ( sum(self.contigLengths) )

            if(loadContigGCs):
                self.contigGCs = self.dataManager.getContigGCs(self.dbFileName, indices=self.indices)
                if(verbose):
                    print "    Loading contig GC ratios (Average GC: %0.3f)" % ( np_mean(self.contigGCs) )

            if(makeColors):
                if(verbose):
                    print "    Creating color map"

                # use HSV to RGB to generate colors
                S = 1       # SAT and VAL remain fixed at 1. Reduce to make
                V = 1       # Pastels if that's your preference...
                self.colorMapGC = self.createColorMapHSV()

            if(loadBins):
                if(verbose):
                    print "    Loading bin assignments"

                self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices)

                if len(bids) != 0: # need to make sure we're not restricted in terms of bins
                    bin_stats = self.getBinStats()
                    for bid in bids:
                        try:
                            self.validBinIds[bid] = bin_stats[bid][0]
                            self.isLikelyChimeric[bid]= bin_stats[bid][1]
                        except KeyError:
                            self.validBinIds[bid] = 0
                            self.isLikelyChimeric[bid]= False

                else:
                    bin_stats = self.getBinStats()
                    for bid in bin_stats:
                        self.validBinIds[bid] = bin_stats[bid][0]
                        self.isLikelyChimeric[bid] = bin_stats[bid][1]

                # fix the binned indices
                self.binnedRowIndices = {}
                for i in range(len(self.indices)):
                    if(self.binIds[i] != 0):
                        self.binnedRowIndices[i] = True
            else:
                # we need zeros as bin indicies then...
                self.binIds = np_zeros(len(self.indices))

            if(loadLinks):
                self.loadLinks()

            self.stoitColNames = self.getStoitColNames()

        except:
            print "Error loading DB:", self.dbFileName, exc_info()[0]
            raise

    def reduceIndices(self, deadRowIndices):
        """purge indices from the data structures

        Be sure that deadRowIndices are sorted ascending
        """
        # strip out the other values
        self.indices = np_delete(self.indices, deadRowIndices, axis=0)
        self.covProfiles = np_delete(self.covProfiles, deadRowIndices, axis=0)
        self.transformedCP = np_delete(self.transformedCP, deadRowIndices, axis=0)
        self.contigNames = np_delete(self.contigNames, deadRowIndices, axis=0)
        self.contigLengths = np_delete(self.contigLengths, deadRowIndices, axis=0)
        self.contigGCs = np_delete(self.contigGCs, deadRowIndices, axis=0)
        #self.kmerSigs = np_delete(self.kmerSigs, deadRowIndices, axis=0)
        self.kmerPCs = np_delete(self.kmerPCs, deadRowIndices, axis=0)
        self.binIds = np_delete(self.binIds, deadRowIndices, axis=0)

#------------------------------------------------------------------------------
# GET / SET

    def getNumStoits(self):
        """return the value of numStoits in the metadata tables"""
        return self.dataManager.getNumStoits(self.dbFileName)

    def getMerColNames(self):
        """return the value of merColNames in the metadata tables"""
        return self.dataManager.getMerColNames(self.dbFileName)

    def getMerSize(self):
        """return the value of merSize in the metadata tables"""
        return self.dataManager.getMerSize(self.dbFileName)

    def getNumMers(self):
        """return the value of numMers in the metadata tables"""
        return self.dataManager.getNumMers(self.dbFileName)

### USE the member vars instead!
#    def getNumCons(self):
#        """return the value of numCons in the metadata tables"""
#        return self.dataManager.getNumCons(self.dbFileName)

    def getNumBins(self):
        """return the value of numBins in the metadata tables"""
        return self.dataManager.getNumBins(self.dbFileName)

    def setNumBins(self, numBins):
        """set the number of bins"""
        self.dataManager.setNumBins(self.dbFileName, numBins)

    def getStoitColNames(self):
        """return the value of stoitColNames in the metadata tables"""
        return np_array(self.dataManager.getStoitColNames(self.dbFileName).split(","))

    def isClustered(self):
        """Has the data been clustered already"""
        return self.dataManager.isClustered(self.dbFileName)

    def setClustered(self):
        """Save that the db has been clustered"""
        self.dataManager.setClustered(self.dbFileName, True)

    def isComplete(self):
        """Has the data been *completely* clustered already"""
        return self.dataManager.isComplete(self.dbFileName)

    def setComplete(self):
        """Save that the db has been completely clustered"""
        self.dataManager.setComplete(self.dbFileName, True)

    def getBinStats(self):
        """Go through all the "bins" array and make a list of unique bin ids vs number of contigs"""
        return self.dataManager.getBinStats(self.dbFileName)

    def setBinStats(self, binStats):
        """Store the valid bin Ids and number of members

        binStats is a list of tuples which looks like:
        [ (bid, numMembers, isLikelyChimeric) ]
        Note that this call effectively nukes the existing table
        """
        self.dataManager.setBinStats(self.dbFileName, binStats)
        self.setNumBins(len(binStats))

    def setBinAssignments(self, assignments, nuke=False):
        """Save our bins into the DB"""
        self.dataManager.setBinAssignments(self.dbFileName,
                                           assignments,
                                           nuke=nuke)

    def loadLinks(self):
        """Extra wrapper 'cause I am dumb"""
        self.links = self.getLinks()

    def getLinks(self):
        """Get contig links"""
        # first we get the absolute links
        absolute_links = self.dataManager.restoreLinks(self.dbFileName, self.indices)
        # now convert this into plain old row_indices
        reverse_index_lookup = {}
        for i in range(len(self.indices)):
            reverse_index_lookup[self.indices[i]] = i

        # now convert the absolute links to local ones
        relative_links = {}
        for cid in self.indices:
            local_cid = reverse_index_lookup[cid]
            relative_links[local_cid] = []
            try:
                for link in absolute_links[cid]:
                    relative_links[local_cid].append([reverse_index_lookup[link[0]], link[1], link[2], link[3]])
            except KeyError: # not everyone is linked
                pass

        return relative_links

#------------------------------------------------------------------------------
# DATA TRANSFORMATIONS

    def getAverageCoverage(self, rowIndex):
        """Return the average coverage for this contig across all stoits"""
        return sum(self.transformedCP[rowIndex])/self.numStoits

    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams

        Reorders the columns of self.covProfiles and the matching entries of
        self.stoitColNames in place. The new column order is built by
        greedily chaining together the columns which lie closest to each other
        (cityblock distance) over a repeatably chosen subset of the contigs.

        Nothing is returned.
        """
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            # (this halves the size of the subset)
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([sub_cons[cov_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                # (halves the subset again)
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([sub_cons[mer_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        # NOTE(review): num_sc is not used anywhere below
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        # each selected profile is scaled by log10(norm)/norm and the result
        # is transposed
        # assumes covProfiles[i] holds one value per stoit, so that after the
        # transpose each row of sub_covs describes one stoit - TODO confirm
        sub_covs = np_transpose([self.covProfiles[i]*(np_log10(self.normCoverages[i])/self.normCoverages[i]) for i in sub_cons])
        # sq_dists is the full pairwise matrix, dists the same information
        # in condensed (vector) form
        # presumably cdist / squareform are scipy.spatial.distance - confirm
        sq_dists = cdist(sub_covs,sub_covs,'cityblock')
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        # lr_dict maps a stoit index to the (at most 2) stoits chained to it
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        # sentinel written over distances which may no longer be chosen
        # NOTE(review): relies on every real distance being below 10000
        too_big = 10000
        while True:
            # greedily take the closest pair which is still available
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                # every remaining pair has been blanked out, we're done
                break
            # presumably small2indices turns an index into the condensed
            # vector back into the (i,j) pair - verify against its definition
            (i,j) = self.small2indices(closest, self.numStoits-1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                # blank out the whole row and column but put the 0 back on
                # the diagonal
                sq_dists[i,:] = too_big
                sq_dists[:,i] = too_big
                sq_dists[i,i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j,:] = too_big
                sq_dists[:,j] = too_big
                sq_dists[j,j] = 0.0

            # fix the dist matrix
            # this pair can never be picked again; rebuild the condensed form
            sq_dists[j,i] = too_big
            sq_dists[i,j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
        # start from stoit 0 and its first neighbour, then keep stepping to
        # whichever neighbour of the last stoit we did not just come from
        # NOTE(review): lr_dict[0][0] raises IndexError if stoit 0 was never
        # given a neighbour above
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done-1]
            if lr_dict[last][0] == ordering[done-2]:
                ordering.append(lr_dict[last][1])
                # NOTE(review): this value of last is overwritten at the top
                # of the next iteration
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done+=1

        # reshuffle the stoit (column) order to match 'ordering'
        # done with pairwise swaps; 'working' records which original column
        # currently sits at each position
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                # the names and the bookkeeping array are swapped in step so
                # that all three stay in sync
                self.covProfiles[:,[i,loc]] = self.covProfiles[:,[loc,i]]
                self.stoitColNames[[i,loc]] = self.stoitColNames[[loc,i]]
                working[[i,loc]] = working[[loc,i]]

    def transformCP(self, timer, silent=False, nolog=False):
        """Do the main transformation on the coverage profile data

        Loads the transformed coverage profiles (for self.indices) and the
        corners of the transformed space from the data manager, then stores
        the mean of the corners in self.TCentre and the distance from the
        first corner to that mean in self.transRadius.

        timer and nolog are accepted but not used here.
        """
        if not silent:
            print("    Reticulating splines")

        manager = self.dataManager
        db_file = self.dbFileName
        self.transformedCP = manager.getTransformedCoverageProfiles(db_file, indices=self.indices)
        self.corners = manager.getTransformedCoverageCorners(db_file)

        # centre of the corners, then how far the first corner sits from it
        centre = np_mean(self.corners, axis=0)
        self.TCentre = centre
        self.transRadius = np_norm(self.corners[0] - centre)

#------------------------------------------------------------------------------
# DEBUG CRUFT

    def rewriteBins(self):
        """Rewrite the bin stats based on the bin ids stored in the database

        Counts how often each non-zero bin id occurs in the values returned
        by dataManager.getBins and passes a list of
        (bid, size, likelyChimeric) tuples to setBinStats. likelyChimeric is
        always set to False.
        """
        member_counts = {}
        for assigned_bid in self.dataManager.getBins(self.dbFileName):
            if assigned_bid == 0:
                # a bid of 0 is never counted
                continue
            member_counts[assigned_bid] = member_counts.get(assigned_bid, 0) + 1

        # [(bid, size, likelyChimeric)]
        self.setBinStats([(bid, member_counts[bid], False) for bid in member_counts])

#------------------------------------------------------------------------------
# IO and IMAGE RENDERING

    def createColorMapHSV(self):
        """Return the 1000 step 'GC' colour map used for the 'HSV' option

        The first argument handed to htr follows a sine ramp which starts at
        0 and approaches 1 over the 1000 steps; the other two arguments are
        held at 1.0.
        """
        # presumably htr is colorsys.hsv_to_rgb, which would make these the
        # saturation and value - confirm against the imports
        S = 1.0
        V = 1.0
        colours = []
        for val in xrange(0, 1000):
            ramp = (1.0 + np_sin(np_pi * (val/1000.0) - np_pi/2))/2.
            colours.append(htr(ramp, S, V))
        return LinearSegmentedColormap.from_list('GC', colours, N=1000)

    def setColorMap(self, colorMapStr):
        """Set self.colorMapGC to the colour map called colorMapStr

        Recognised names are 'HSV', 'Accent', 'Blues', 'Spectral',
        'Grayscale', 'Discrete' and 'DiscretePaired'. Any other name leaves
        self.colorMapGC as it was.
        """
        # option name -> name handed on to get_cmap
        named_maps = {'Accent': 'Accent',
                      'Blues': 'Blues',
                      'Spectral': 'spectral',
                      'Grayscale': 'gist_yarg'}

        # option name -> 12 colours given as 0-255 RGB triples
        palettes = {
            'Discrete': [(141, 211, 199),
                         (255, 255, 179),
                         (190, 186, 218),
                         (251, 128, 114),
                         (128, 177, 211),
                         (253, 180, 98),
                         (179, 222, 105),
                         (252, 205, 229),
                         (217, 217, 217),
                         (188, 128, 189),
                         (204, 235, 197),
                         (255, 237, 111)],
            'DiscretePaired': [(166, 206, 227),
                               (31, 120, 180),
                               (178, 223, 138),
                               (51, 160, 44),
                               (251, 154, 153),
                               (227, 26, 28),
                               (253, 191, 111),
                               (255, 127, 0),
                               (202, 178, 214),
                               (106, 61, 154),
                               (255, 255, 179),
                               (217, 95, 2)],
        }

        if colorMapStr == 'HSV':
            self.colorMapGC = self.createColorMapHSV()
        elif colorMapStr in named_maps:
            self.colorMapGC = get_cmap(named_maps[colorMapStr])
        elif colorMapStr in palettes:
            black = (0,0,0)
            white = (1,1,1)
            swatches = [(r/255.0, g/255.0, b/255.0) for (r, g, b) in palettes[colorMapStr]]
            # 4 x black, the 12 colours, white, 3 x black: 20 entries in all
            discrete_map = [black]*4 + swatches + [white] + [black]*3
            self.colorMapGC = LinearSegmentedColormap.from_list('GC_DISCRETE', discrete_map, N=20)

    def plotStoitNames(self, ax):
        """Plot stoit names on an existing axes

        The n-th entry of self.stoitColNames is written in black at the
        first three coordinates of the n-th entry of self.corners.
        """
        for position, corner in enumerate(self.corners):
            x, y, z = corner[0], corner[1], corner[2]
            ax.text(x, y, z, self.stoitColNames[position], color='#000000')

    def plotUnbinned(self, timer, coreCut, transform=True, ignoreContigLengths=False):
        """Plot all contigs over a certain length which are unbinned

        timer               - handed on to loadData and transformCP
        coreCut             - only contigs with length >= coreCut are loaded
        transform           - when False and there are exactly 3 stoits the
                              raw coverage profiles are plotted as they are;
                              otherwise the profiles are transformed
        ignoreContigLengths - draw every contig at one fixed marker size
                              instead of sqrt(contig length)

        An error raised while showing the figure is reported and re-raised.
        The figure is closed on the way out whether or not plotting worked.
        """
        self.loadData(timer, "((length >= "+str(coreCut)+") & (bid == 0))")

        if transform:
            self.transformCP(timer)
        elif self.numStoits == 3:
            self.transformedCP = self.covProfiles
        else:
            print("Number of stoits != 3. You need to transform")
            self.transformCP(timer)

        # the two rendering modes differ only in marker outline and size
        if ignoreContigLengths:
            edge_colour = 'none'
            sizes = 10
        else:
            edge_colour = 'k'
            sizes = np_sqrt(self.contigLengths)

        fig = plt.figure()
        try:
            ax1 = fig.add_subplot(111, projection='3d')
            sc = ax1.scatter(self.transformedCP[:,0],
                             self.transformedCP[:,1],
                             self.transformedCP[:,2],
                             edgecolors=edge_colour,
                             c=self.contigGCs,
                             cmap=self.colorMapGC,
                             vmin=0.0,
                             vmax=1.0,
                             s=sizes,
                             marker='.')
            sc.set_edgecolors = sc.set_facecolors = lambda *args:None  # disable depth transparency effect
            self.plotStoitNames(ax1)

            try:
                plt.show()
            except:
                # report and re-raise; nothing is swallowed here
                print("Error showing image %s" % (exc_info()[0],))
                raise
        finally:
            # previously the figure was left open whenever plotting failed
            plt.close(fig)

    def plotAll(self, timer, coreCut, transform=True, ignoreContigLengths=False):
        """Plot all contigs over a certain length, binned or not

        timer               - handed on to loadData and transformCP
        coreCut             - only contigs with length >= coreCut are loaded
        transform           - when False and there are exactly 3 stoits the
                              raw coverage profiles are plotted as they are;
                              otherwise the profiles are transformed
        ignoreContigLengths - draw every contig at one fixed marker size
                              instead of sqrt(contig length)

        A "% GC" colour bar is drawn next to the plot. An error raised while
        showing the figure is reported and re-raised. The figure is closed on
        the way out whether or not plotting worked.
        """
        self.loadData(timer, "((length >= "+str(coreCut)+"))")

        if transform:
            self.transformCP(timer)
        elif self.numStoits == 3:
            self.transformedCP = self.covProfiles
        else:
            print("Number of stoits != 3. You need to transform")
            self.transformCP(timer)

        # the two rendering modes differ only in marker outline and size
        if ignoreContigLengths:
            edge_colour = 'none'
            sizes = 10.
        else:
            edge_colour = 'k'
            sizes = np_sqrt(self.contigLengths)

        fig = plt.figure()
        try:
            ax1 = fig.add_subplot(111, projection='3d')
            sc = ax1.scatter(self.transformedCP[:,0],
                             self.transformedCP[:,1],
                             self.transformedCP[:,2],
                             edgecolors=edge_colour,
                             c=self.contigGCs,
                             cmap=self.colorMapGC,
                             vmin=0.0,
                             vmax=1.0,
                             marker='.',
                             s=sizes
                             )
            sc.set_edgecolors = sc.set_facecolors = lambda *args:None  # disable depth transparency effect
            self.plotStoitNames(ax1)

            # render color bar
            cbar = plt.colorbar(sc, shrink=0.5)
            cbar.ax.tick_params()
            cbar.ax.set_title("% GC", size=10)
            cbar.set_ticks([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            cbar.ax.set_ylim([0.15, 0.85])
            mungeCbar(cbar)

            try:
                plt.show()
            except:
                # report and re-raise; nothing is swallowed here
                print("Error showing image %s" % (exc_info()[0],))
                raise
        finally:
            # previously the figure was left open whenever plotting failed
            plt.close(fig)


    def plotTransViews(self, tag="fordens"):
        """Plot top, side and front views of the transformed data

        renderTransData is called once per view, in the order top, front,
        side, with file names made from tag plus "_top.png", "_front.png"
        and "_side.png".
        """
        # NOTE(review): no renderTransData method is visible in this part of
        # the file (only renderTransCPData) - confirm that it exists
        views = (("_top.png", 0, 90),
                 ("_front.png", 0, 0),
                 ("_side.png", 90, 0))
        for (suffix, view_azim, view_elev) in views:
            self.renderTransData(tag+suffix, azim=view_azim, elev=view_elev)

    def renderTransCPData(self,
                          fileName="",
                          show=True,
                          elev=45,
                          azim=45,
                          all=False,
                          showAxis=False,
                          primaryWidth=12,
                          primarySpace=3,
                          dpi=300,
                          format='png',
                          fig=None,
                          highlight=None,
                          restrictedBids=[],
                          alpha=1,
                          ignoreContigLengths=False):
        """Plot transformed data in 3D

        fileName            - when not "" the figure is saved to this file,
                              otherwise it is shown (see show)
        show                - show the figure when no fileName is given
        elev, azim          - view angles of the single panel plot
        all                 - draw three fixed views next to each other
                              (azim/elev of 0/0, 90/0 and 0/90) instead of
                              one panel; no colour bar is drawn in this mode
        showAxis            - keep the axes visible in the single panel plot
        primaryWidth        - width and height in inches of one saved panel
        primarySpace        - extra inches of width added per gap when the
                              three panel figure is saved
        dpi, format         - handed on to savefig
        fig                 - figure to draw on, it is cleared first. When
                              None a new figure is made here and closed
                              again before returning
        highlight           - iterable of objects with a rowIndices
                              attribute; those rows are drawn as small dots
                              and every other row as a large square with
                              transparency alpha
        restrictedBids      - when not empty only rows whose bin id is NOT
                              in this collection are drawn and highlight is
                              ignored. The default list is never modified
        alpha               - transparency of the rows which are not
                              highlighted
        ignoreContigLengths - use one fixed marker size rather than
                              sqrt(contig length); only looked at when
                              neither highlight nor restrictedBids is used

        Errors raised while saving or showing are reported and re-raised.
        """
        # 'all' hides the builtin of the same name but is part of the public
        # signature, so it is only aliased here
        three_views = all

        del_fig = fig is None
        if del_fig:
            fig = plt.figure()
        else:
            # clear the figure we were handed rather than whichever figure
            # pyplot currently considers to be active
            fig.clf()

        def scatterGC(ax, coords, GCs, **style):
            """Scatter the rows of coords on ax, coloured by GCs"""
            sc = ax.scatter(coords[:,0],
                            coords[:,1],
                            coords[:,2],
                            c=GCs,
                            cmap=self.colorMapGC,
                            vmin=0.0,
                            vmax=1.0,
                            **style)
            sc.set_edgecolors = sc.set_facecolors = lambda *args:None  # disable depth transparency effect
            return sc

        def fitToScale(ax, view_azim, view_elev):
            """Set the view, clamp ax to the scaled cube and drop all ticks"""
            ax.azim = view_azim
            ax.elev = view_elev
            ax.set_xlim3d(0,self.scaleFactor)
            ax.set_ylim3d(0,self.scaleFactor)
            ax.set_zlim3d(0,self.scaleFactor)
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            ax.set_zticklabels([])
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_zticks([])

        def addColorbar(sc, ax):
            """Draw the % GC colour bar for sc next to ax"""
            cbar = fig.colorbar(sc, ax=ax, shrink=0.5)
            cbar.ax.tick_params()
            cbar.ax.set_title("% GC", size=10)
            cbar.set_ticks([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            cbar.ax.set_ylim([0.15, 0.85])
            mungeCbar(cbar)

        try:
            if three_views:
                myAXINFO = {
                    'x': {'i': 0, 'tickdir': 1, 'juggled': (1, 0, 2),
                    'color': (0, 0, 0, 0, 0)},
                    'y': {'i': 1, 'tickdir': 0, 'juggled': (0, 1, 2),
                    'color': (0, 0, 0, 0, 0)},
                    'z': {'i': 2, 'tickdir': 0, 'juggled': (0, 2, 1),
                    'color': (0, 0, 0, 0, 0)},
                }
                # (subplot position, azim, elev) of the three fixed views
                panels = ((131, 0, 0), (132, 90, 0), (133, 0, 90))
                for (position, view_azim, view_elev) in panels:
                    ax = fig.add_subplot(position, projection='3d')
                    scatterGC(ax,
                              self.transformedCP,
                              self.contigGCs,
                              edgecolors='k',
                              marker='.')
                    fitToScale(ax, view_azim, view_elev)
                    for axis in ax.w_xaxis, ax.w_yaxis, ax.w_zaxis:
                        for elt in axis.get_ticklines() + axis.get_ticklabels():
                            elt.set_visible(False)
                    ax.w_xaxis._AXINFO = myAXINFO
                    ax.w_yaxis._AXINFO = myAXINFO
                    ax.w_zaxis._AXINFO = myAXINFO
            else:
                ax = fig.add_subplot(111, projection='3d')
                if len(restrictedBids) != 0:
                    # only the rows whose bin is not restricted
                    keep = np_array([i for i in range(len(self.indices))
                                     if self.binIds[i] not in restrictedBids],
                                    dtype=int)
                    r_trans = self.transformedCP[keep]
                    print(np_shape(r_trans))
                    sc = scatterGC(ax,
                                   r_trans,
                                   self.contigGCs[keep],
                                   edgecolors='none',
                                   s=10.,
                                   marker='.')
                elif highlight is None:
                    print("BF: %s" % (np_shape(self.transformedCP),))
                    if ignoreContigLengths:
                        sizes = 10.
                    else:
                        sizes = np_sqrt(self.contigLengths)
                    sc = scatterGC(ax,
                                   self.transformedCP,
                                   self.contigGCs,
                                   edgecolors='none',
                                   s=sizes,
                                   marker='.')
                else:
                    # rows of the highlighted bins, in the order given
                    shown_rows = [row_index
                                  for hl_bin in highlight
                                  for row_index in hl_bin.rowIndices]
                    shown = set(shown_rows)
                    hidden_rows = [i for i in range(len(self.indices))
                                   if i not in shown]
                    # index once instead of growing arrays row by row
                    disp_idx = np_array(shown_rows, dtype=int)
                    hide_idx = np_array(hidden_rows, dtype=int)
                    disp_vals = self.transformedCP[disp_idx]
                    hide_vals = self.transformedCP[hide_idx]

                    # draw everybody else first, as see-through squares
                    scatterGC(ax,
                              hide_vals,
                              self.contigGCs[hide_idx],
                              edgecolors='none',
                              s=100.,
                              marker='s',
                              alpha=alpha)
                    # now plot the highlighted guys on top
                    sc = scatterGC(ax,
                                   disp_vals,
                                   self.contigGCs[disp_idx],
                                   edgecolors='none',
                                   s=10.,
                                   marker='.')
                    print("%s %s %s" % (np_shape(disp_vals),
                                        np_shape(hide_vals),
                                        np_shape(self.transformedCP)))

                # render color bar
                addColorbar(sc, ax)

                fitToScale(ax, azim, elev)
                if not showAxis:
                    ax.set_axis_off()

            if fileName != "":
                try:
                    if three_views:
                        fig.set_size_inches(3*primaryWidth+2*primarySpace,primaryWidth)
                    else:
                        fig.set_size_inches(primaryWidth,primaryWidth)
                    # save this figure, not pyplot's current one
                    fig.savefig(fileName,dpi=dpi,format=format)
                except:
                    print("Error saving image %s %s" % (fileName, exc_info()[0]))
                    raise
            elif show:
                try:
                    plt.show()
                except:
                    print("Error showing image %s" % (exc_info()[0],))
                    raise
        finally:
            # a figure made here is ours to clean up, even after an error
            if del_fig:
                plt.close(fig)

###############################################################################
###############################################################################
###############################################################################
###############################################################################

    def r2nderTransCPData(self,
                          fig,
                          alphaIndices=[],
                          visibleIndices=[],
                          alpha=1,
                          ignoreContigLengths=False,
                          elev=45,
                          azim=45,
                          fileName="",
                          dpi=300,
                          format='png',
                          primaryWidth=6,
                          title="",
                          showAxis=False,
                          showColorbar=True,):
        """Plot transformed data in 3D

        fig                 - figure to draw on, it is cleared first
        alphaIndices        - rows drawn first, with transparency alpha
        visibleIndices      - rows drawn on top without transparency
        alpha               - transparency used for the alphaIndices rows
        ignoreContigLengths - use one fixed marker size instead of the
                              contig lengths
        elev, azim          - view angles
        fileName            - when not "" the figure is saved to this file,
                              otherwise it is shown
        dpi, format         - handed on to savefig
        primaryWidth        - width and height in inches of a saved figure
        title               - plot title, skipped when ""
        showAxis            - keep the axes visible
        showColorbar        - draw the "% GC" colour bar; skipped when no
                              points were drawn at all

        The default index lists are only read, never modified. Errors raised
        while saving or showing are reported and re-raised.
        """
        # clear any existing plot on the figure we were handed (not on
        # whichever figure pyplot currently considers to be active)
        fig.clf()
        ax = fig.add_subplot(111, projection='3d')

        # marker sizes
        if ignoreContigLengths:
            alpha_lengths = 10.
            visible_lengths = 10.
        else:
            # NOTE(review): raw lengths are used here while the other plots
            # in this file use np_sqrt(self.contigLengths) - confirm intent
            alpha_lengths = self.contigLengths[alphaIndices]
            visible_lengths = self.contigLengths[visibleIndices]

        # (rows, marker sizes, extra scatter options) in drawing order:
        # see-through points first, fully visible points on top
        layers = ((alphaIndices, alpha_lengths, {'alpha': alpha}),
                  (visibleIndices, visible_lengths, {}))

        # stays None when there is nothing to draw, so that the colour bar
        # can be skipped instead of tripping over an unbound name
        sc = None
        for (rows, sizes, extra) in layers:
            # work out the coords and colours based on indices
            coords = self.transformedCP[rows]
            GCs = self.contigGCs[rows]
            if len(GCs) > 0:
                sc = ax.scatter(coords[:,0],
                                coords[:,1],
                                coords[:,2],
                                edgecolors='none',
                                c=GCs,
                                cmap=self.colorMapGC,
                                vmin=0.0,
                                vmax=1.0,
                                s=sizes,
                                marker='.',
                                **extra)
                sc.set_edgecolors = sc.set_facecolors = lambda *args:None # disable depth transparency effect

        # render color bar
        if showColorbar and sc is not None:
            cbar = fig.colorbar(sc, ax=ax, shrink=0.5)
            cbar.ax.tick_params()
            cbar.ax.set_title("% GC", size=10)
            cbar.set_ticks([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8])
            cbar.ax.set_ylim([0.15, 0.85])
            mungeCbar(cbar)

        # set aspect
        ax.azim = azim
        ax.elev = elev

        # make it purdy
        ax.set_xlim3d(0,self.scaleFactor)
        ax.set_ylim3d(0,self.scaleFactor)
        ax.set_zlim3d(0,self.scaleFactor)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_zticklabels([])
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_zticks([])

        fig.tight_layout()

        if title != "":
            ax.set_title(title)

        if not showAxis:
            ax.set_axis_off()

        if fileName != "":
            try:
                fig.set_size_inches(primaryWidth,primaryWidth)
                fig.savefig(fileName,dpi=dpi,format=format)
            except:
                print("Error saving image %s %s" % (fileName, exc_info()[0]))
                raise
        else:
            try:
                plt.show()
            except:
                print("Error showing image %s" % (exc_info()[0],))
                raise
Пример #8
0
class ProfileManager:
    """Interacts with the groopm DataManager and local data fields
    
    Mostly a wrapper around a group of numpy arrays and a pytables quagmire
    """
    def __init__(self, dbFileName, force=False, scaleFactor=1000):
        """Create an empty manager bound to one database file

        dbFileName  -- database file name handed to the data manager calls
        force       -- stored as forceWriting (overwrite existing values silently?)
        scaleFactor -- stored as scaleFactor; the transformed data is scaled
                       to this dimension

        Every per-contig array starts out empty; they are filled in later
        (see loadData, transformCP and makeColourProfile).
        """
        # --- data source
        self.dataManager = GMDataManager()  # most data is saved to hdf
        self.dbFileName = dbFileName        # db containing all the data we'd like to use
        self.condition = ""                 # condition will be supplied at loading time

        # --- per-contig arrays
        # NOTE: these are all kept in sync, each holds one entry per contig
        for array_name in (
                'indices',           # indices into the data structure based on condition
                'covProfiles',       # coverage based coordinates
                'transformedCP',     # the munged data points
                'averageCoverages',  # average coverage across all stoits
                'kmerSigs',          # raw kmer signatures
                'kmerVals',          # PCA'd kmer sigs
                'contigNames',
                'contigLengths',
                'contigColours',     # calculated from kmerVals
                'binIds'):           # list of bin IDs
            # a fresh array per attribute, nothing is shared between them
            setattr(self, array_name, np_array([]))

        # --- meta
        self.validBinIds = {}               # valid bin ids -> numMembers
        self.binnedRowIndicies = {}         # row indices which belong to some bin
        self.restrictedRowIndicies = {}     # row indices which can not be binned yet
        self.numContigs = 0                 # depends on the condition given
        self.numStoits = 0                  # depends on the data which was parsed

        # --- contig links
        self.links = {}

        # --- misc
        self.forceWriting = force           # overwrite existing values silently?
        self.scaleFactor = scaleFactor      # scale everything in the transformed data to this dimension

    def loadData(self,
                 condition="",              # condition as set by another function
                 bids=[],                   # if this is set then only load those contigs with these bin ids
                 verbose=True,              # many to some output messages
                 silent=False,              # some to no output messages
                 loadCovProfiles=True,
                 loadKmerSigs=True,
                 makeColours=True,
                 loadContigNames=True,
                 loadContigLengths=True,
                 loadBins=False,
                 loadLinks=False):
        """Load pre-parsed data"""
        if(verbose):
            print "Loading data from:", self.dbFileName
        
        # check to see if we need to override the condition
        if(len(bids) != 0):
            condition = "((bid == "+str(bids[0])+")"
            for index in range (1,len(bids)):
                condition += " | (bid == "+str(bids[index])+")"
            condition += ")"
        if(silent):
            verbose=False
        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            if(verbose):
                print "    Loading indices (", condition,")"
            self.indices = self.dataManager.getConditionalIndicies(self.dbFileName, condition=condition)
            self.numContigs = len(self.indices)
            
            if(not silent):
                print "    Working with: %d contigs" % self.numContigs

            if(loadCovProfiles):
                if(verbose):
                    print "    Loading coverage profiles"
                self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles])

            if(loadKmerSigs):
                if(verbose):
                    print "    Loading kmer sigs"
                self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices)

                if(makeColours):
                    if(verbose):
                        print "    Creating colour profiles"
                    self.makeColourProfile()
                    # use HSV to RGB to generate colours
                    S = 1       # SAT and VAL remain fixed at 1. Reduce to make
                    V = 1       # Pastels if that's your preference...
                    self.contigColours = np_array([htr(val, S, V) for val in self.kmerVals])

            if(loadContigNames):
                if(verbose):
                    print "    Loading contig names"
                self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices)

            if(loadContigLengths):
                if(verbose):
                    print "    Loading contig lengths"
                self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices)
                print "    Contigs contain %d BP" % ( sum(self.contigLengths) )
            
            if(loadBins):
                if(verbose):
                    print "    Loading bins"
                self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices)
                if len(bids) != 0: # need to make sure we're not restricted in terms of bins
                    tmp_bids = self.getBinStats()
                    for bid in bids:
                        self.validBinIds[bid] = tmp_bids[bid]
                else:
                    self.validBinIds = self.getBinStats()

                # fix the binned indices
                self.binnedRowIndicies = {}
                for i in range(len(self.indices)):
                    if(self.binIds[i] != 0):
                        self.binnedRowIndicies[i] = True 
            else:
                # we need zeros as bin indicies then...
                self.binIds = np_zeros(len(self.indices))
                
            if(loadLinks):
                self.loadLinks()
            
        except:
            print "Error loading DB:", self.dbFileName, exc_info()[0]
            raise

    def reduceIndicies(self, deadRowIndicies):
        """purge indices from the data structures
        
        Be sure that deadRowIndicies are sorted ascending
        """
        # strip out the other values        
        self.indices = np_delete(self.indices, deadRowIndicies, axis=0)
        self.covProfiles = np_delete(self.covProfiles, deadRowIndicies, axis=0)
        self.transformedCP = np_delete(self.transformedCP, deadRowIndicies, axis=0)
        self.contigNames = np_delete(self.contigNames, deadRowIndicies, axis=0)
        self.contigLengths = np_delete(self.contigLengths, deadRowIndicies, axis=0)
        self.contigColours = np_delete(self.contigColours, deadRowIndicies, axis=0)
        self.kmerSigs = np_delete(self.kmerSigs, deadRowIndicies, axis=0)
        self.kmerVals = np_delete(self.kmerVals, deadRowIndicies, axis=0)
        self.binIds = np_delete(self.binIds, deadRowIndicies, axis=0)
        
#------------------------------------------------------------------------------
# GET / SET 

    def getNumStoits(self):
        """return the value of numStoits in the metadata tables"""
        return self.dataManager.getNumStoits(self.dbFileName)
            
    def getMerColNames(self):
        """return the value of merColNames in the metadata tables"""
        return self.dataManager.getMerColNames(self.dbFileName)
            
    def getMerSize(self):
        """return the value of merSize in the metadata tables"""
        return self.dataManager.getMerSize(self.dbFileName)

    def getNumMers(self):
        """return the value of numMers in the metadata tables"""
        return self.dataManager.getNumMers(self.dbFileName)

### USE the member vars instead!
#    def getNumCons(self):
#        """return the value of numCons in the metadata tables"""
#        return self.dataManager.getNumCons(self.dbFileName)

    def getNumBins(self):
        """return the value of numBins in the metadata tables"""
        return self.dataManager.getNumBins(self.dbFileName)
        
    def setNumBins(self, numBins):
        """set the number of bins"""
        self.dataManager.setNumBins(self.dbFileName, numBins)
        
    def getStoitColNames(self):
        """return the value of stoitColNames in the metadata tables"""
        return self.dataManager.getStoitColNames(self.dbFileName)
    
    def isClustered(self):
        """Has the data been clustered already"""
        return self.dataManager.isClustered(self.dbFileName)
    
    def setClustered(self):
        """Save that the db has been clustered"""
        self.dataManager.setClustered(self.dbFileName, True)
    
    def isComplete(self):
        """Has the data been *completely* clustered already"""
        return self.dataManager.isComplete(self.dbFileName)
    
    def setComplete(self):
        """Save that the db has been completely clustered"""
        self.dataManager.setComplete(self.dbFileName, True)

    def getBinStats(self):
        """Go through all the "bins" array and make a list of unique bin ids vs number of contigs"""
        return self.dataManager.getBinStats(self.dbFileName)
    
    def setBinStats(self, binStats):
        """Store the valid bin Ids and number of members
                
        binStats is a dictionary which looks like:
        { tableRow : [bid , numMembers] }
        """
        self.dataManager.setBinStats(self.dbFileName, binStats)
        self.setNumBins(len(binStats.keys()))

    def setBinAssignments(self, assignments):
        """Save our bins into the DB"""
        self.dataManager.setBinAssignments(self.dbFileName, assignments)

    def loadLinks(self):
        """Extra wrapper 'cause I am dumb"""
        self.links = self.getLinks()
        
    def getLinks(self):
        """Get contig links"""
        # first we get the absolute links
        absolute_links = self.dataManager.restoreLinks(self.dbFileName, self.indices)
        # now convert this into plain old row_indices
        reverse_index_lookup = {} 
        for i in range(len(self.indices)):
            reverse_index_lookup[self.indices[i]] = i

        # now convert the absolute links to local ones
        relative_links = {}
        for cid in self.indices:
            local_cid = reverse_index_lookup[cid]
            relative_links[local_cid] = []
            try:
                for link in absolute_links[cid]:
                    relative_links[local_cid].append([reverse_index_lookup[link[0]], link[1], link[2], link[3]])
            except KeyError: # not everyone is linked
                pass

        return relative_links
                 
#------------------------------------------------------------------------------
# DATA TRANSFORMATIONS 

    def getAverageCoverage(self, rowIndex):
        """Return the average coverage for this contig across all stoits"""
        return sum(self.transformedCP[rowIndex])/self.numStoits

    def transformCP(self, silent=False, nolog=False, min=None, max=None):
        """Do the main ransformation on the coverage profile data"""
        shrinkFn = np_log10
        if(nolog):
            shrinkFn = lambda x:x
         
        s = (self.numContigs,3)
        self.transformedCP = np_zeros(s)

        if(not silent):
            print "    Dimensionality reduction"

        # get the median distance from the origin
        unit_vectors = [(np_cos(i*2*np_pi/self.numStoits),np_sin(i*2*np_pi/self.numStoits)) for i in range(self.numStoits)]
        for i in range(len(self.indices)):
            norm = np_norm(self.covProfiles[i])
            if(norm != 0):
                radial = shrinkFn(norm)
            else:
                radial = norm
            shifted_vector = np_array([0.0,0.0])
            flat_vector = (self.covProfiles[i] / sum(self.covProfiles[i]))
            
            for j in range(self.numStoits):
                shifted_vector[0] += unit_vectors[j][0] * flat_vector[j]
                shifted_vector[1] += unit_vectors[j][1] * flat_vector[j]

            # log scale it towards the centre
            scaling_vector = shifted_vector * self.scaleFactor
            sv_size = np_norm(scaling_vector)
            if(sv_size > 1):
                shifted_vector /= shrinkFn(sv_size)

            self.transformedCP[i,0] = shifted_vector[0]
            self.transformedCP[i,1] = shifted_vector[1]
            self.transformedCP[i,2] = radial

        if(not silent):
            print "    Reticulating splines"
            
        # finally scale the matrix to make it equal in all dimensions
        if(min is None):                
            min = np_amin(self.transformedCP, axis=0)
            max = np_amax(self.transformedCP, axis=0)
            max = max - min
            max = max / (self.scaleFactor-1)

        for i in range(0,3):
            self.transformedCP[:,i] = (self.transformedCP[:,i] -  min[i])/max[i]

        return(min,max)

    def makeColourProfile(self):
        """Make a colour profile based on ksig information

        Runs a PCA over a copy of kmerSigs and stores the first principal
        component, shifted and scaled to lie between 0 and 1, in kmerVals.
        """
        # work on a copy so that the raw signatures stay as loaded
        # NOTE(review): Center is called for its effect on working_data,
        # presumably centring it in place - confirm against the PCA helper
        working_data = np_array(self.kmerSigs, copy=True)
        Center(working_data,verbose=0)
        p = PCA(working_data)
        components = p.pc()

        # now make the colour profile based on PC1
        self.kmerVals = np_array([float(i) for i in components[:,0]])

        # normalise to fit between 0 and 1
        self.kmerVals -= np_min(self.kmerVals)
        span = np_max(self.kmerVals)
        if(span != 0):
            # when every value is identical they are all 0 by now, dividing
            # by the zero span would only turn them into NaNs
            self.kmerVals /= span
        
    def rotateVectorAndScale(self, point, las, centerVector, delta_max=0.25):
        """
        Move a vector closer to the center of the positive quadrant
        
        Find the co-ordinates of its projection
        onto the surface of a hypersphere with radius R
        
        What?...  ...First some definitions:
       
        For starters, think in 3 dimensions, then take it out to N.
        Imagine all points (x,y,z) on the surface of a sphere
        such that all of x,y,z > 0. ie trapped within the positive
        quadrant.
       
        Consider the line x = y = z which passes through the origin
        and the point on the surface at the "center" of this quadrant.
        Call this line the "main mapping axis". Let the unit vector 
        coincident with this line be called A.
       
        Now think of any other vector V also located in the positive
        quadrant. The goal of this function is to move this vector
        closer to the MMA. Specifically, if we think about the plane
        which contains both V and A, we'd like to rotate V within this
        plane about the origin through phi degrees in the direction of
        A.
        
        Once this has been done, we'd like to project the rotated co-ords 
        onto the surface of a hypersphere with radius R. This is a simple
        scaling operation.
       
        The idea is that vectors closer to the corners should be pertubed
        more than those closer to the center.
        
        Set delta_max as the max percentage of the existing angle to be removed
        """
        theta = self.getAngBetween(point, centerVector)
        A = delta_max/((las)**2)
        B = delta_max/las
        delta = 2*B*theta - A *(theta**2) # the amount to shift
        V_p = point*(1-delta) + centerVector*delta
        return V_p/np_norm(V_p)
    
    def rad2deg(self, anglein):
        return 180*anglein/np_pi

    def getAngBetween(self, P1, P2):
        """Return the angle between two points (in radians)"""
        # find the existing angle between them theta
        c = np_dot(P1,P2)/np_norm(P1)/np_norm(P2) 
        # rounding errors hurt everyone...
        if(c > 1):
            c = 1
        elif(c < -1):
            c = -1
        return np_arccos(c) # in radians

#------------------------------------------------------------------------------
# IO and IMAGE RENDERING 

    def plotUnbinned(self, coreCut):
        """Show a 3D scatter of all unbinned contigs of length >= coreCut

        Reloads the data with that condition and re-runs transformCP first,
        so the member arrays are replaced as a side effect.
        """
        unbinned_condition = "((length >= "+str(coreCut)+") & (bid == 0))"
        self.loadData(condition=unbinned_condition)
        self.transformCP()

        fig = plt.figure()
        axes = fig.add_subplot(111, projection='3d')
        points = self.transformedCP
        colours = self.contigColours
        axes.scatter(points[:,0], points[:,1], points[:,2], edgecolors=colours, c=colours, marker='.')
        try:
            plt.show()
            plt.close(fig)
        except:
            # report and re-raise
            print "Error showing image", exc_info()[0]
            raise
        del fig


    def plotTransViews(self, tag="fordens"):
        """Plot top, side and front views of the transformed data"""
        self.renderTransData(tag+"_top.png",azim = 0, elev = 90)
        self.renderTransData(tag+"_front.png",azim = 0, elev = 0)
        self.renderTransData(tag+"_side.png",azim = 90, elev = 0)

    def renderTransCPData(self, fileName="", show=True, elev=45, azim=45, all=False, showAxis=False, primaryWidth=12, primarySpace=3, dpi=300, format='png', fig=None):
        """Plot transformed data in 3D

        fileName     -- save the image here; when empty the plot is shown
                        instead (if show is set)
        show         -- only looked at when fileName is empty
        elev, azim   -- camera angles of the single view
        all          -- draw three panels side by side with fixed camera
                        angles instead of the single view
        showAxis     -- single view only: keep the axes visible
        primaryWidth -- figure height in inches, also the width of one panel
        primarySpace -- extra width in inches per gap between the three panels
        dpi, format  -- passed on to savefig
        fig          -- figure to draw into; when None a figure is made here
                        and closed again before returning

        Errors while saving or showing are reported on stdout and re-raised.
        """
        made_here = fig is None
        if(made_here):
            fig = plt.figure()
        else:
            plt.clf()

        def add_view(position, view_azim, view_elev, **scatter_args):
            """Add one 3D panel: scatter, camera angles, fixed limits, no ticks"""
            panel = fig.add_subplot(position, projection='3d')
            panel.scatter(self.transformedCP[:,0], self.transformedCP[:,1], self.transformedCP[:,2], c=self.contigColours, marker='.', **scatter_args)
            panel.azim = view_azim
            panel.elev = view_elev
            panel.set_xlim3d(0,self.scaleFactor)
            panel.set_ylim3d(0,self.scaleFactor)
            panel.set_zlim3d(0,self.scaleFactor)
            panel.set_xticklabels([])
            panel.set_yticklabels([])
            panel.set_zticklabels([])
            panel.set_xticks([])
            panel.set_yticks([])
            panel.set_zticks([])
            return panel

        if(all):
            # axis info which is handed to all three axes of every panel
            myAXINFO = {
                'x': {'i': 0, 'tickdir': 1, 'juggled': (1, 0, 2),
                'color': (0, 0, 0, 0, 0)},
                'y': {'i': 1, 'tickdir': 0, 'juggled': (0, 1, 2),
                'color': (0, 0, 0, 0, 0)},
                'z': {'i': 2, 'tickdir': 0, 'juggled': (0, 2, 1),
                'color': (0, 0, 0, 0, 0)},
            }

            # (subplot position, azim, elev) for the three fixed views
            fixed_views = ((131, 0, 0),
                           (132, 90, 0),
                           (133, 0, 90))
            for position, view_azim, view_elev in fixed_views:
                panel = add_view(position, view_azim, view_elev, edgecolors=self.contigColours)
                for axis in panel.w_xaxis, panel.w_yaxis, panel.w_zaxis:
                    for elt in axis.get_ticklines() + axis.get_ticklabels():
                        elt.set_visible(False)
                panel.w_xaxis._AXINFO = myAXINFO
                panel.w_yaxis._AXINFO = myAXINFO
                panel.w_zaxis._AXINFO = myAXINFO
        else:
            # the single view uses small markers without edges
            panel = add_view(111, azim, elev, edgecolors='none', s=2)
            if(not showAxis):
                panel.set_axis_off()

        if(fileName != ""):
            try:
                if(all):
                    # three panels plus the two gaps between them
                    width = 3*primaryWidth+2*primarySpace
                else:
                    width = primaryWidth
                fig.set_size_inches(width,primaryWidth)
                plt.savefig(fileName,dpi=dpi,format=format)
            except:
                print "Error saving image",fileName, exc_info()[0]
                raise
        elif(show):
            try:
                plt.show()
            except:
                print "Error showing image", exc_info()[0]
                raise

        # only clean up figures which were created in here
        if made_here:
            plt.close(fig)