Exemplo n.º 1
0
    def extractContigs(self,
                       timer,
                       bids=[],
                       fasta=[],
                       prefix='',
                       cutoff=0):
        """Extract contigs and write to file

        Loads the profile for the requested bins, collects the sequences of
        the profile's contigs from the given contig files and writes one
        "<prefix>_bin_<bid>.fna" file per bin into self._outDir.

        timer  - handed to self.loadProfile
        bids   - bin ids, handed to self.loadProfile
        fasta  - contig file names; a name whose guessed encoding is 'gzip'
                 is opened with gzip.open, anything else with open
        prefix - output file prefix; None or '' means "use the basename of
                 self.dbFileName with '.gm' and '.sm' removed"
        cutoff - handed to self.loadProfile

        Errors raised while reading or writing are reported on stdout and
        then re-raised unchanged.
        """
        import mimetypes

        if prefix is None or prefix == '':
            prefix = os.path.basename(self.dbFileName) \
                            .replace(".gm", "") \
                            .replace(".sm", "")

        profile = self.loadProfile(timer, bids, cutoff)
        bm = BinManager(profile)

        # load all the contigs which have been assigned to bins
        cp = ContigParser()
        # contigs looks like cid->seq
        contigs = {}
        for file_name in fasta:
            # one try per file so the message names the file that failed
            # (the message used to always name fasta[0])
            try:
                gm_open = open
                try:
                    # handle gzipped files
                    mime = mimetypes.guess_type(file_name)
                    if mime[1] == 'gzip':
                        import gzip
                        gm_open = gzip.open
                except Exception:
                    print "Error when guessing contig file mimetype"
                    raise
                with gm_open(file_name, "r") as f:
                    cp.getWantedSeqs(f, profile.contigNames, out_dict=contigs)
            except Exception:
                print "Could not parse contig file:", file_name, sys.exc_info()[0]
                raise

        # now print out the sequences
        print "Writing files"
        for bid in bm.getBids():
            file_name = os.path.join(self._outDir, "%s_bin_%d.fna" % (prefix, bid))
            try:
                with open(file_name, 'w') as f:
                    for cid in bm.profile.contigNames[bm.getBinIndices(bid)]:
                        if cid in contigs:
                            f.write(">%s\n%s\n" % (cid, contigs[cid]))
                        else:
                            # binned contig that was not found in any input file
                            print "These are not the contigs you're looking for. ( %s )" % (cid)
            except Exception:
                print "Could not open file for writing:", file_name, sys.exc_info()[0]
                raise
Exemplo n.º 2
0
    def extractReads(self,
                     timer,
                     bids=[],
                     bams=[],
                     prefix="",
                     mixBams=False,
                     mixGroups=False,
                     mixReads=False,
                     interleaved=False,
                     bigFile=False,
                     headersOnly=False,
                     minMapQual=0,
                     maxMisMatches=1000,
                     useSuppAlignments=False,
                     useSecondaryAlignments=False,
                     threads=1,
                     verbose=False):
        """Extract reads from bam files and write to file

        All logic is handled by BamM <- soon to be wrapped by StoreM"""
        # load data
        profile = self.loadProfile(timer, bids)
        bm = BinManager(profile) # bins

        print "Extracting reads"

        # work out a set of targets to pass to the parser
        targets = []
        group_names = []
        for bid in bm.getBids():
            group_names.append("BIN_%d" % bid)
            row_indices = bm.getBinIndices(bid)
            targets.append(list(bm.profile.contigNames[row_indices]))

        # get something to parse the bams with
        bam_parser = BMBE(targets,
                          bams,
                          groupNames=group_names,
                          prefix=prefix,
                          outFolder=self._outDir,
                          mixBams=mixBams,
                          mixGroups=mixGroups,
                          mixReads=mixReads,
                          interleaved=interleaved,
                          bigFile=bigFile,
                          headersOnly=headersOnly,
                          minMapQual=minMapQual,
                          maxMisMatches=maxMisMatches,
                          useSuppAlignments=useSuppAlignments,
                          useSecondaryAlignments=useSecondaryAlignments)

        bam_parser.extract(threads=threads,
                           verbose=verbose)
Exemplo n.º 3
0
    def extractReads(self,
                     timer,
                     bids=[],
                     bams=[],
                     prefix="",
                     mixBams=False,
                     mixGroups=False,
                     mixReads=False,
                     interleaved=False,
                     bigFile=False,
                     headersOnly=False,
                     minMapQual=0,
                     maxMisMatches=1000,
                     useSuppAlignments=False,
                     useSecondaryAlignments=False,
                     threads=1,
                     verbose=False):
        """Extract reads from bam files and write to file

        All logic is handled by BamM <- soon to be wrapped by StoreM"""
        # load data
        profile = self.loadProfile(timer, bids)
        bm = BinManager(profile)  # bins

        print "Extracting reads"

        # work out a set of targets to pass to the parser
        targets = []
        group_names = []
        for bid in bm.getBids():
            group_names.append("BIN_%d" % bid)
            row_indices = bm.getBinIndices(bid)
            targets.append(list(bm.profile.contigNames[row_indices]))

        # get something to parse the bams with
        bam_parser = BMBE(targets,
                          bams,
                          groupNames=group_names,
                          prefix=prefix,
                          outFolder=self._outDir,
                          mixBams=mixBams,
                          mixGroups=mixGroups,
                          mixReads=mixReads,
                          interleaved=interleaved,
                          bigFile=bigFile,
                          headersOnly=headersOnly,
                          minMapQual=minMapQual,
                          maxMisMatches=maxMisMatches,
                          useSuppAlignments=useSuppAlignments,
                          useSecondaryAlignments=useSecondaryAlignments)

        bam_parser.extract(threads=threads, verbose=verbose)
Exemplo n.º 4
0
    def extractContigs(self, timer, bids=[], fasta=[], prefix='', cutoff=0):
        """Extract contigs and write to file

        Loads the profile for the requested bins, collects the sequences of
        the profile's contigs from the given contig files and writes one
        "<prefix>_bin_<bid>.fna" file per bin into self._outDir.

        timer  - handed to self.loadProfile
        bids   - bin ids, handed to self.loadProfile
        fasta  - contig file names; a name whose guessed encoding is 'gzip'
                 is opened with gzip.open, anything else with open
        prefix - output file prefix; None or '' means "use the basename of
                 self.dbFileName with '.gm' and '.sm' removed"
        cutoff - handed to self.loadProfile

        Errors raised while reading or writing are reported on stdout and
        then re-raised unchanged.
        """
        import mimetypes

        if prefix is None or prefix == '':
            prefix = os.path.basename(self.dbFileName) \
                            .replace(".gm", "") \
                            .replace(".sm", "")

        profile = self.loadProfile(timer, bids, cutoff)
        bm = BinManager(profile)

        # load all the contigs which have been assigned to bins
        cp = ContigParser()
        # contigs looks like cid->seq
        contigs = {}
        for file_name in fasta:
            # one try per file so the message names the file that failed
            # (the message used to always name fasta[0])
            try:
                gm_open = open
                try:
                    # handle gzipped files
                    mime = mimetypes.guess_type(file_name)
                    if mime[1] == 'gzip':
                        import gzip
                        gm_open = gzip.open
                except Exception:
                    print "Error when guessing contig file mimetype"
                    raise
                with gm_open(file_name, "r") as f:
                    cp.getWantedSeqs(f, profile.contigNames, out_dict=contigs)
            except Exception:
                print "Could not parse contig file:", file_name, sys.exc_info()[0]
                raise

        # now print out the sequences
        print "Writing files"
        for bid in bm.getBids():
            file_name = os.path.join(self._outDir,
                                     "%s_bin_%d.fna" % (prefix, bid))
            try:
                with open(file_name, 'w') as f:
                    for cid in bm.profile.contigNames[bm.getBinIndices(bid)]:
                        if cid in contigs:
                            f.write(">%s\n%s\n" % (cid, contigs[cid]))
                        else:
                            # binned contig that was not found in any input file
                            print "These are not the contigs you're looking for. ( %s )" % (
                                cid)
            except Exception:
                print "Could not open file for writing:", file_name, sys.exc_info()[0]
                raise
Exemplo n.º 5
0
    def extractMappingInfo(self,
                           timer,
                           bids=[],
                           prefix='',
                           separator='\t',
                           cutoff=0):
        """Extract markers from bins and write to file"""
        if prefix is None or prefix == '':
            prefix=os.path.basename(self.dbFileName) \
                            .replace(".gm", "") \
                            .replace(".sm", "")

        profile = self.loadProfile(timer, bids, cutoff)
        bm = BinManager(profile)
        mt = MarkerCheckTreePrinter(profile)

        # now print out the marker info
        print "Writing files"
        for bid in bm.getBids():
            file_name = os.path.join(self._outDir,
                                     "%s_bin_%d.txt" % (prefix, bid))

            bin_indices = bm.getBinIndices([bid])
            idx = np.flatnonzero(
                np.in1d(profile.mapping.rowIndices, bin_indices))

            labels = profile.mapping.markerNames[idx]
            cnames = profile.contigNames[profile.mapping.rowIndices[idx]]
            taxstrings = profile.mapping.taxstrings[idx]

            try:
                with open(file_name, 'w') as f:
                    #labels and lineages
                    f.write(
                        '#info table\n%s\n' %
                        separator.join(['label', 'taxonomy', 'contig_name']))
                    for (label, taxstring,
                         cname) in zip(labels, taxstrings, cnames):
                        f.write('%s\n' % separator.join(
                            [label, '\'%s\'' % taxstring, cname]))

                    #marker tree
                    f.write('\n#marker tree\n')
                    f.write(
                        mt.printTree(profile.mapping.rowIndices[idx],
                                     leaves_list=bin_indices))
            except:
                print "Could not open file for writing:", file_name, sys.exc_info(
                )[0]
                raise
Exemplo n.º 6
0
 def extractMappingInfo(self,
                        timer,
                        bids=[],
                        prefix='',
                        separator='\t',
                        cutoff=0
                        ):
     """Extract markers from bins and write to file"""
     if prefix is None or prefix == '':
         prefix=os.path.basename(self.dbFileName) \
                         .replace(".gm", "") \
                         .replace(".sm", "")
     
     profile = self.loadProfile(timer, bids, cutoff)
     bm = BinManager(profile)
     mt = MarkerCheckTreePrinter(profile)
     
     # now print out the marker info
     print "Writing files"
     for bid in bm.getBids():
         file_name = os.path.join(self._outDir, "%s_bin_%d.txt" % (prefix, bid))
         
         bin_indices = bm.getBinIndices([bid])
         idx = np.flatnonzero(np.in1d(profile.mapping.rowIndices, bin_indices))
         
         labels = profile.mapping.markerNames[idx]
         cnames = profile.contigNames[profile.mapping.rowIndices[idx]]
         taxstrings = profile.mapping.taxstrings[idx]
         
         try:
             with open(file_name, 'w') as f:
                 #labels and lineages
                 f.write('#info table\n%s\n' % separator.join(['label', 'taxonomy', 'contig_name']))
                 for (label, taxstring, cname) in zip(labels, taxstrings, cnames):
                     f.write('%s\n' % separator.join([label, '\'%s\'' % taxstring, cname]))
                 
                 #marker tree
                 f.write('\n#marker tree\n')
                 f.write(mt.printTree(profile.mapping.rowIndices[idx], leaves_list=bin_indices))
         except:
             print "Could not open file for writing:",file_name,sys.exc_info()[0]
             raise
Exemplo n.º 7
0
    def __init__(self, dbFileName, plot=False, force=False, numImgMaps=1):
        """Create the worker objects, the heat-map arrays and default settings.

        dbFileName - handed to ProfileManager
        plot       - stored as self.debugPlots
        force      - stored as self.forceWriting
        numImgMaps - number of image maps to allocate
        """
        # worker classes: data store first, the bin store is built on top of it
        self.PM = ProfileManager(dbFileName)
        self.BM = BinManager(pm=self.PM)

        # heat maps: raw and blurred stacks share one shape,
        # numImgMaps square maps of side PM.scaleFactor
        self.numImgMaps = numImgMaps
        side = self.PM.scaleFactor
        map_shape = (self.numImgMaps, side, side)
        self.imageMaps = np_zeros(map_shape)
        self.blurredMaps = np_zeros(map_shape)

        # lookup from image map positions back onto the transformed data
        self.im2RowIndicies = dict()

        # blur radius for the raw image maps; tune to suit the data
        self.blurRadius = 2
        # amount we can travel about when determining "hot spots"
        self.span = 30

        # thresholds: min number of contigs for a bin to be considered legit,
        # and the BP volume which overrides that minimum
        self.minSize = 10
        self.minVol = 1000000

        # behaviour flags
        self.forceWriting = force
        self.debugPlots = plot

        # counters: images printed so far / attempts at making a bin
        self.imageCounter = 1
        self.roundNumber = 0
Exemplo n.º 8
0
class ClusterEngine:
    """Top level interface for clustering contigs"""

    def __init__(self, dbFileName, plot=False, force=False, numImgMaps=1):
        """Create the worker objects, the heat-map arrays and default settings.

        dbFileName - handed to ProfileManager
        plot       - stored as self.debugPlots
        force      - stored as self.forceWriting
        numImgMaps - number of image maps to allocate
        """
        # worker classes: data store first, the bin store is built on top of it
        self.PM = ProfileManager(dbFileName)
        self.BM = BinManager(pm=self.PM)

        # heat maps: raw and blurred stacks share one shape,
        # numImgMaps square maps of side PM.scaleFactor
        self.numImgMaps = numImgMaps
        side = self.PM.scaleFactor
        map_shape = (self.numImgMaps, side, side)
        self.imageMaps = np_zeros(map_shape)
        self.blurredMaps = np_zeros(map_shape)

        # lookup from image map positions back onto the transformed data
        self.im2RowIndicies = dict()

        # blur radius for the raw image maps; tune to suit the data
        self.blurRadius = 2
        # amount we can travel about when determining "hot spots"
        self.span = 30

        # thresholds: min number of contigs for a bin to be considered legit,
        # and the BP volume which overrides that minimum
        self.minSize = 10
        self.minVol = 1000000

        # behaviour flags
        self.forceWriting = force
        self.debugPlots = plot

        # counters: images printed so far / attempts at making a bin
        self.imageCounter = 1
        self.roundNumber = 0

    def promptOnOverwrite(self, minimal=False):
        """Check that the user is ok with possibly overwriting the DB"""
        if self.PM.isClustered():
            if not self.forceWriting:
                input_not_ok = True
                valid_responses = ["Y", "N"]
                vrs = ",".join([str.lower(str(x)) for x in valid_responses])
                while input_not_ok:
                    if minimal:
                        option = raw_input(" Overwrite? (" + vrs + ") : ")
                    else:
                        option = raw_input(
                            " ****WARNING**** Database: '" + self.PM.dbFileName + "' has already been clustered.\n"
                            " If you continue you *MAY* overwrite existing bins!\n"
                            " Overwrite? (" + vrs + ") : "
                        )
                    if option.upper() in valid_responses:
                        print "****************************************************************"
                        if option.upper() == "N":
                            print "Operation cancelled"
                            return False
                        else:
                            break
                    else:
                        print "Error, unrecognised choice '" + option.upper() + "'"
                        minimal = True
            print "Overwriting database", self.PM.dbFileName
            self.PM.dataManager.nukeBins(self.PM.dbFileName)
        return True

    # ------------------------------------------------------------------------------
    # CORE CONSTRUCTION AND MANAGEMENT

    def makeCores(self, coreCut, minSize, minVol):
        """Cluster the contigs to make bin cores"""
        # check that the user is OK with nuking stuff...
        if not self.promptOnOverwrite():
            return False

        self.minVol = minVol
        self.minSize = minSize

        # get some data
        timer = gtime.TimeKeeper()
        self.PM.loadData(condition="length >= " + str(coreCut))
        print "    %s" % timer.getTimeStamp()

        # transform the data
        print "Apply data transformations"
        self.PM.transformCP()
        # plot the transformed space (if we've been asked to...)
        if self.debugPlots:
            self.PM.renderTransCPData()
        print "    %s" % timer.getTimeStamp()

        # cluster and bin!
        print "Create cores"
        cum_contigs_used_good = self.initialiseCores()
        print "    %s" % timer.getTimeStamp()

        # condense cores
        print "Refine cores [begin: %d]" % len(self.BM.bins)
        self.BM.autoRefineBins(iterate=True)
        num_binned = len(self.PM.binnedRowIndicies.keys())
        perc = "%.2f" % round((float(num_binned) / float(self.PM.numContigs)) * 100, 2)
        print "   ", num_binned, "contigs across", len(self.BM.bins.keys()), "cores (", perc, "% )"
        print "    %s" % timer.getTimeStamp()

        # Now save all the stuff to disk!
        print "Saving bins"
        self.BM.saveBins()
        print "    %s" % timer.getTimeStamp()

    def initialiseCores(self):
        """Process contigs and form CORE bins"""
        num_below_cutoff = 0  # how many consecutive attempts have produced small bins
        breakout_point = 100  # how many will we allow before we stop this loop

        # First we need to find the centers of each blob.
        # We can make a heat map and look for hot spots
        self.populateImageMaps()
        sub_counter = 0
        print "     .... .... .... .... .... .... .... .... .... ...."
        print "%4d" % sub_counter,
        new_line_counter = 0
        num_bins = 0
        ss = 0
        while num_below_cutoff < breakout_point:
            # if(num_bins > 70):
            #    break
            stdout.flush()
            # apply a gaussian blur to each image map to make hot spots
            # stand out more from the background
            self.blurMaps()

            # now search for the "hottest" spots on the blurred map
            # and check for possible bin centroids
            ss += 200
            putative_clusters = self.findNewClusterCenters(ss=ss)
            if putative_clusters is None:
                break
            else:
                bids_made = []
                partitions = putative_clusters[0]
                [max_blur_value, max_x, max_y] = putative_clusters[1]
                self.roundNumber += 1
                sub_round_number = 1
                for center_row_indices in partitions:
                    total_BP = sum([self.PM.contigLengths[i] for i in center_row_indices])
                    num_contigs = len(center_row_indices)
                    bin_size = num_contigs
                    # MM__print "Round: %d tBP: %d tC: %d" % (sub_round_number, total_BP, num_contigs)
                    if self.isGoodBin(total_BP, num_contigs, ms=5):  # Can we trust very small bins?.
                        # time to make a bin
                        bin = self.BM.makeNewBin(rowIndices=center_row_indices)
                        # MM__print "NEW:", total_BP, len(center_row_indices)
                        # work out the distribution in points in this bin
                        bin.makeBinDist(
                            self.PM.transformedCP, self.PM.averageCoverages, self.PM.kmerVals, self.PM.contigLengths
                        )

                        # Plot?
                        if self.debugPlots:
                            bin.plotBin(
                                self.PM.transformedCP,
                                self.PM.contigColours,
                                self.PM.kmerVals,
                                fileName="Image_" + str(self.imageCounter),
                            )
                            self.imageCounter += 1

                        # recruit more contigs
                        bin_size = bin.recruit(
                            self.PM.transformedCP,
                            self.PM.averageCoverages,
                            self.PM.kmerVals,
                            self.PM.contigLengths,
                            self.im2RowIndicies,
                            self.PM.binnedRowIndicies,
                            self.PM.restrictedRowIndicies,
                        )

                        if self.debugPlots:
                            self.plotHeat(
                                "HM_%d.%d.png" % (self.roundNumber, sub_round_number),
                                max=max_blur_value,
                                x=max_x,
                                y=max_y,
                            )
                            sub_round_number += 1

                        if self.isGoodBin(self, bin.totalBP, bin_size):
                            # Plot?
                            bids_made.append(bin.id)
                            num_bins += 1
                            if self.debugPlots:
                                bin.plotBin(
                                    self.PM.transformedCP,
                                    self.PM.contigColours,
                                    self.PM.kmerVals,
                                    fileName="P_BIN_%d" % (bin.id),
                                )

                            # append this bins list of mapped rowIndices to the main list
                            self.updatePostBin(bin)
                            num_below_cutoff = 0
                            new_line_counter += 1
                            print "% 4d" % bin_size,
                        else:
                            # we just throw these indices away for now
                            self.restrictRowIndicies(bin.rowIndices)
                            self.BM.deleteBins([bin.id], force=True)
                            new_line_counter += 1
                            num_below_cutoff += 1
                            print str(bin_size).rjust(4, "X"),

                    else:
                        # this partition was too small, restrict these guys we don't run across them again
                        self.restrictRowIndicies(center_row_indices)
                        num_below_cutoff += 1
                        # new_line_counter += 1
                        # print center_row_indices
                        # print str(bin_size).rjust(4,'Y'),

                    # make the printing prettier
                    if new_line_counter > 9:
                        new_line_counter = 0
                        sub_counter += 10
                        print "\n%4d" % sub_counter,

                # did we do anything?
                num_bids_made = len(bids_made)
                if num_bids_made == 0:
                    # nuke the lot!
                    for row_indices in partitions:
                        self.restrictRowIndicies(row_indices)

        print "\n     .... .... .... .... .... .... .... .... .... ...."

        # now we need to update the PM's binIds
        bids = self.BM.getBids()
        for bid in bids:
            for row_index in self.BM.bins[bid].rowIndices:
                self.PM.binIds[row_index] = bid

    def isGoodBin(self, totalBP, binSize, ms=0):
        """Does this bin meet my exacting requirements?

        totalBP - total number of base pairs in the bin
        binSize - number of contigs in the bin
        ms      - minimum-size override; 0 means use self.minSize

        A bin is good when it holds at least self.minVol base pairs, or
        failing that, strictly more than the minimum number of contigs.
        """
        # 0 is the "let the user choose" sentinel
        required_contigs = self.minSize if ms == 0 else ms

        # enough bp passes regardless of the number of contigs
        if not (totalBP < self.minVol):
            return True

        # below the volume threshold: needs enough contigs
        return True if binSize > required_contigs else False

    def findNewClusterCenters(self, ss=0):
        """Find a putative cluster

        Starts from the hottest point of the first blurred map, refines it
        to the densest point of a blurred 3D block built from the still
        unassigned contigs around it, and collects the row indices lying
        within self.span of that point.

        ss - not used anywhere in this method's body

        Returns
          None                       when no row index is found near the
                                     dense point, or the k-mer partitioning
                                     yields no partitions
          [partitions, ret_values]   otherwise, where partitions is a list
                                     of arrays of row indices and
                                     ret_values is [max_value, max_x, max_y]
                                     of the initial peak in blurredMaps[0]
                                     (taken before the refinement step).
        A single array holding all candidates is returned as the only
        partition when there is just one candidate or the candidates do not
        pass isGoodBin(..., ms=5); the caller may then restrict them.
        """

        # half-open interval test: l <= x < u
        inRange = lambda x, l, u: x >= l and x < u

        # we work from the top view as this has the base clustering
        max_index = np_argmax(self.blurredMaps[0])
        max_value = self.blurredMaps[0].ravel()[max_index]

        # turn the flat argmax index back into 2D map coordinates
        max_x = int(max_index / self.PM.scaleFactor)
        max_y = max_index - self.PM.scaleFactor * max_x
        max_z = -1

        ret_values = [max_value, max_x, max_y]

        # the first search window is 1.5x the normal span on either side
        start_span = int(1.5 * self.span)
        span_len = 2 * start_span + 1

        if self.debugPlots:
            self.plotRegion(max_x, max_y, max_z, fileName="Image_" + str(self.imageCounter), tag="column", column=True)
            self.imageCounter += 1

        # make a 3d grid to hold the values
        working_block = np_zeros((span_len, span_len, self.PM.scaleFactor))

        # go through the entire column
        # NOTE(review): makeCoordRanges is defined elsewhere; presumably it
        # returns a (lower, upper) window around the given centre - confirm
        (x_lower, x_upper) = self.makeCoordRanges(max_x, start_span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, start_span)
        super_putative_row_indices = []
        # keys of im2RowIndicies are indexed as p[0], p[1], p[2] below
        for p in self.im2RowIndicies:
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper):
                for row_index in self.im2RowIndicies[p]:
                    # check that the point is real and that it has not yet been binned
                    if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                        # this is an unassigned point.
                        # weight the point by log10 of its contig length
                        multiplier = np_log10(self.PM.contigLengths[row_index])
                        self.incrementAboutPoint3D(
                            working_block, p[0] - x_lower, p[1] - y_lower, p[2], multiplier=multiplier
                        )
                        super_putative_row_indices.append(row_index)

        # blur and find the highest value
        # (sigma is fixed at 8 here; self.blurRadius is not used)
        bwb = ndi.gaussian_filter(working_block, 8)  # self.blurRadius)
        densest_index = np_unravel_index(np_argmax(bwb), (np_shape(bwb)))
        # shift block-local x/y back into map coordinates
        max_x = densest_index[0] + x_lower
        max_y = densest_index[1] + y_lower
        max_z = densest_index[2]

        # now get the basic color of this dense point
        putative_center_row_indices = []

        # the z window is twice as wide as the x / y windows
        (x_lower, x_upper) = self.makeCoordRanges(max_x, self.span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, self.span)
        (z_lower, z_upper) = self.makeCoordRanges(max_z, 2 * self.span)

        for row_index in super_putative_row_indices:
            # compare the rounded transformed coordinates against the windows
            p = np_around(self.PM.transformedCP[row_index])
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper) and inRange(p[2], z_lower, z_upper):
                # we are within the range!
                putative_center_row_indices.append(row_index)

        # make sure we have something to go on here
        if np_size(putative_center_row_indices) == 0:
            # it's all over!
            return None

        if np_size(putative_center_row_indices) == 1:
            # get out of here but keep trying
            # the calling function may restrict these indices
            return [[np_array(putative_center_row_indices)], ret_values]
        else:
            total_BP = sum([self.PM.contigLengths[i] for i in putative_center_row_indices])
            if not self.isGoodBin(total_BP, len(putative_center_row_indices), ms=5):  # Can we trust very small bins?.
                # get out of here but keep trying
                # the calling function should restrict these indices
                return [[np_array(putative_center_row_indices)], ret_values]
            else:
                # we've got a few good guys here, partition them up!
                # shift these guys around a bit
                center_k_vals = np_array([self.PM.kmerVals[i] for i in putative_center_row_indices])
                k_partitions = self.partitionVals(center_k_vals)

                if len(k_partitions) == 0:
                    return None
                else:
                    # partition on the last transformed coordinate as well,
                    # rescaled in place to start at 0 and (when not constant)
                    # end at 1
                    center_c_vals = np_array([self.PM.transformedCP[i][-1] for i in putative_center_row_indices])
                    # center_c_vals = np_array([self.PM.averageCoverages[i] for i in putative_center_row_indices])
                    center_c_vals -= np_min(center_c_vals)
                    c_max = np_max(center_c_vals)
                    if c_max != 0:
                        center_c_vals /= c_max
                    c_partitions = self.partitionVals(center_c_vals)

                    # take the intersection of the two partitions
                    # hash 1: position in the candidate list -> k-mer partition id
                    tmp_partition_hash_1 = {}
                    id = 1
                    for p in k_partitions:
                        for i in p:
                            tmp_partition_hash_1[i] = id
                        id += 1

                    # hash 2: (k-mer partition id, c partition id) -> positions
                    # NOTE(review): a position present in c_partitions but in
                    # no k partition raises KeyError on the except path too
                    tmp_partition_hash_2 = {}
                    id = 1
                    for p in c_partitions:
                        for i in p:
                            try:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)].append(i)
                            except KeyError:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)] = [i]
                        id += 1

                    # map positions back to real row indices, one array per
                    # (k, c) partition pair
                    partitions = [
                        np_array([putative_center_row_indices[i] for i in tmp_partition_hash_2[key]])
                        for key in tmp_partition_hash_2.keys()
                    ]

                    # pcs = [[self.PM.averageCoverages[i] for i in p] for p in partitions]
                    # print pcs
                    return [partitions, ret_values]

    def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
        """Expand a selection left and right from a starting index in a list of values

        Neighbours are taken in sorted-value order, alternating between the
        next smaller and the next larger value. A neighbour is accepted
        while it differs from the starting value by less than maxSpread and
        the stdev of all accepted values stays below stdevCutoff; a side
        stops for good at its first rejection.

        Return a sorted list of indices into the original list
        """
        anchor = vals[startIndex]
        chosen = [startIndex]      # indices we will give back
        accepted_vals = [anchor]   # values belonging to those indices

        order = np_argsort(vals)
        last_pos = len(vals) - 1

        # locate the starting index within the sorted ordering
        # (ends at len(vals) when it is not found)
        pos = 0
        while pos <= last_pos and order[pos] != startIndex:
            pos += 1
        low = high = pos

        def try_extend(sorted_pos):
            """Accept the value at sorted_pos if it keeps the selection tight."""
            candidate = vals[order[sorted_pos]]
            if not (np_abs(candidate - anchor) < maxSpread):
                return False
            if not (np_std(accepted_vals + [candidate]) < stdevCutoff):
                return False
            accepted_vals.append(candidate)
            chosen.append(order[sorted_pos])
            return True

        grow_down = grow_up = True
        while grow_down or grow_up:
            if grow_down:
                grow_down = low > 0 and try_extend(low - 1)
                if grow_down:
                    low -= 1
            if grow_up:
                grow_up = high < last_pos and try_extend(high + 1)
                if grow_up:
                    high += 1
        return sorted(chosen)

    def partitionVals(self, vals, stdevCutoff=0.04, maxSpread=0.15):
        """Split vals into groups of similar values

        While more than two values remain: ask a CenterFinder for the centre
        of the remaining values, grow a selection around it with
        expandSelection, record that selection as a partition and drop it from
        the pool. Whatever is left over at the end forms a final partition.

        Returns a list of partitions, each holding indices into vals
        """
        partitions = []
        remaining = list(vals)
        # position in remaining -> index into the original vals
        fix_dict = dict((i, i) for i in range(len(remaining)))
        while len(remaining) > 2:
            cf = CenterFinder()
            c_index = cf.findArrayCenter(remaining)
            selected = self.expandSelection(c_index, remaining, stdevCutoff=stdevCutoff, maxSpread=maxSpread)
            # translate back to indices into the original list
            partitions.append([fix_dict[i] for i in selected])

            # keep everything that was not selected, preserving order,
            # and renumber the lookup to match the shortened list
            taken = set(selected)
            survivors = [i for i in range(len(remaining)) if i not in taken]
            fix_dict = dict((new_i, fix_dict[old_i]) for (new_i, old_i) in enumerate(survivors))
            remaining = [remaining[old_i] for old_i in survivors]

        if len(remaining) > 0:
            partitions.append(fix_dict.values())
        return partitions

    # ------------------------------------------------------------------------------
    # CORE MANAGEMENT

    def condenseCores(self, auto=False):
        """Itterative wrapper for the BinManager method"""
        condensing_round = 0
        num_cores_condensed = 0
        while True:  # do while loop anyone?
            condensing_round += 1
            (num_cores_condensed, continue_merge) = self.BM.condenseBins(verbose=True, auto=auto)
            if num_cores_condensed == 0:
                break
            else:
                print "    Core condensing round:", condensing_round, "Incorporated", num_cores_condensed, "cores into larger cores"

        num_binned = len(self.PM.binnedRowIndicies.keys())
        perc = "%.2f" % round((float(num_binned) / float(self.PM.numContigs)) * 100, 2)
        print "   ", num_binned, "contigs are distributed across", len(self.BM.bins.keys()), "cores (", perc, "% )"

        return

    def removeOutliersWrapper(self, mode="kmer"):
        """remove the outliers for all bins"""
        print "    Removing outliers"
        for bid in self.BM.bins:
            self.removeOutliers(bid, mode=mode)

    def removeOutliers(self, bid, fixBinnedRI=True, mode="kmer"):
        """Find the outliers of one bin and purge them from it

        bid         - id of the bin to clean
        fixBinnedRI - when true, each outlier is also handed to
                      setRowIndexUnassigned before the purge
        mode        - passed through to the bin's findOutliers
        """
        target = self.BM.bins[bid]
        dead_row_indices = target.findOutliers(self.PM.transformedCP, self.PM.kmerVals, mode=mode)
        if len(dead_row_indices) == 0:
            return  # nothing to remove

        if fixBinnedRI:
            for row_index in dead_row_indices:
                self.setRowIndexUnassigned(row_index)

        # NOTE(review): kmerVals is handed over twice (4th and 6th argument);
        # confirm against the signature of purge
        target.purge(
            dead_row_indices,
            self.PM.transformedCP,
            self.PM.averageCoverages,
            self.PM.kmerVals,
            self.PM.contigLengths,
            self.PM.kmerVals,
        )

    # ------------------------------------------------------------------------------
    # DATA MAP MANAGEMENT

    def populateImageMaps(self):
        """Rebuild the image maps and the point lookup from the transformed data

        Both structures are reset first. Every contig which is neither binned
        nor restricted is then registered in im2RowIndicies under its rounded
        coordinates and added to the maps via incrementViaRowIndex
        """
        side = self.PM.scaleFactor
        self.imageMaps = np_zeros((self.numImgMaps, side, side))
        self.im2RowIndicies = {}

        for row_index, point in enumerate(np_around(self.PM.transformedCP)):
            # can only bin things once!
            if row_index in self.PM.binnedRowIndicies or row_index in self.PM.restrictedRowIndicies:
                continue

            # remember which rows landed on this grid position so the
            # map can be related back to individual points later
            p = tuple(point)
            self.im2RowIndicies.setdefault(p, []).append(row_index)

            # bump the position itself plus its sides and corners
            self.incrementViaRowIndex(row_index, p)

    def incrementViaRowIndex(self, rowIndex, point=None):
        """Add one contig to the image maps

        point defaults to the rounded transformed coordinates of rowIndex.
        The amount added is scaled by log10 of the contig's length. View 0 is
        always updated; views 1 and 2 only when more than one map is in use
        """
        if point is None:
            point = tuple(np_around(self.PM.transformedCP[rowIndex]))
        weight = np_log10(self.PM.contigLengths[rowIndex])

        bump = self.incrementAboutPoint
        bump(0, point[0], point[1], multiplier=weight)
        if self.numImgMaps > 1:
            # the other two views use axes mirrored about the scale factor
            flipped_z = self.PM.scaleFactor - point[2] - 1
            flipped_x = self.PM.scaleFactor - point[0] - 1
            bump(1, flipped_z, point[1], multiplier=weight)
            bump(2, flipped_z, flipped_x, multiplier=weight)

    def decrementViaRowIndex(self, rowIndex, point=None):
        """Take one contig back out of the image maps

        point defaults to the rounded transformed coordinates of rowIndex.
        The amount removed is scaled by log10 of the contig's length. View 0
        is always updated; views 1 and 2 only when more than one map is in use
        """
        if point is None:
            point = tuple(np_around(self.PM.transformedCP[rowIndex]))
        weight = np_log10(self.PM.contigLengths[rowIndex])

        drop = self.decrementAboutPoint
        drop(0, point[0], point[1], multiplier=weight)
        if self.numImgMaps > 1:
            # the other two views use axes mirrored about the scale factor
            flipped_z = self.PM.scaleFactor - point[2] - 1
            flipped_x = self.PM.scaleFactor - point[0] - 1
            drop(1, flipped_z, point[1], multiplier=weight)
            drop(2, flipped_z, flipped_x, multiplier=weight)

    def incrementAboutPoint(self, view_index, px, py, valP=1, valS=0.6, valC=0.2, multiplier=1):
        """Add weight to a point of one 2D image map and to its neighbours

        The point itself gains valP, the four side neighbours gain valS and
        the four corner neighbours gain valC, each scaled by multiplier.
        Neighbours which would fall outside 0 .. scaleFactor - 1 are skipped
        """
        # indexed by the number of axes a neighbour is displaced along
        weights = (valP * multiplier, valS * multiplier, valC * multiplier)
        last = self.PM.scaleFactor - 1
        for dx in (-1, 0, 1):
            if (dx < 0 and not px > 0) or (dx > 0 and not px < last):
                continue  # that row is off the map
            for dy in (-1, 0, 1):
                if (dy < 0 and not py > 0) or (dy > 0 and not py < last):
                    continue  # that column is off the map
                self.imageMaps[view_index, px + dx, py + dy] += weights[abs(dx) + abs(dy)]

    def decrementAboutPoint(self, view_index, px, py, valP=1, valS=0.6, valC=0.2, multiplier=1):
        """Remove weight from a point of one 2D image map and from its neighbours

        The point itself loses valP, the four side neighbours lose valS and
        the four corner neighbours lose valC, each scaled by multiplier. Every
        subtraction goes through safeDecrement. Neighbours which would fall
        outside 0 .. scaleFactor - 1 are skipped
        """
        # indexed by the number of axes a neighbour is displaced along
        weights = (valP * multiplier, valS * multiplier, valC * multiplier)
        last = self.PM.scaleFactor - 1
        for dx in (-1, 0, 1):
            if (dx < 0 and not px > 0) or (dx > 0 and not px < last):
                continue  # that row is off the map
            for dy in (-1, 0, 1):
                if (dy < 0 and not py > 0) or (dy > 0 and not py < last):
                    continue  # that column is off the map
                self.safeDecrement(self.imageMaps[view_index], px + dx, py + dy, weights[abs(dx) + abs(dy)])

    def safeDecrement(self, map, px, py, value):
        """Subtract value from map[px][py], clamping the result at zero

        Anything which ends up below the float machine epsilon (including
        negative values) is stored as 0
        """
        row = map[px]
        row[py] -= value
        if row[py] < np_finfo(float).eps:
            row[py] = 0

    def incrementAboutPoint3D(self, workingBlock, px, py, pz, vals=(6.4, 4.9, 2.5, 1.6), multiplier=1):
        """Add weight around a point in a 3D working block

        Used when finding the centroid of a hot area. The slice holding the
        point is updated starting at vals[0]; the slices directly above and
        below (when they exist within 0 .. scaleFactor - 1) are updated
        starting at vals[1]. All vals are scaled by multiplier first.
        z spans the height of the entire column, x and y have been offset to
        match the column subspace
        """
        scaled = [v * multiplier for v in vals]
        top = self.PM.scaleFactor - 1
        # (z displacement, offset into scaled): above, centre, below
        for dz, offset in ((1, 1), (0, 0), (-1, 1)):
            if dz > 0 and not pz < top:
                continue  # no slice above
            if dz < 0 and not pz > 0:
                continue  # no slice below
            self.subIncrement3D(workingBlock, px, py, pz + dz, scaled, offset)

    def subIncrement3D(self, workingBlock, px, py, pz, vals, offset):
        """AUX: update one z slice on behalf of incrementAboutPoint3D

        Within slice pz the point gains vals[offset], its side neighbours
        vals[offset + 1] and its corner neighbours vals[offset + 2].
        Neighbours outside the x / y extent of workingBlock are skipped
        """
        extent = np_shape(workingBlock)
        last_x = extent[0] - 1
        last_y = extent[1] - 1
        for dx in (-1, 0, 1):
            if (dx < 0 and not px > 0) or (dx > 0 and not px < last_x):
                continue  # that row is outside the block
            for dy in (-1, 0, 1):
                if (dy < 0 and not py > 0) or (dy > 0 and not py < last_y):
                    continue  # that column is outside the block
                workingBlock[px + dx, py + dy, pz] += vals[offset + abs(dx) + abs(dy)]

    def blurMaps(self):
        """Fill blurredMaps with a gaussian blurred copy of every 2D image map

        The filter is applied with a fixed sigma of 8
        """
        side = self.PM.scaleFactor
        self.blurredMaps = np_zeros((self.numImgMaps, side, side))
        for view in range(self.numImgMaps):  # top, front and side
            source = self.imageMaps[view, :, :]
            self.blurredMaps[view, :, :] = ndi.gaussian_filter(source, 8)  # self.blurRadius)

    def makeCoordRanges(self, pos, span):
        """Return (lower, upper) bounds of a search window around pos

        The window reaches span either side of pos; lower is clamped at 0 and
        the exclusive upper bound is clamped at the scale factor
        """
        lower = max(pos - span, 0)
        upper = min(pos + span + 1, self.PM.scaleFactor)
        return (lower, upper)

    def updatePostBin(self, bin):
        """Mark every row index of a freshly made bin as assigned"""
        assign = self.setRowIndexAssigned
        for member in bin.rowIndices:
            assign(member)

    def setRowIndexAssigned(self, rowIndex):
        """Record that rowIndex now belongs to a bin

        Rows which are restricted or already binned are left alone. Otherwise
        the row is flagged as binned and taken out of the image maps.

        Use only during initial core creation
        """
        if rowIndex in self.PM.restrictedRowIndicies or rowIndex in self.PM.binnedRowIndicies:
            return
        self.PM.binnedRowIndicies[rowIndex] = True
        # the contig no longer counts towards the image maps
        self.decrementViaRowIndex(rowIndex)

    def setRowIndexUnassigned(self, rowIndex):
        """Record that rowIndex no longer belongs to a bin

        Only rows which are currently binned and not restricted are touched:
        the binned flag is removed and the row is put back into the image
        maps. Any other row is left alone.

        Use only during initial core creation
        """
        # The test must require the row to BE binned; deleting a key which is
        # not in binnedRowIndicies raises KeyError
        if rowIndex not in self.PM.restrictedRowIndicies and rowIndex in self.PM.binnedRowIndicies:
            del self.PM.binnedRowIndicies[rowIndex]
            # now update the image map, increment
            self.incrementViaRowIndex(rowIndex)

    def restrictRowIndicies(self, indices):
        """Flag the given row indices as restricted

        Rows which are already restricted or binned are skipped; every newly
        restricted row is also taken out of the image maps
        """
        restricted = self.PM.restrictedRowIndicies
        binned = self.PM.binnedRowIndicies
        for row_index in indices:
            if row_index in restricted or row_index in binned:
                continue
            restricted[row_index] = True
            # the contig no longer counts towards the image maps
            self.decrementViaRowIndex(row_index)

    # ------------------------------------------------------------------------------
    # IO and IMAGE RENDERING

    def plotRegion(self, px, py, pz, fileName="", tag="", column=False):
        """Scatter plot the unassigned contigs surrounding a point

        Contigs within self.span of (px, py, pz) are drawn in their own
        colours; contigs within half that span are added a second time using
        the colour htr(0, 0, 0) to mark the focus. With column set the first
        pass covers z from 0 up to (not including) scaleFactor - 1. The figure
        is saved to fileName when one is given
        """
        import matplotlib as mpl

        def free_rows(x_range, y_range, z_range):
            """Yield row indices of unbinned, unrestricted contigs inside a box"""
            for z in range(z_range[0], z_range[1]):
                realz = self.PM.scaleFactor - z - 1
                for x in range(x_range[0], x_range[1]):
                    for y in range(y_range[0], y_range[1]):
                        for row_index in self.im2RowIndicies.get((x, y, realz), ()):
                            if row_index in self.PM.binnedRowIndicies:
                                continue
                            if row_index in self.PM.restrictedRowIndicies:
                                continue
                            yield row_index

        disp_vals = np_array([])
        disp_cols = np_array([])
        num_points = 0

        # first pass: everything within span
        z_range = self.makeCoordRanges(pz, self.span)
        if column:
            # NOTE(review): range() excludes its upper bound, so the slice at
            # scaleFactor - 1 is not visited here; confirm this is intended
            z_range = (0, self.PM.scaleFactor - 1)
        x_range = self.makeCoordRanges(px, self.span)
        y_range = self.makeCoordRanges(py, self.span)
        for row_index in free_rows(x_range, y_range, z_range):
            num_points += 1
            disp_vals = np_append(disp_vals, self.PM.transformedCP[row_index])
            disp_cols = np_append(disp_cols, self.PM.contigColours[row_index])

        # second pass: make a black mark at the max values
        small_span = self.span / 2
        x_range = self.makeCoordRanges(px, small_span)
        y_range = self.makeCoordRanges(py, small_span)
        z_range = self.makeCoordRanges(pz, small_span)
        for row_index in free_rows(x_range, y_range, z_range):
            num_points += 1
            disp_vals = np_append(disp_vals, self.PM.transformedCP[row_index])
            disp_cols = np_append(disp_cols, htr(0, 0, 0))

        # np_append flattens, so fold back into one (x, y, z) row per point
        disp_vals = np_reshape(disp_vals, (num_points, 3))
        disp_cols = np_reshape(disp_cols, (num_points, 3))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
        cm = mpl.colors.LinearSegmentedColormap("my_colormap", disp_cols, 1024)
        ax.scatter(
            disp_vals[:, 0],
            disp_vals[:, 1],
            disp_vals[:, 2],
            edgecolors=disp_cols,
            c=disp_cols,
            cmap=cm,
            marker=".",
        )
        focus = ["Focus at: (", str(px), str(py), str(self.PM.scaleFactor - pz - 1), ")\n", tag]
        plt.title(" ".join(focus))

        if fileName != "":
            fig.set_size_inches(6, 6)
            plt.savefig(fileName, dpi=300)
        # NOTE(review): show is not defined in this method; presumably a
        # module level name, confirm
        elif show:
            plt.show()

        plt.close(fig)
        del fig

    def plotHeat(self, fileName="", max=-1, x=-1, y=-1):
        """Draw the blurred and the raw image maps next to each other

        With a single map the layout is one row of two panels, otherwise two
        rows of three (blurred above raw). All panels show the square root of
        the map. When max is positive the first panel gets a title stating it.
        The figure is saved to fileName when one is given.

        Useful for debugging
        """
        fig = plt.figure()
        images = []
        single_view = self.numImgMaps == 1

        # (subplot code, attribute holding the maps, view index) in draw order
        if single_view:
            panels = [(121, "blurredMaps", 0), (122, "imageMaps", 0)]
        else:
            panels = [
                (231, "blurredMaps", 0),
                (232, "blurredMaps", 1),
                (233, "blurredMaps", 2),
                (234, "imageMaps", 0),
                (235, "imageMaps", 1),
                (236, "imageMaps", 2),
            ]

        for position, (code, source, view) in enumerate(panels):
            ax = fig.add_subplot(code)
            images.append(ax.imshow(getattr(self, source)[view, :, :] ** 0.5))
            if position == 0 and max > 0:
                # only the first panel carries the title
                if single_view:
                    title = "Max value: %f (%f, %f)" % (max, x, y)
                else:
                    title = " ".join(["Max value:", str(max)])
                plt.title(title)

        if fileName != "":
            if single_view:
                fig.set_size_inches(12, 6)
            else:
                fig.set_size_inches(18, 18)
            plt.savefig(fileName, dpi=300)
        # NOTE(review): show is not defined in this method; presumably a
        # module level name, confirm
        elif show:
            plt.show()

        plt.close(fig)
        del fig
Exemplo n.º 9
0
    def dumpBinStats(self,
                     timer,
                     fields,
                     outFile,
                     separator,
                     useHeaders
                    ):
        """Compute bin statistics"""
        
        # load all the contigs which have been assigned to bins
        profile = self.loadProfile(timer)
        bm = BinManager(profile)
        
        stats = bm.getBinStats()
        
        #data to output
        header_strings = []
        data_arrays = []
        data_converters = []
        
        for field in fields:
            if field == 'bins':
                header_strings.append('bid')
                data_arrays.append(stats.bids)
                data_converters.append(lambda x: str(x))
                
            elif field == 'points':
                header_strings.append('num_contigs')
                data_arrays.append(stats.numContigs)
                data_converters.append(lambda x: str(x))
            
            elif field == 'sizes':
                header_strings.append('size')
                data_arrays.append(stats.sizes)
                data_converters.append(lambda x: str(x))
                
            elif field == 'lengths':
                header_strings.append('length_min')
                header_strings.append('length_median')
                header_strings.append('length_max')
                data_arrays.append(stats.lengthRanges[:,0])
                data_arrays.append(stats.lengthMedians)
                data_arrays.append(stats.lengthRanges[:,1])
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))
            
            elif field == 'gc':
                header_strings.append("GC%_mean")
                header_strings.append("GC%_std")
                data_arrays.append(stats.GCMeans)
                data_arrays.append(stats.GCStdDevs)
                data_converters.append(lambda x : "%0.4f" % x)
                data_converters.append(lambda x : "%0.4f" % x)
            
            elif field == 'coverage':
                stoits = profile.stoitNames
                header_strings.append(separator.join([separator.join([i + "_mean", i + "_std"]) for i in stoits]))
                interleaved = np.dsplit(np.transpose(np.dstack((stats.covMeans, stats.covStdDevs)), axes=[0, 2, 1]))
                data_arrays.append(interleaved)
                data_converters.append(lambda x: separator.join(["%0.4f"+separator+"%0.4f" % i for i in x]))

            elif field == 'tags':
                header_strings.append('taxonomy')
                data_arrays.append(stats.tags)
                data_converters.append(lambda x: x)
            
        # now print out the sequences
        try:
            with open(outFile, 'w') as f:
                if useHeaders:
                    header = separator.join(header_strings) + '\n'
                    f.write(header)
                
                num_rows = len(data_arrays[0])
                for i in range(num_rows):
                    row = separator.join([conv(arr[i]) for (conv, arr) in zip(data_converters, data_arrays)])
                    f.write(row+'\n')
        except:
            print "Could not open file for writing:",outFile,sys.exc_info()[0]
            raise
Exemplo n.º 10
0
    def dumpBinStats(self, timer, fields, outFile, separator, useHeaders):
        """Compute bin statistics"""

        # load all the contigs which have been assigned to bins
        profile = self.loadProfile(timer)
        bm = BinManager(profile)

        stats = bm.getBinStats()

        #data to output
        header_strings = []
        data_arrays = []
        data_converters = []

        for field in fields:
            if field == 'bins':
                header_strings.append('bid')
                data_arrays.append(stats.bids)
                data_converters.append(lambda x: str(x))

            elif field == 'points':
                header_strings.append('num_contigs')
                data_arrays.append(stats.numContigs)
                data_converters.append(lambda x: str(x))

            elif field == 'sizes':
                header_strings.append('size')
                data_arrays.append(stats.sizes)
                data_converters.append(lambda x: str(x))

            elif field == 'lengths':
                header_strings.append('length_min')
                header_strings.append('length_median')
                header_strings.append('length_max')
                data_arrays.append(stats.lengthRanges[:, 0])
                data_arrays.append(stats.lengthMedians)
                data_arrays.append(stats.lengthRanges[:, 1])
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))

            elif field == 'gc':
                header_strings.append("GC%_mean")
                header_strings.append("GC%_std")
                data_arrays.append(stats.GCMeans)
                data_arrays.append(stats.GCStdDevs)
                data_converters.append(lambda x: "%0.4f" % x)
                data_converters.append(lambda x: "%0.4f" % x)

            elif field == 'coverage':
                stoits = profile.stoitNames
                header_strings.append(
                    separator.join([
                        separator.join([i + "_mean", i + "_std"])
                        for i in stoits
                    ]))
                interleaved = np.dsplit(
                    np.transpose(np.dstack((stats.covMeans, stats.covStdDevs)),
                                 axes=[0, 2, 1]))
                data_arrays.append(interleaved)
                data_converters.append(lambda x: separator.join(
                    ["%0.4f" + separator + "%0.4f" % i for i in x]))

            elif field == 'tags':
                header_strings.append('taxonomy')
                data_arrays.append(stats.tags)
                data_converters.append(lambda x: x)

        # now print out the sequences
        try:
            with open(outFile, 'w') as f:
                if useHeaders:
                    header = separator.join(header_strings) + '\n'
                    f.write(header)

                num_rows = len(data_arrays[0])
                for i in range(num_rows):
                    row = separator.join([
                        conv(arr[i])
                        for (conv, arr) in zip(data_converters, data_arrays)
                    ])
                    f.write(row + '\n')
        except:
            print "Could not open file for writing:", outFile, sys.exc_info(
            )[0]
            raise