class CoreCreator:
    """Top level class for making bins"""

    def __init__(self, dbFileName):
        self._pm = ProfileManager(dbFileName)
        self._dbFileName = dbFileName

    def loadProfile(self, timer, minLength):
        return self._pm.loadData(timer,
                                 minLength=minLength,
                                 loadMarkers=True,
                                 loadBins=False)

    def run(self,
            timer,
            minLength,
            minSize,
            minPts,
            savedDistsPrefix="",
            keepDists=False,
            force=False):
        # check that the user is OK with nuking stuff...
        if not force and not self._pm.promptOnOverwrite():
            return

        profile = self.loadProfile(timer, minLength=minLength)

        if savedDistsPrefix == "":
            savedDistsPrefix = self._dbFileName + ".dists"
        cacher = FileCacher(savedDistsPrefix)

        ce = ClassificationClusterEngine(profile,
                                         minPts=minPts,
                                         minSize=minSize,
                                         cacher=cacher)
        ce.makeBins(timer,
                    out_bins=profile.binIds,
                    out_reach_order=profile.reachOrder,
                    out_reach_dists=profile.reachDists)

        # Now save all the stuff to disk!
        print "Saving bins"
        self._pm.setReachabilityOrder(profile)
        self._pm.setBinAssignments(profile, nuke=True)
        print " %s" % timer.getTimeStamp()

        # Remove created files
        if not keepDists:
            cacher.cleanup()
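

# A minimal usage sketch for CoreCreator, assuming an existing GroopM
# database file; the file name and parameter values are illustrative only
# (gtime.TimeKeeper mirrors the timer construction in ClusterEngine.makeCores
# below). This helper is not called anywhere in the module.
def _exampleMakeCores(dbFileName="metagenome.gm"):
    timer = gtime.TimeKeeper()
    cc = CoreCreator(dbFileName)
    cc.run(timer,
           minLength=1000,  # ignore contigs shorter than 1000 bp
           minSize=10,      # minimum number of contigs per bin
           minPts=50)       # density threshold passed to ClassificationClusterEngine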


class BinImporter:
    """Used for importing bin assignments"""

    def __init__(self, dbFileName):
        self._pm = ProfileManager(dbFileName)

    def loadProfile(self, timer):
        return self._pm.loadData(timer)

    def importBinAssignments(self, timer, infile, separator):
        """Parse assignment file for bin contigs"""
        profile = self.loadProfile(timer)
        br = BinReader()
        # looks like cid->bid
        contig_bins = {}
        try:
            with open(infile, "r") as f:
                try:
                    (con_names, con_bins) = br.parse(f, separator)
                    (_, con_bid) = np.unique(con_bins, return_inverse=True)
                    con_bid += 1  # bid zero is unbinned
                    contig_bins = dict(zip(con_names, con_bid))
                except:
                    print "Error parsing bin assignments"
                    raise
        except:
            print "Could not parse bin assignment file:", infile, sys.exc_info()[0]
            raise

        # now get the internal indices for contigs
        for (i, cid) in enumerate(profile.contigNames):
            try:
                profile.binIds[i] = contig_bins[cid]
            except KeyError:
                pass

        # Now save all the stuff to disk!
        print "Saving bins"
        self._pm.setBinAssignments(profile, nuke=True)
        print " %s" % timer.getTimeStamp()
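

# A minimal usage sketch for BinImporter, assuming a tab-separated input
# file with one "contig_name<TAB>bin_label" pair per line (the format
# implied by BinReader.parse above); file names are illustrative only.
def _exampleImportBins(dbFileName="metagenome.gm", infile="assignments.tsv"):
    timer = gtime.TimeKeeper()
    bi = BinImporter(dbFileName)
    bi.importBinAssignments(timer, infile, separator="\t")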


class ClusterEngine:
    """Top level interface for clustering contigs"""

    def __init__(self, dbFileName, plot=False, force=False, numImgMaps=1):
        # worker classes
        self.PM = ProfileManager(dbFileName)  # store our data
        self.BM = BinManager(pm=self.PM)      # store our bins

        # heat maps
        self.numImgMaps = numImgMaps
        self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
        self.blurredMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))

        # we need a way to reference from the imageMaps back onto the transformed data
        self.im2RowIndicies = {}

        # When blurring the raw image maps I chose a radius to suit my data, you can vary this as you like
        self.blurRadius = 2
        self.span = 30  # amount we can travel about when determining "hot spots"

        # misc
        self.minSize = 10      # Min number of contigs for a bin to be considered legit
        self.minVol = 1000000  # Override on the min size, if we have this many BP
        self.forceWriting = force
        self.debugPlots = plot
        self.imageCounter = 1  # when we print many images
        self.roundNumber = 0   # how many times have we tried to make a bin?

    def promptOnOverwrite(self, minimal=False):
        """Check that the user is ok with possibly overwriting the DB"""
        if self.PM.isClustered():
            if not self.forceWriting:
                input_not_ok = True
                valid_responses = ["Y", "N"]
                vrs = ",".join([str.lower(str(x)) for x in valid_responses])
                while input_not_ok:
                    if minimal:
                        option = raw_input(" Overwrite? (" + vrs + ") : ")
                    else:
                        option = raw_input(
                            " ****WARNING**** Database: '" + self.PM.dbFileName +
                            "' has already been clustered.\n"
                            " If you continue you *MAY* overwrite existing bins!\n"
                            " Overwrite? (" + vrs + ") : ")
                    if option.upper() in valid_responses:
                        print "****************************************************************"
                        if option.upper() == "N":
                            print "Operation cancelled"
                            return False
                        else:
                            break
                    else:
                        print "Error, unrecognised choice '" + option.upper() + "'"
                        minimal = True
            print "Overwriting database", self.PM.dbFileName
            self.PM.dataManager.nukeBins(self.PM.dbFileName)
        return True

    # ------------------------------------------------------------------------------
    # CORE CONSTRUCTION AND MANAGEMENT

    def makeCores(self, coreCut, minSize, minVol):
        """Cluster the contigs to make bin cores"""
        # check that the user is OK with nuking stuff...
        if not self.promptOnOverwrite():
            return False

        self.minVol = minVol
        self.minSize = minSize

        # get some data
        timer = gtime.TimeKeeper()
        self.PM.loadData(condition="length >= " + str(coreCut))
        print " %s" % timer.getTimeStamp()

        # transform the data
        print "Apply data transformations"
        self.PM.transformCP()
        # plot the transformed space (if we've been asked to...)
        if self.debugPlots:
            self.PM.renderTransCPData()
        print " %s" % timer.getTimeStamp()

        # cluster and bin!
        print "Create cores"
        cum_contigs_used_good = self.initialiseCores()
        print " %s" % timer.getTimeStamp()

        # condense cores
        print "Refine cores [begin: %d]" % len(self.BM.bins)
        self.BM.autoRefineBins(iterate=True)
        num_binned = len(self.PM.binnedRowIndicies.keys())
        perc = "%.2f" % round((float(num_binned) / float(self.PM.numContigs)) * 100, 2)
        print " ", num_binned, "contigs across", len(self.BM.bins.keys()), "cores (", perc, "% )"
        print " %s" % timer.getTimeStamp()

        # Now save all the stuff to disk!
print "Saving bins" self.BM.saveBins() print " %s" % timer.getTimeStamp() def initialiseCores(self): """Process contigs and form CORE bins""" num_below_cutoff = 0 # how many consecutive attempts have produced small bins breakout_point = 100 # how many will we allow before we stop this loop # First we need to find the centers of each blob. # We can make a heat map and look for hot spots self.populateImageMaps() sub_counter = 0 print " .... .... .... .... .... .... .... .... .... ...." print "%4d" % sub_counter, new_line_counter = 0 num_bins = 0 ss = 0 while num_below_cutoff < breakout_point: # if(num_bins > 70): # break stdout.flush() # apply a gaussian blur to each image map to make hot spots # stand out more from the background self.blurMaps() # now search for the "hottest" spots on the blurred map # and check for possible bin centroids ss += 200 putative_clusters = self.findNewClusterCenters(ss=ss) if putative_clusters is None: break else: bids_made = [] partitions = putative_clusters[0] [max_blur_value, max_x, max_y] = putative_clusters[1] self.roundNumber += 1 sub_round_number = 1 for center_row_indices in partitions: total_BP = sum([self.PM.contigLengths[i] for i in center_row_indices]) num_contigs = len(center_row_indices) bin_size = num_contigs # MM__print "Round: %d tBP: %d tC: %d" % (sub_round_number, total_BP, num_contigs) if self.isGoodBin(total_BP, num_contigs, ms=5): # Can we trust very small bins?. # time to make a bin bin = self.BM.makeNewBin(rowIndices=center_row_indices) # MM__print "NEW:", total_BP, len(center_row_indices) # work out the distribution in points in this bin bin.makeBinDist( self.PM.transformedCP, self.PM.averageCoverages, self.PM.kmerVals, self.PM.contigLengths ) # Plot? if self.debugPlots: bin.plotBin( self.PM.transformedCP, self.PM.contigColours, self.PM.kmerVals, fileName="Image_" + str(self.imageCounter), ) self.imageCounter += 1 # recruit more contigs bin_size = bin.recruit( self.PM.transformedCP, self.PM.averageCoverages, self.PM.kmerVals, self.PM.contigLengths, self.im2RowIndicies, self.PM.binnedRowIndicies, self.PM.restrictedRowIndicies, ) if self.debugPlots: self.plotHeat( "HM_%d.%d.png" % (self.roundNumber, sub_round_number), max=max_blur_value, x=max_x, y=max_y, ) sub_round_number += 1 if self.isGoodBin(self, bin.totalBP, bin_size): # Plot? bids_made.append(bin.id) num_bins += 1 if self.debugPlots: bin.plotBin( self.PM.transformedCP, self.PM.contigColours, self.PM.kmerVals, fileName="P_BIN_%d" % (bin.id), ) # append this bins list of mapped rowIndices to the main list self.updatePostBin(bin) num_below_cutoff = 0 new_line_counter += 1 print "% 4d" % bin_size, else: # we just throw these indices away for now self.restrictRowIndicies(bin.rowIndices) self.BM.deleteBins([bin.id], force=True) new_line_counter += 1 num_below_cutoff += 1 print str(bin_size).rjust(4, "X"), else: # this partition was too small, restrict these guys we don't run across them again self.restrictRowIndicies(center_row_indices) num_below_cutoff += 1 # new_line_counter += 1 # print center_row_indices # print str(bin_size).rjust(4,'Y'), # make the printing prettier if new_line_counter > 9: new_line_counter = 0 sub_counter += 10 print "\n%4d" % sub_counter, # did we do anything? num_bids_made = len(bids_made) if num_bids_made == 0: # nuke the lot! for row_indices in partitions: self.restrictRowIndicies(row_indices) print "\n .... .... .... .... .... .... .... .... .... ...." 
        # now we need to update the PM's binIds
        bids = self.BM.getBids()
        for bid in bids:
            for row_index in self.BM.bins[bid].rowIndices:
                self.PM.binIds[row_index] = bid

    def isGoodBin(self, totalBP, binSize, ms=0):
        """Does this bin meet my exacting requirements?"""
        if ms == 0:
            ms = self.minSize  # let the user choose
        if totalBP < self.minVol:  # less than the good volume
            if binSize > ms:  # but has enough contigs
                return True
        else:  # contains enough bp to pass regardless of number of contigs
            return True
        return False

    def findNewClusterCenters(self, ss=0):
        """Find a putative cluster"""

        inRange = lambda x, l, u: x >= l and x < u

        # we work from the top view as this has the base clustering
        max_index = np_argmax(self.blurredMaps[0])
        max_value = self.blurredMaps[0].ravel()[max_index]

        max_x = int(max_index / self.PM.scaleFactor)
        max_y = max_index - self.PM.scaleFactor * max_x
        max_z = -1

        ret_values = [max_value, max_x, max_y]

        start_span = int(1.5 * self.span)
        span_len = 2 * start_span + 1

        if self.debugPlots:
            self.plotRegion(max_x,
                            max_y,
                            max_z,
                            fileName="Image_" + str(self.imageCounter),
                            tag="column",
                            column=True)
            self.imageCounter += 1

        # make a 3d grid to hold the values
        working_block = np_zeros((span_len, span_len, self.PM.scaleFactor))

        # go through the entire column
        (x_lower, x_upper) = self.makeCoordRanges(max_x, start_span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, start_span)
        super_putative_row_indices = []
        for p in self.im2RowIndicies:
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper):
                for row_index in self.im2RowIndicies[p]:
                    # check that the point is real and that it has not yet been binned
                    if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                        # this is an unassigned point.
                        multiplier = np_log10(self.PM.contigLengths[row_index])
                        self.incrementAboutPoint3D(working_block,
                                                   p[0] - x_lower,
                                                   p[1] - y_lower,
                                                   p[2],
                                                   multiplier=multiplier)
                        super_putative_row_indices.append(row_index)

        # blur and find the highest value
        bwb = ndi.gaussian_filter(working_block, 8)  # self.blurRadius)
        densest_index = np_unravel_index(np_argmax(bwb), (np_shape(bwb)))
        max_x = densest_index[0] + x_lower
        max_y = densest_index[1] + y_lower
        max_z = densest_index[2]

        # now get the basic color of this dense point
        putative_center_row_indices = []
        (x_lower, x_upper) = self.makeCoordRanges(max_x, self.span)
        (y_lower, y_upper) = self.makeCoordRanges(max_y, self.span)
        (z_lower, z_upper) = self.makeCoordRanges(max_z, 2 * self.span)
        for row_index in super_putative_row_indices:
            p = np_around(self.PM.transformedCP[row_index])
            if inRange(p[0], x_lower, x_upper) and inRange(p[1], y_lower, y_upper) and inRange(p[2], z_lower, z_upper):
                # we are within the range!
                putative_center_row_indices.append(row_index)

        # make sure we have something to go on here
        if np_size(putative_center_row_indices) == 0:
            # it's all over!
            return None

        if np_size(putative_center_row_indices) == 1:
            # get out of here but keep trying
            # the calling function may restrict these indices
            return [[np_array(putative_center_row_indices)], ret_values]
        else:
            total_BP = sum([self.PM.contigLengths[i] for i in putative_center_row_indices])
            if not self.isGoodBin(total_BP, len(putative_center_row_indices), ms=5):  # Can we trust very small bins?
                # get out of here but keep trying
                # the calling function should restrict these indices
                return [[np_array(putative_center_row_indices)], ret_values]
            else:
                # we've got a few good guys here, partition them up!
                # shift these guys around a bit
                center_k_vals = np_array([self.PM.kmerVals[i] for i in putative_center_row_indices])
                k_partitions = self.partitionVals(center_k_vals)

                if len(k_partitions) == 0:
                    return None
                else:
                    center_c_vals = np_array([self.PM.transformedCP[i][-1] for i in putative_center_row_indices])
                    # center_c_vals = np_array([self.PM.averageCoverages[i] for i in putative_center_row_indices])
                    center_c_vals -= np_min(center_c_vals)
                    c_max = np_max(center_c_vals)
                    if c_max != 0:
                        center_c_vals /= c_max
                    c_partitions = self.partitionVals(center_c_vals)

                    # take the intersection of the two partitions
                    tmp_partition_hash_1 = {}
                    id = 1
                    for p in k_partitions:
                        for i in p:
                            tmp_partition_hash_1[i] = id
                        id += 1

                    tmp_partition_hash_2 = {}
                    id = 1
                    for p in c_partitions:
                        for i in p:
                            try:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)].append(i)
                            except KeyError:
                                tmp_partition_hash_2[(tmp_partition_hash_1[i], id)] = [i]
                        id += 1

                    partitions = [np_array([putative_center_row_indices[i] for i in tmp_partition_hash_2[key]])
                                  for key in tmp_partition_hash_2.keys()]

                    # pcs = [[self.PM.averageCoverages[i] for i in p] for p in partitions]
                    # print pcs
                    return [partitions, ret_values]

    def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
        """Expand a selection left and right from a starting index in a list of values

        Keep expanding unless the stdev of the values goes above the cutoff
        Return a list of indices into the original list
        """
        ret_list = [startIndex]  # this is what we will give back
        start_val = vals[startIndex]
        value_store = [start_val]

        sorted_indices = np_argsort(vals)
        max_index = len(vals)

        # set the upper and lower to point to the position
        # where the start resides
        lower_index = 0
        upper_index = 0
        for i in range(max_index):
            if sorted_indices[i] == startIndex:
                break
            lower_index += 1
            upper_index += 1
        do_lower = True
        do_upper = True
        max_index -= 1

        while do_lower or do_upper:
            if do_lower:
                do_lower = False
                if lower_index > 0:
                    try_val = vals[sorted_indices[lower_index - 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            lower_index -= 1
                            ret_list.append(sorted_indices[lower_index])
                            do_lower = True
            if do_upper:
                do_upper = False
                if upper_index < max_index:
                    try_val = vals[sorted_indices[upper_index + 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            upper_index += 1
                            ret_list.append(sorted_indices[upper_index])
                            do_upper = True
        return sorted(ret_list)

    def partitionVals(self, vals, stdevCutoff=0.04, maxSpread=0.15):
        """Work out where shifts in kmer/coverage vals happen"""
        partitions = []
        working_list = list(vals)
        fix_dict = dict(zip(range(len(working_list)), range(len(working_list))))
        while len(working_list) > 2:
            cf = CenterFinder()
            c_index = cf.findArrayCenter(working_list)
            expanded_indices = self.expandSelection(c_index,
                                                    working_list,
                                                    stdevCutoff=stdevCutoff,
                                                    maxSpread=maxSpread)
            # fix any munges from previous deletes
            morphed_indices = [fix_dict[i] for i in expanded_indices]
            partitions.append(morphed_indices)

            # shunt the indices to remove down!
            shunted_indices = []
            for offset, index in enumerate(expanded_indices):
                shunted_indices.append(index - offset)
            # print "FD:", fix_dict
            # print "EI:", expanded_indices
            # print "MI:", morphed_indices
            # print "SI:", shunted_indices

            # make an updated working list and fix the fix dict
            nwl = []
            nfd = {}
            shifter = 0
            for i in range(len(working_list) - len(shunted_indices)):
                # print "================="
                if len(shunted_indices) > 0:
                    # print i, shunted_indices[0], shifter
                    if i >= shunted_indices[0]:
                        tmp = shunted_indices.pop(0)
                        shifter += 1
                        # consume any and all conseqs
                        while len(shunted_indices) > 0:
                            if shunted_indices[0] == tmp:
                                shunted_indices.pop(0)
                                shifter += 1
                            else:
                                break
                # else:
                #     print i, "_", shifter
                nfd[i] = fix_dict[i + shifter]
                nwl.append(working_list[i + shifter])
            # print nfd
            # print nwl

            fix_dict = nfd
            working_list = nwl

        if len(working_list) > 0:
            partitions.append(fix_dict.values())
        return partitions

    # ------------------------------------------------------------------------------
    # CORE MANAGEMENT

    def condenseCores(self, auto=False):
        """Iterative wrapper for the BinManager method"""
        condensing_round = 0
        num_cores_condensed = 0
        while True:  # do while loop anyone?
            condensing_round += 1
            (num_cores_condensed, continue_merge) = self.BM.condenseBins(verbose=True, auto=auto)
            if num_cores_condensed == 0:
                break
            else:
                print " Core condensing round:", condensing_round, "Incorporated", num_cores_condensed, "cores into larger cores"
        num_binned = len(self.PM.binnedRowIndicies.keys())
        perc = "%.2f" % round((float(num_binned) / float(self.PM.numContigs)) * 100, 2)
        print " ", num_binned, "contigs are distributed across", len(self.BM.bins.keys()), "cores (", perc, "% )"
        return

    def removeOutliersWrapper(self, mode="kmer"):
        """remove the outliers for all bins"""
        print " Removing outliers"
        for bid in self.BM.bins:
            self.removeOutliers(bid, mode=mode)

    def removeOutliers(self, bid, fixBinnedRI=True, mode="kmer"):
        """remove outliers for a single bin"""
        dead_row_indices = self.BM.bins[bid].findOutliers(self.PM.transformedCP,
                                                          self.PM.kmerVals,
                                                          mode=mode)
        if len(dead_row_indices) > 0:
            if fixBinnedRI:
                for row_index in dead_row_indices:
                    self.setRowIndexUnassigned(row_index)
            self.BM.bins[bid].purge(dead_row_indices,
                                    self.PM.transformedCP,
                                    self.PM.averageCoverages,
                                    self.PM.kmerVals,
                                    self.PM.contigLengths,
                                    self.PM.kmerVals)

    # ------------------------------------------------------------------------------
    # DATA MAP MANAGEMENT

    def populateImageMaps(self):
        """Load the transformed data into the main image maps"""
        # reset these guys... JIC
        self.imageMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
        self.im2RowIndicies = {}

        # add to the grid wherever we find a contig
        row_index = -1
        for point in np_around(self.PM.transformedCP):
            row_index += 1
            # can only bin things once!
            if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                # add to the row_index dict so we can relate the
                # map back to individual points later
                p = tuple(point)
                if p in self.im2RowIndicies:
                    self.im2RowIndicies[p].append(row_index)
                else:
                    self.im2RowIndicies[p] = [row_index]

                # now increment in the grid
                # for each point we encounter we increment
                # its position + the positions to each side
                # and touching each corner
                self.incrementViaRowIndex(row_index, p)

    def incrementViaRowIndex(self, rowIndex, point=None):
        """Wrapper to increment about point"""
        if point is None:
            point = tuple(np_around(self.PM.transformedCP[rowIndex]))
        # px = point[0]
        # py = point[1]
        # pz = point[2]
        multiplier = np_log10(self.PM.contigLengths[rowIndex])
        self.incrementAboutPoint(0, point[0], point[1], multiplier=multiplier)
        if self.numImgMaps > 1:
            self.incrementAboutPoint(1,
                                     self.PM.scaleFactor - point[2] - 1,
                                     point[1],
                                     multiplier=multiplier)
            self.incrementAboutPoint(2,
                                     self.PM.scaleFactor - point[2] - 1,
                                     self.PM.scaleFactor - point[0] - 1,
                                     multiplier=multiplier)

    def decrementViaRowIndex(self, rowIndex, point=None):
        """Wrapper to decrement about point"""
        if point is None:
            point = tuple(np_around(self.PM.transformedCP[rowIndex]))
        # px = point[0]
        # py = point[1]
        # pz = point[2]
        multiplier = np_log10(self.PM.contigLengths[rowIndex])
        self.decrementAboutPoint(0, point[0], point[1], multiplier=multiplier)
        if self.numImgMaps > 1:
            self.decrementAboutPoint(1,
                                     self.PM.scaleFactor - point[2] - 1,
                                     point[1],
                                     multiplier=multiplier)
            self.decrementAboutPoint(2,
                                     self.PM.scaleFactor - point[2] - 1,
                                     self.PM.scaleFactor - point[0] - 1,
                                     multiplier=multiplier)

    def incrementAboutPoint(self, view_index, px, py, valP=1, valS=0.6, valC=0.2, multiplier=1):
        """Increment value at a point in the 2D image maps

        Increment point by valP, increment neighbouring points at the
        sides and corners of the target point by valS and valC

        multiplier is proportional to the contigs length
        """
        valP *= multiplier
        valS *= multiplier
        valC *= multiplier
        if px > 0:
            if py > 0:
                self.imageMaps[view_index, px - 1, py - 1] += valC  # Top left corner
            self.imageMaps[view_index, px - 1, py] += valS  # Top
            if py < self.PM.scaleFactor - 1:
                self.imageMaps[view_index, px - 1, py + 1] += valC  # Top right corner

        if py > 0:
            self.imageMaps[view_index, px, py - 1] += valS  # Left side
        self.imageMaps[view_index, px, py] += valP  # Point
        if py < self.PM.scaleFactor - 1:
            self.imageMaps[view_index, px, py + 1] += valS  # Right side

        if px < self.PM.scaleFactor - 1:
            if py > 0:
                self.imageMaps[view_index, px + 1, py - 1] += valC  # Bottom left corner
            self.imageMaps[view_index, px + 1, py] += valS  # Bottom
            if py < self.PM.scaleFactor - 1:
                self.imageMaps[view_index, px + 1, py + 1] += valC  # Bottom right corner

    def decrementAboutPoint(self, view_index, px, py, valP=1, valS=0.6, valC=0.2, multiplier=1):
        """Decrement value at a point in the 2D image maps

        multiplier is proportional to the contigs length
        """
        valP *= multiplier
        valS *= multiplier
        valC *= multiplier
        if px > 0:
            if py > 0:
                self.safeDecrement(self.imageMaps[view_index], px - 1, py - 1, valC)  # Top left corner
            self.safeDecrement(self.imageMaps[view_index], px - 1, py, valS)  # Top
            if py < self.PM.scaleFactor - 1:
                self.safeDecrement(self.imageMaps[view_index], px - 1, py + 1, valC)  # Top right corner

        if py > 0:
            self.safeDecrement(self.imageMaps[view_index], px, py - 1, valS)  # Left side
        self.safeDecrement(self.imageMaps[view_index], px, py, valP)  # Point
        if py < self.PM.scaleFactor - 1:
            self.safeDecrement(self.imageMaps[view_index], px, py + 1, valS)  # Right side

        if px < self.PM.scaleFactor - 1:
            if py > 0:
                self.safeDecrement(self.imageMaps[view_index], px + 1, py - 1, valC)  # Bottom left corner
            self.safeDecrement(self.imageMaps[view_index], px + 1, py, valS)  # Bottom
            if py < self.PM.scaleFactor - 1:
                self.safeDecrement(self.imageMaps[view_index], px + 1, py + 1, valC)  # Bottom right corner

    def safeDecrement(self, map, px, py, value):
        """Decrement a value, clamping at zero so it can't go negative"""
        map[px][py] -= value
        if map[px][py] < np_finfo(float).eps:
            map[px][py] = 0

    def incrementAboutPoint3D(self, workingBlock, px, py, pz, vals=(6.4, 4.9, 2.5, 1.6), multiplier=1):
        """Increment a point found in a 3D column

        used when finding the centroid of a hot area
        update the 26 points which surround the centre point

        z spans the height of the entire column, x and y have been offset
        to match the column subspace

        multiplier is proportional to the contigs length
        """
        valsM = [x * multiplier for x in vals]

        # top slice
        if pz < self.PM.scaleFactor - 1:
            self.subIncrement3D(workingBlock, px, py, pz + 1, valsM, 1)

        # center slice
        self.subIncrement3D(workingBlock, px, py, pz, valsM, 0)

        # bottom slice
        if pz > 0:
            self.subIncrement3D(workingBlock, px, py, pz - 1, valsM, 1)

    def subIncrement3D(self, workingBlock, px, py, pz, vals, offset):
        """AUX: Called from incrementAboutPoint3D does but one slice

        multiplier is proportional to the contigs length
        """
        # get the size of the working block
        shape = np_shape(workingBlock)
        if px > 0:
            if py > 0:
                workingBlock[px - 1, py - 1, pz] += vals[offset + 2]  # Top left corner
            workingBlock[px - 1, py, pz] += vals[offset + 1]  # Top
            if py < shape[1] - 1:
                workingBlock[px - 1, py + 1, pz] += vals[offset + 2]  # Top right corner

        if py > 0:
            workingBlock[px, py - 1, pz] += vals[offset + 1]  # Left side
        workingBlock[px, py, pz] += vals[offset]  # Point
        if py < shape[1] - 1:
            workingBlock[px, py + 1, pz] += vals[offset + 1]  # Right side

        if px < shape[0] - 1:
            if py > 0:
                workingBlock[px + 1, py - 1, pz] += vals[offset + 2]  # Bottom left corner
            workingBlock[px + 1, py, pz] += vals[offset + 1]  # Bottom
            if py < shape[1] - 1:
                workingBlock[px + 1, py + 1, pz] += vals[offset + 2]  # Bottom right corner

    def blurMaps(self):
        """Blur the 2D image maps"""
        self.blurredMaps = np_zeros((self.numImgMaps, self.PM.scaleFactor, self.PM.scaleFactor))
        for i in range(self.numImgMaps):  # top, front and side
            self.blurredMaps[i, :, :] = ndi.gaussian_filter(self.imageMaps[i, :, :], 8)  # self.blurRadius)

    def makeCoordRanges(self, pos, span):
        """Make search ranges which won't go out of bounds"""
        lower = pos - span
        upper = pos + span + 1
        if lower < 0:
            lower = 0
        if upper > self.PM.scaleFactor:
            upper = self.PM.scaleFactor
        return (lower, upper)

    def updatePostBin(self, bin):
        """Update data structures after assigning contigs to a new bin"""
        for row_index in bin.rowIndices:
            self.setRowIndexAssigned(row_index)

    def setRowIndexAssigned(self, rowIndex):
        """fix the data structures to indicate that rowIndex belongs to a bin

        Use only during initial core creation
        """
        if rowIndex not in self.PM.restrictedRowIndicies and rowIndex not in self.PM.binnedRowIndicies:
            self.PM.binnedRowIndicies[rowIndex] = True
            # now update the image map, decrement
            self.decrementViaRowIndex(rowIndex)

    def setRowIndexUnassigned(self, rowIndex):
        """fix the data structures to indicate that rowIndex no longer belongs to a bin

        Use only during initial core creation
        """
        if rowIndex not in self.PM.restrictedRowIndicies and rowIndex in self.PM.binnedRowIndicies:
            del self.PM.binnedRowIndicies[rowIndex]
            # now update the image map, increment
            self.incrementViaRowIndex(rowIndex)

    def restrictRowIndicies(self, indices):
        """Add these indices to the restricted list"""
        for row_index in indices:
            # check that it's not binned or already restricted
            if row_index not in self.PM.restrictedRowIndicies and row_index not in self.PM.binnedRowIndicies:
                self.PM.restrictedRowIndicies[row_index] = True
                # now update the image map, decrement
                self.decrementViaRowIndex(row_index)

    # ------------------------------------------------------------------------------
    # IO and IMAGE RENDERING

    def plotRegion(self, px, py, pz, fileName="", tag="", column=False, show=False):
        """Plot the region surrounding a point"""
        import matplotlib as mpl
        disp_vals = np_array([])
        disp_cols = np_array([])
        num_points = 0

        # plot all points within span
        (z_lower, z_upper) = self.makeCoordRanges(pz, self.span)
        if column:
            z_lower = 0
            z_upper = self.PM.scaleFactor - 1

        (x_lower, x_upper) = self.makeCoordRanges(px, self.span)
        (y_lower, y_upper) = self.makeCoordRanges(py, self.span)
        for z in range(z_lower, z_upper):
            realz = self.PM.scaleFactor - z - 1
            for x in range(x_lower, x_upper):
                for y in range(y_lower, y_upper):
                    if (x, y, realz) in self.im2RowIndicies:
                        for row_index in self.im2RowIndicies[(x, y, realz)]:
                            if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                                num_points += 1
                                disp_vals = np_append(disp_vals, self.PM.transformedCP[row_index])
                                disp_cols = np_append(disp_cols, self.PM.contigColours[row_index])

        # make a black mark at the max values
        small_span = self.span / 2
        (x_lower, x_upper) = self.makeCoordRanges(px, small_span)
        (y_lower, y_upper) = self.makeCoordRanges(py, small_span)
        (z_lower, z_upper) = self.makeCoordRanges(pz, small_span)
        for z in range(z_lower, z_upper):
            realz = self.PM.scaleFactor - z - 1
            for x in range(x_lower, x_upper):
                for y in range(y_lower, y_upper):
                    if (x, y, realz) in self.im2RowIndicies:
                        for row_index in self.im2RowIndicies[(x, y, realz)]:
                            if row_index not in self.PM.binnedRowIndicies and row_index not in self.PM.restrictedRowIndicies:
                                num_points += 1
                                disp_vals = np_append(disp_vals, self.PM.transformedCP[row_index])
                                disp_cols = np_append(disp_cols, htr(0, 0, 0))

        # reshape
        disp_vals = np_reshape(disp_vals, (num_points, 3))
        disp_cols = np_reshape(disp_cols, (num_points, 3))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
        cm = mpl.colors.LinearSegmentedColormap("my_colormap", disp_cols, 1024)
        result = ax.scatter(disp_vals[:, 0],
                            disp_vals[:, 1],
                            disp_vals[:, 2],
                            edgecolors=disp_cols,
                            c=disp_cols,
                            cmap=cm,
                            marker=".")
        title = str.join(" ", ["Focus at: (", str(px), str(py), str(self.PM.scaleFactor - pz - 1), ")\n", tag])
        plt.title(title)

        if fileName != "":
            fig.set_size_inches(6, 6)
            plt.savefig(fileName, dpi=300)
        elif show:
            plt.show()

        plt.close(fig)
        del fig

    def plotHeat(self, fileName="", max=-1, x=-1, y=-1, show=False):
        """Print the main heat maps

        Useful for debugging
        """
        fig = plt.figure()
        images = []

        ax = None
        if self.numImgMaps == 1:
            ax = fig.add_subplot(121)
            images.append(ax.imshow(self.blurredMaps[0, :, :] ** 0.5))
            if max > 0:
                title = "Max value: %f (%f, %f)" % (max, x, y)
                plt.title(title)
        else:
            ax = fig.add_subplot(231)
            images.append(ax.imshow(self.blurredMaps[0, :, :] ** 0.5))
            if max > 0:
                title = str.join(" ", ["Max value:", str(max)])
                plt.title(title)
            ax = fig.add_subplot(232)
            images.append(ax.imshow(self.blurredMaps[1, :, :] ** 0.5))
            ax = fig.add_subplot(233)
            images.append(ax.imshow(self.blurredMaps[2, :, :] ** 0.5))

        if self.numImgMaps == 1:
            ax = fig.add_subplot(122)
            images.append(ax.imshow(self.imageMaps[0, :, :] ** 0.5))
        else:
            ax = fig.add_subplot(234)
            images.append(ax.imshow(self.imageMaps[0, :, :] ** 0.5))
            ax = fig.add_subplot(235)
            images.append(ax.imshow(self.imageMaps[1, :, :] ** 0.5))
            ax = fig.add_subplot(236)
            images.append(ax.imshow(self.imageMaps[2, :, :] ** 0.5))

        if fileName != "":
            if self.numImgMaps == 1:
                fig.set_size_inches(12, 6)
            else:
                fig.set_size_inches(18, 18)
            plt.savefig(fileName, dpi=300)
        elif show:
            plt.show()

        plt.close(fig)
        del fig
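

# A minimal usage sketch for ClusterEngine, assuming an existing GroopM
# database; parameter values are illustrative only. makeCores prompts
# before overwriting existing bins unless the engine was constructed
# with force=True.
def _exampleClusterEngine(dbFileName="metagenome.gm"):
    ce = ClusterEngine(dbFileName, plot=False, force=False, numImgMaps=1)
    ce.makeCores(coreCut=1500,    # minimum contig length to load
                 minSize=10,      # minimum contigs for a legit bin
                 minVol=1000000)  # bp total that overrides minSize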


class BinExtractor:
    """Used for extracting reads and contigs based on bin assignments"""

    def __init__(self, dbFileName, folder=''):
        self.dbFileName = dbFileName
        self._pm = ProfileManager(self.dbFileName)
        self._outDir = os.getcwd() if folder == "" else folder
        # make the dir if need be
        makeSurePathExists(self._outDir)

    def loadProfile(self, timer, bids=[], cutoff=0):
        removeBins = bids is None or bids == []
        return self._pm.loadData(timer,
                                 loadMarkers=False,
                                 loadBins=True,
                                 bids=[0] if removeBins else bids,
                                 removeBins=removeBins,
                                 minLength=cutoff)

    def extractContigs(self, timer, bids=[], fasta=[], prefix='', cutoff=0):
        """Extract contigs and write to file"""
        if prefix is None or prefix == '':
            prefix = os.path.basename(self.dbFileName) \
                        .replace(".gm", "") \
                        .replace(".sm", "")

        profile = self.loadProfile(timer, bids, cutoff)
        bm = BinManager(profile)
        # load all the contigs which have been assigned to bins
        cp = ContigParser()
        # contigs looks like cid->seq
        contigs = {}
        import mimetypes
        try:
            for file_name in fasta:
                gm_open = open
                try:
                    # handle gzipped files
                    mime = mimetypes.guess_type(file_name)
                    if mime[1] == 'gzip':
                        import gzip
                        gm_open = gzip.open
                except:
                    print "Error when guessing contig file mimetype"
                    raise
                with gm_open(file_name, "r") as f:
                    cp.getWantedSeqs(f, profile.contigNames, out_dict=contigs)
        except:
            print "Could not parse contig file:", fasta[0], sys.exc_info()[0]
            raise

        # now print out the sequences
        print "Writing files"
        for bid in bm.getBids():
            file_name = os.path.join(self._outDir, "%s_bin_%d.fna" % (prefix, bid))
            try:
                with open(file_name, 'w') as f:
                    for cid in bm.profile.contigNames[bm.getBinIndices(bid)]:
                        if cid in contigs:
                            f.write(">%s\n%s\n" % (cid, contigs[cid]))
                        else:
                            print "These are not the contigs you're looking for. ( %s )" % (cid)
            except:
                print "Could not open file for writing:", file_name, sys.exc_info()[0]
                raise

    def extractReads(self,
                     timer,
                     bids=[],
                     bams=[],
                     prefix="",
                     mixBams=False,
                     mixGroups=False,
                     mixReads=False,
                     interleaved=False,
                     bigFile=False,
                     headersOnly=False,
                     minMapQual=0,
                     maxMisMatches=1000,
                     useSuppAlignments=False,
                     useSecondaryAlignments=False,
                     threads=1,
                     verbose=False):
        """Extract reads from bam files and write to file

        All logic is handled by BamM <- soon to be wrapped by StoreM"""
        # load data
        profile = self.loadProfile(timer, bids)
        bm = BinManager(profile)  # bins

        print "Extracting reads"

        # work out a set of targets to pass to the parser
        targets = []
        group_names = []
        for bid in bm.getBids():
            group_names.append("BIN_%d" % bid)
            row_indices = bm.getBinIndices(bid)
            targets.append(list(bm.profile.contigNames[row_indices]))

        # get something to parse the bams with
        bam_parser = BMBE(targets,
                          bams,
                          groupNames=group_names,
                          prefix=prefix,
                          outFolder=self._outDir,
                          mixBams=mixBams,
                          mixGroups=mixGroups,
                          mixReads=mixReads,
                          interleaved=interleaved,
                          bigFile=bigFile,
                          headersOnly=headersOnly,
                          minMapQual=minMapQual,
                          maxMisMatches=maxMisMatches,
                          useSuppAlignments=useSuppAlignments,
                          useSecondaryAlignments=useSecondaryAlignments)

        bam_parser.extract(threads=threads, verbose=verbose)
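

# A minimal usage sketch for BinExtractor.extractContigs, assuming the
# assembly FASTA that the database was built from; file names are
# illustrative only. One "<prefix>_bin_<bid>.fna" file is written per bin
# into the output folder.
def _exampleExtractContigs(dbFileName="metagenome.gm"):
    timer = gtime.TimeKeeper()
    be = BinExtractor(dbFileName, folder="extracted_bins")
    be.extractContigs(timer, fasta=["assembly.fna"], cutoff=0)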


class BinStatsDumper:

    def __init__(self, dbFileName):
        self.dbFileName = dbFileName
        self._pm = ProfileManager(self.dbFileName)

    def loadProfile(self, timer):
        return self._pm.loadData(timer,
                                 loadMarkers=True,
                                 loadBins=True,
                                 bids=[0],
                                 removeBins=True)

    def dumpBinStats(self, timer, fields, outFile, separator, useHeaders):
        """Compute bin statistics"""
        # load all the contigs which have been assigned to bins
        profile = self.loadProfile(timer)
        bm = BinManager(profile)

        stats = bm.getBinStats()

        # data to output
        header_strings = []
        data_arrays = []
        data_converters = []

        for field in fields:
            if field == 'bins':
                header_strings.append('bid')
                data_arrays.append(stats.bids)
                data_converters.append(lambda x: str(x))

            elif field == 'points':
                header_strings.append('num_contigs')
                data_arrays.append(stats.numContigs)
                data_converters.append(lambda x: str(x))

            elif field == 'sizes':
                header_strings.append('size')
                data_arrays.append(stats.sizes)
                data_converters.append(lambda x: str(x))

            elif field == 'lengths':
                header_strings.append('length_min')
                header_strings.append('length_median')
                header_strings.append('length_max')
                data_arrays.append(stats.lengthRanges[:, 0])
                data_arrays.append(stats.lengthMedians)
                data_arrays.append(stats.lengthRanges[:, 1])
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))
                data_converters.append(lambda x: str(x))

            elif field == 'gc':
                header_strings.append("GC%_mean")
                header_strings.append("GC%_std")
                data_arrays.append(stats.GCMeans)
                data_arrays.append(stats.GCStdDevs)
                data_converters.append(lambda x: "%0.4f" % x)
                data_converters.append(lambda x: "%0.4f" % x)

            elif field == 'coverage':
                stoits = profile.stoitNames
                header_strings.append(separator.join([separator.join([i + "_mean", i + "_std"]) for i in stoits]))
                # pair the mean and std dev for each stoit: shape (num_bins, num_stoits, 2)
                interleaved = np.dstack((stats.covMeans, stats.covStdDevs))
                data_arrays.append(interleaved)
                data_converters.append(lambda x: separator.join([("%0.4f" + separator + "%0.4f") % tuple(i) for i in x]))

            elif field == 'tags':
                header_strings.append('taxonomy')
                data_arrays.append(stats.tags)
                data_converters.append(lambda x: x)

        # now print out the sequences
        try:
            with open(outFile, 'w') as f:
                if useHeaders:
                    header = separator.join(header_strings) + '\n'
                    f.write(header)

                num_rows = len(data_arrays[0])
                for i in range(num_rows):
                    row = separator.join([conv(arr[i]) for (conv, arr) in zip(data_converters, data_arrays)])
                    f.write(row + '\n')
        except:
            print "Could not open file for writing:", outFile, sys.exc_info()[0]
            raise
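

# A minimal usage sketch for BinStatsDumper, using the field names handled
# in dumpBinStats above; the output path is illustrative only.
def _exampleDumpStats(dbFileName="metagenome.gm"):
    timer = gtime.TimeKeeper()
    bsd = BinStatsDumper(dbFileName)
    bsd.dumpBinStats(timer,
                     fields=['bins', 'points', 'sizes', 'lengths', 'gc'],
                     outFile="bin_stats.tsv",
                     separator="\t",
                     useHeaders=True)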


class MarkerExtractor:

    def __init__(self, dbFileName, folder=''):
        self.dbFileName = dbFileName
        self._pm = ProfileManager(self.dbFileName)
        self._outDir = os.getcwd() if folder == "" else folder
        # make the dir if need be
        makeSurePathExists(self._outDir)

    def loadProfile(self, timer, bids=[], cutoff=0):
        removeBins = bids is None or bids == []
        return self._pm.loadData(timer,
                                 loadBins=True,
                                 loadMarkers=True,
                                 loadTaxstrings=True,
                                 loadReachability=True,
                                 minLength=cutoff,
                                 bids=[0] if removeBins else bids,
                                 removeBins=removeBins)

    def extractMappingInfo(self, timer, bids=[], prefix='', separator='\t', cutoff=0):
        """Extract markers from bins and write to file"""
        if prefix is None or prefix == '':
            prefix = os.path.basename(self.dbFileName) \
                        .replace(".gm", "") \
                        .replace(".sm", "")

        profile = self.loadProfile(timer, bids, cutoff)
        bm = BinManager(profile)
        mt = MarkerCheckTreePrinter(profile)

        # now print out the marker info
        print "Writing files"
        for bid in bm.getBids():
            file_name = os.path.join(self._outDir, "%s_bin_%d.txt" % (prefix, bid))

            bin_indices = bm.getBinIndices([bid])
            idx = np.flatnonzero(np.in1d(profile.mapping.rowIndices, bin_indices))

            labels = profile.mapping.markerNames[idx]
            cnames = profile.contigNames[profile.mapping.rowIndices[idx]]
            taxstrings = profile.mapping.taxstrings[idx]

            try:
                with open(file_name, 'w') as f:
                    # labels and lineages
                    f.write('#info table\n%s\n' % separator.join(['label', 'taxonomy', 'contig_name']))
                    for (label, taxstring, cname) in zip(labels, taxstrings, cnames):
                        f.write('%s\n' % separator.join([label, '\'%s\'' % taxstring, cname]))

                    # marker tree
                    f.write('\n#marker tree\n')
                    f.write(mt.printTree(profile.mapping.rowIndices[idx], leaves_list=bin_indices))
            except:
                print "Could not open file for writing:", file_name, sys.exc_info()[0]
                raise
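

# A minimal usage sketch for MarkerExtractor, writing one marker table
# (plus marker tree) per bin into the output folder; names are
# illustrative only.
def _exampleExtractMarkers(dbFileName="metagenome.gm"):
    timer = gtime.TimeKeeper()
    me = MarkerExtractor(dbFileName, folder="marker_info")
    me.extractMappingInfo(timer, separator='\t')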