def averageContacts(contactIterator, inValues, N, **kwargs):
    """
    Accumulate contacts from many inputs into one contact map using several
    processes that write into shared-memory buckets.

    Args:
        contactIterator: an iterator class. See the description of the
            "filenameContactMap" class for an example and explanations.
        inValues: a sequence of values to pass to contactIterator, one per
            task (e.g. a list of lists of filenames). Must support len().
        N: size of one side of the resulting contact map.
        **kwargs:
            arrayDtype: ctypes dtype (default ctypes.c_int32) of the map.
            classInitArgs: list of positional arguments forwarded to init()
                for constructing the contact iterator (default []).
            classInitKwargs: dict of keyword arguments forwarded to init()
                for constructing the contact iterator (default {}).
            contactProcessing: function f(contacts) returning processed
                contacts (default: identity).
            nproc: int, number of processes (default 4); never more than
                len(inValues).
            bucketNum: int, number of shared-memory buckets
                (default: the effective nproc).
            contactBlock: int (default 5000000), forwarded to init();
                number of contacts to aggregate before writing to memory.
            useFmap: if true, call init() in this process and map the
                worker with mirnylib.systemutils.fmap; otherwise
                (default False) use multiprocessing.Pool.map with init()
                as the pool initializer.

    Returns:
        The value of triagToNormal(res, N), where res is the concatenation
        of all buckets (N * (N + 1) // 2 entries in total). When only one
        process would be used, the result of averageContactsSimple instead.
    """
    option = kwargs.get

    workerCount = min(option("nproc", 4), len(inValues))
    if workerCount == 1:
        # Nothing to parallelize: delegate to the single-process version.
        return averageContactsSimple(contactIterator, inValues, N, **kwargs)

    dtype = option("arrayDtype", ctypes.c_int32)
    bucketCount = option("bucketNum", workerCount)
    blockSize = option("contactBlock", 5000000)
    forkMap = option("useFmap", False)
    initArgs = option("classInitArgs", [])
    initKwargs = option("classInitKwargs", {})
    processContacts = option("contactProcessing", lambda x: x)

    # Split the N * (N + 1) // 2 stored entries into contiguous buckets.
    triangleSize = N * (N + 1) // 2
    edges = np.linspace(0, triangleSize, bucketCount + 1, dtype=int)
    buckets = []
    for start, stop in zip(edges[:-1], edges[1:]):
        buckets.append(mp.Array(dtype, int(stop - start)))

    initArguments = list(buckets)
    initArguments.extend([
        processContacts,
        initArgs,
        initKwargs,
        contactIterator,
        blockSize,
        N,
    ])

    if forkMap:
        # init() is run in this process before the fork-style map.
        init(*initArguments)
        from mirnylib.systemutils import fmap
        fmap(worker, inValues, nproc=workerCount)
    else:
        # Each pool process receives the buckets through its initializer.
        pool = mp.Pool(processes=workerCount, initializer=init,
                       initargs=initArguments)
        with closing(pool) as p:
            p.map(worker, inValues)

    flat = np.concatenate([tonumpyarray(bucket) for bucket in buckets])
    return triagToNormal(flat, N)
def makeMoviePymol(fileList, destFolder, fps=15, aviFilename='output.avi',
                   pymolScript=""):
    """
    Convert conformations to PDB files, run a PyMOL script over them and
    encode the resulting images into a movie.

    Parameters
    ----------
    fileList : list of str
        Paths of the conformation files, one per frame. Must be non-empty
        and every path must exist.
    destFolder : str
        Working folder. Sub-folders "pdb" and "img" are created inside it;
        an existing "img" folder is deleted first.
    fps : int, optional
        Frame rate passed to _mencoder.
    aviFilename : str, optional
        Output file name passed to _mencoder.
    pymolScript : str, optional
        Extra PyMOL commands inserted between "smooth mov" and "zoom mov".

    Raises
    ------
    ValueError
        If fileList is empty.
    IOError
        If any path in fileList does not exist.
    """
    numFrames = len(fileList)
    if numFrames == 0:
        # log10(0) below would otherwise fail with an opaque OverflowError.
        raise ValueError("fileList is empty: there are no frames to render")
    if not all(os.path.exists(path) for path in fileList):
        raise IOError("Some files are not in filelist")

    # Zero-padding width so that frame file names sort in frame order.
    numDigits = int(np.ceil(np.log10(numFrames)))

    destFolder = os.path.abspath(destFolder)
    pdbFolder = destFolder + '/pdb'
    imgFolder = destFolder + '/img'

    # Start from an empty image folder so stale frames never reach the movie.
    if os.path.exists(imgFolder):
        shutil.rmtree(imgFolder)
    for folder in [destFolder, pdbFolder, imgFolder]:
        if not os.path.isdir(folder):
            # makedirs also creates missing parents of destFolder.
            os.makedirs(folder)

    def saveToPdb(indexedPath):
        """Save one conformation as a numbered PDB file; return its path."""
        frameIndex, dataPath = indexedPath
        conformation = polymerutils.load(dataPath)
        pdbFilename = '{0:0{width}}.pdb'.format(frameIndex, width=numDigits)
        savePath = pdbFolder + '/' + pdbFilename
        # NOTE(review): colorArray is not defined in this function; it has
        # to exist at module level for this call to work - confirm.
        polymerutils.save(conformation, savePath, mode='pdb',
                          pdbGroups=colorArray)
        return os.path.abspath(savePath)

    pdbPaths = fmap(saveToPdb, enumerate(fileList))

    # One PyMOL command per line; the user script gets its own lines so it
    # cannot be glued onto the "smooth mov" command.
    scriptParts = ['hide all\n']
    scriptParts.extend('load {0}, mov\n'.format(path) for path in pdbPaths)
    scriptParts.extend(['smooth mov\n', pymolScript, '\n', 'zoom mov\n'])
    script = ''.join(scriptParts)

    tmpScriptPath = os.path.abspath(destFolder + '/movie.pymol')
    with open(tmpScriptPath, 'w') as tmpScript:
        tmpScript.write(script)

    # NOTE(review): the paths are interpolated unquoted into a shell command,
    # so folders containing spaces or shell metacharacters will break it.
    os.system("cd {0}; pymol -u {1}; cd -".format(imgFolder, tmpScriptPath))
    _mencoder(imgFolder, fps, aviFilename)
def displayHeatmap():
    """
    Plot a heatmap whose upper triangle comes from simulations and whose
    lower triangle comes from a Hi-C dataset.

    Relies on module-level names N, low and high, and on the helpers
    initModel, h5dict, completeIC, zoomArray, coarsegrain, fmap, nicePlot
    and setExceptionHook. The simulated counts are accumulated in a shared
    N x N array of doubles, coarse-grained by a factor of 20, clipped and
    normalized; entries below the diagonal are then replaced with the
    (rescaled, clipped, normalized) Hi-C data and the log of the result is
    shown with plt.imshow.
    """
    plt.figure(figsize=(5, 5))

    # Shared buffer that every simulation adds its counts to.
    shared_arr = mp.Array(ctypes.c_double, N**2)
    arr = tonumpyarray(shared_arr)
    arr.shape = (N, N)

    def doSim(i):
        """Run one simulation and add its SMC position counts to the map."""
        flatCounts = tonumpyarray(shared_arr)
        model = initModel(i)
        maxSnapshots = 10000
        numSnapshots = np.random.randint(maxSnapshots // 2, maxSnapshots)

        def nextSnapshot():
            model.steps(150)
            return model.getSMCs()

        snapshots = [nextSnapshot() for _ in range(numSnapshots)]
        pairs = np.concatenate(snapshots, axis=1)
        # Row-major flat index into the N x N map.
        flatIndex = pairs[0] * N + pairs[1]
        position, counts = np.unique(flatIndex, return_counts=True)
        with shared_arr.get_lock():
            flatCounts[position] += counts
        print("Finished!")

    setExceptionHook()

    # Reference Hi-C data for the same region.
    lowBin = low // 10
    highBin = high // 10
    mydict = h5dict(
        "/home/magus/HiC2011/Erez2014/hg19/GM12878_inSitu-all-combined-10k_HighRes.byChr",
        'r')
    hicdata = mydict.get_dataset("13 13")[lowBin:highBin, lowBin:highBin]
    hicdata = completeIC(hicdata)
    newshape = (1000 * (high - low)) // (600 * 20)
    print(hicdata.shape, newshape)
    hicdata = zoomArray(hicdata, (newshape, newshape))
    hicCeiling = np.percentile(hicdata, 99.99)
    hicdata = np.clip(hicdata, 0, hicCeiling)
    hicdata /= np.mean(np.sum(hicdata, axis=1))

    # n is the number of threads to use. On a 20-core machine I use 20.
    fmap(doSim, range(30), n=20)

    arr = coarsegrain(arr, 20)
    simCeiling = np.percentile(arr, 99.9)
    arr = np.clip(arr, 0, simCeiling)
    arr /= np.mean(np.sum(arr, axis=1))

    # Entries strictly below the diagonal are taken from the Hi-C data.
    index = np.arange(len(arr))
    belowDiagonal = index[:, None] > index[None, :]
    arr[belowDiagonal] = hicdata[belowDiagonal]

    logarr = np.log(arr + 0.0001)
    plt.imshow(
        logarr,
        vmax=np.percentile(logarr, 99.9),
        extent=[low, high, high, low],
        interpolation="none")
    nicePlot()
def averageContacts(contactIterator, inValues, N, **kwargs):
    """
    A main workhorse for averaging contacts on multiple cores into one
    shared contact map. It mostly manages the arguments and initializes the
    variables; the logic of how contacts are actually put into the shared
    memory buckets lives in ``worker``.

    PARAMETERS
    ----------
    contactIterator : iterator
        an iterator. See descriptions of "filenameContactMap" class
        for example and explanations
    inValues : iterable
        a sequence of values to pass to contactIterator (must support len()).
        Would be an array of arrays of filenames or something like that.
    N : int
        Size of one side of the resulting contactmap
    arrayDtype : ctypes dtype (default c_int32) for the contact map
    classInitArgs : args to pass to the constructor of contact iterator
    classInitKwargs : dict of keyword args to pass to the constructor
    contactProcessing : function f(contacts), should return processed contacts
    nproc : int, number of processors (default 4, at most len(inValues))
    bucketNum : int (default = nproc)
        Number of memory buckets to use
    contactBlock : int (default 5000000)
        Number of contacts to aggregate before writing
    useFmap : True, False, or callable
        If True, uses mirnylib.systemutils.fmap
        If False, uses multiprocessing.Pool.map
        Otherwise, uses the provided function, assuming it is of a fork-map
        type (different initializations are needed for forkmap and
        multiprocessing-style map).
        Sorry, no outside multiprocessing-style maps for now, it's easy to
        fix. Let me know if it is needed.

    Code that calculates a contactmap from a set of polymer conformations is
    in the methods below (averageMonomerResolutionContactMap, etc.)

    An example code that would run a contactmap from a simulation is pasted
    below.

    class simContactMap(object):
        "contactmap 'finder' for a simulation"
        def __init__(self, ind):  # accept a parameter (e.g. random number generator seed)
            self.model = initModel(ind)  # pass parameter to the function that returns me a model object
            self.count = 10000000   # how many times to run a step of the model
            self.model.steps(10000)  # initial steps of the model to equilibrate it

        def next(self):  # actual realization of the self.next method
            if self.count == 0:  # terminate the simulation if we did self.count iterations
                raise StopIteration
            self.count -= 1  # decrement the counter
            self.model.steps(30)  # advance model by 30 steps
            return np.array(self.model.getSMCs()).T  # return current LEF positions

    mymap = polychrom.contactmaps.averageContacts(simContactMap, range(20), 30000, nproc=20)
    """
    requestedProcs = kwargs.get("nproc", 4)
    nproc = min(requestedProcs, len(inValues))
    if nproc == 1:
        # A single process needs no shared memory at all.
        return averageContactsSimple(contactIterator, inValues, N, **kwargs)

    settings = {
        "arrayDtype": ctypes.c_int32,
        "bucketNum": nproc,
        "contactBlock": 5000000,
        "useFmap": False,
        "classInitArgs": [],
        "classInitKwargs": {},
        "contactProcessing": lambda x: x,
    }
    for key in settings:
        if key in kwargs:
            settings[key] = kwargs[key]
    useFmap = settings["useFmap"]

    # The map is stored as N * (N + 1) // 2 entries cut into buckets.
    finalSize = N * (N + 1) // 2
    boundaries = np.linspace(0, finalSize, settings["bucketNum"] + 1, dtype=int)
    sharedArrays = [
        mp.Array(settings["arrayDtype"], int(upper - lower))
        for lower, upper in zip(boundaries[:-1], boundaries[1:])
    ]

    argset = list(sharedArrays)
    argset += [
        settings["contactProcessing"],
        settings["classInitArgs"],
        settings["classInitKwargs"],
        contactIterator,
        settings["contactBlock"],
        N,
    ]

    if useFmap:
        # Different strategy for a local (fork-style) map: shared memory is
        # just a set of global variables created by init() right here.
        init(*argset)
        if callable(useFmap):
            fmap = useFmap
        else:
            from mirnylib.systemutils import fmap
        fmap(worker, inValues, nproc=nproc)
    else:
        # For mp.map we need an initializer because shared memory cannot be
        # pickled or passed as an argument in inValues.
        pool = mp.Pool(processes=nproc, initializer=init, initargs=argset)
        with closing(pool) as p:
            p.map(worker, inValues)

    bucketViews = [tonumpyarray(shared) for shared in sharedArrays]
    res = np.concatenate(bucketViews)
    del sharedArrays  # save memory
    return triagToNormal(res, N)
def give_slices(base, tosave, slices, sliceParams, multipliers, mode="chain",
                loadFunction=load, integrate=False, normalize=False,
                exceptionList=[], nproc=4, cutoff=1.7, binstep=1.15,
                integerSlices=True, verbose=False):
    """
    Compute averaged scaling plots for several "slices" of a set of runs.

    For every value in ``slices`` a list of file names is built from
    ``base`` by replacing the placeholders "DATA1" (run index), "DATA2"
    (slice value) and, for 3-element runs, "DATA3". Each file is loaded and
    three curves are computed for it with giveCpScaling,
    give_radius_scaling and give_distance; the curves are averaged over all
    files that could be loaded.

    Parameters
    ----------
    base : str
        File name template containing "DATA1" / "DATA2" placeholders.
    tosave : str or None
        If not None, the result is pickled to this path.
    slices : iterable
        Slice values to process, one result entry per value.
    sliceParams : tuple, int or list
        Run indices: a (first, last) tuple (inclusive), an int n meaning
        1..n, or an explicit list of indices.
    multipliers : iterable of numbers
        With integerSlices, every slice value is multiplied by each
        multiplier and truncated to int.
    mode : {"chain", "ring", "intring", "project"}
        Selects the binning and the giveCpScaling variant.
    loadFunction : callable
        Called as loadFunction(filename, False); the result (or its
        transpose) must have length 3.
    integrate, cutoff, verbose
        Forwarded to giveCpScaling (mode "project" uses a fixed cutoff of
        1.450 instead of ``cutoff``).
    normalize : bool
        If true, the contact-probability values are divided by their
        integral over the bins.
    exceptionList : list of Exception classes
        Exceptions raised while loading that cause a file to be skipped.
    nproc : int
        Number of processes passed to fmap for the main computation.
    binstep : float
        Step passed to logbins.
    integerSlices : bool
        See ``multipliers``; if false the runs are used unmodified.

    Returns
    -------
    list
        One ``[means, {"slice": value}]`` entry per slice value, where
        means is the average over files of ``np.array([a, b, c])``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported, or no sampled file could be loaded.

    Notes
    -----
    Calls ``np.seterr(invalid='raise')``, which changes numpy's global
    error handling.
    """
    # "parts" (and any unknown mode) has no contact-probability branch and
    # used to die with an UnboundLocalError inside the workers; fail early.
    supportedModes = ("chain", "ring", "intring", "project")
    if mode not in supportedModes:
        raise ValueError(
            "unsupported mode {0!r}; expected one of {1}".format(
                mode, supportedModes))
    ringMode = mode in ("ring", "intring")

    np.seterr(invalid='raise')
    ignoredExceptions = tuple(exceptionList)

    def slice2D(sliceValue, params, mult):
        """Return the sorted (run index, slice value) pairs for one slice."""
        if isinstance(params, tuple):
            indices = range(params[0], params[1] + 1)
        elif isinstance(params, int):
            indices = range(1, params + 1)
        elif isinstance(params, list):
            indices = params
        else:
            indices = []
        pairs = [(index, sliceValue) for index in indices]
        if integerSlices:
            runs = sorted(set(
                (index, int(float(value) * m))
                for index, value in pairs for m in mult))
        else:
            runs = sorted(pairs)
        print(runs)
        return runs

    def newload(filename):
        """Load one file as a 3 x N float array, or None if it is skipped."""
        try:
            data = loadFunction(filename, False)
            if len(data) != 3:
                data = data.T
            if len(data) != 3:
                raise Exception("Wrong shape of data")
            return np.asarray(data, order="C", dtype=float)
        except ignoredExceptions:
            print("file not found", filename)
            return None

    plotsBySlice = []
    for cur_slice in slices:
        runs = slice2D(cur_slice, sliceParams, multipliers)

        # Fill the placeholders of the file name template.
        files = []
        for run in runs:
            name = base.replace("DATA1", str(run[0]))
            name = name.replace("DATA2", str(run[1]))
            if len(run) > 2:
                name = name.replace("DATA3", str(run[2]))
            files.append(name)

        # Load a sparse sample of the files only to learn the chain length.
        sample = [
            x for x in fmap(newload, files[::len(files) // 20 + 1], n=3)
            if x is not None
        ]
        if not sample:
            raise ValueError(
                "none of the sampled files could be loaded for slice "
                "{0!r}".format(cur_slice))
        datlen = len(sample[0][0])

        if ringMode:
            halfBins = logbins(2, datlen // 4 - 1, binstep)
            bins2 = [2 * i for i in halfBins]
        else:
            # "chain" and "project" share the linear-chain bins; "project"
            # previously had no bins at all and failed with a NameError.
            bins2 = logbins(4, datlen - 100, binstep)
        print(bins2)
        binsrg = logbins(4, datlen - 100, binstep)

        def give_plots(filename):
            """Return np.array([a, b, c]) for one file, or None if skipped."""
            data = newload(filename)
            if data is None:
                return None

            b = give_radius_scaling(data, binsrg, ring=ringMode)

            if mode == "chain":
                a = giveCpScaling(data, bins2, cutoff, integrate,
                                  verbose=verbose)
            elif mode == "ring":
                a = giveCpScaling(data, bins2, cutoff, integrate,
                                  ring=True, verbose=verbose)
            elif mode == "intring":
                a = giveCpScaling(data, bins2, cutoff, integrate,
                                  ring=True, project=False,
                                  intContacts=True, verbose=verbose)
            else:  # mode == "project"
                a = giveCpScaling(data, bins2, 1.450, integrate,
                                  project=True, verbose=verbose)

            c = give_distance(data, bins2, ring=ringMode)

            if normalize:
                a = np.array(a)
                pos = a[0]
                values = a[1]
                # Bin edges halfway between neighbouring positions.
                edges = np.r_[1.5 * pos[0] - 0.5 * pos[1],
                              0.5 * (pos[1:] + pos[:-1]),
                              pos[-1]]
                widths = edges[1:] - edges[:-1]
                values /= np.cumsum(widths * values)[-1]
                a = [pos, values]

            a = np.array(a, dtype=float)
            b = np.array(b, dtype=float)
            c = np.array(c, dtype=float)
            return np.array([a, b, c])

        random.shuffle(files)
        parPlots = fmap(give_plots, files, n=nproc)
        parPlots = [x for x in parPlots if x is not None]

        means = np.mean(parPlots, axis=0)
        plotsBySlice.append([means, {"slice": cur_slice}])

    if tosave is not None:
        with open(tosave, 'wb') as outputFile:
            pickle.dump(plotsBySlice, outputFile, -1)
    print("Finished!!!")
    return plotsBySlice
def averagePureContactMap(
        filenames,
        cutoff=1.7,
        n=4,  # Num threads
        loadFunction=load,
        exceptionsToIgnore=[],
        printProbability=0.005):
    """
    Average a monomer-resolution ("pure") contact map over many files.

    Parameters
    ----------
    filenames : list of strings
        Filenames to average map over
    cutoff : float, optional
        Cutoff to calculate contacts
    n : int, optional
        Number of threads to use.
        By default 4 to minimize RAM consumption with pure maps.
    loadFunction : callable, optional
        Called as loadFunction(filename); must return an array with a
        ``shape``. Data whose first dimension is 3 is transposed.
    exceptionsToIgnore : list of Exceptions
        List of exceptions to ignore when finding the contact map.
        Put IOError there if you want it to ignore missing files.
    printProbability : float, optional
        Probability of printing the name of each successfully loaded file.

    Returns
    -------
    An NxN (for pure map) numpy array with the contact map.

    Raises
    ------
    ValueError
        If filenames is empty or no file could be loaded.
    Exception
        Any loading error not listed in exceptionsToIgnore is reported and
        re-raised.
    """
    if len(filenames) == 0:
        raise ValueError("no valid files found in filenames")

    ignoredExceptions = tuple(exceptionsToIgnore or ())

    # The contact map is huge (maybe a gigabyte!), so we cannot build one
    # map per file and add them all up. Instead each worker creates a single
    # "average contact map", loads its files one by one and adds the
    # contacts of each file into that map. The maps of the different workers
    # are then added together here.
    n = min(n, len(filenames))
    subvalues = [filenames[i::n] for i in range(n)]

    def myaction(values):
        """Worker: accumulate the map of one group of filenames."""
        mysum = None  # future contact map
        for filename in values:
            try:
                data = loadFunction(filename)
            except ignoredExceptions:
                print("file not found", filename)
                continue
            except Exception as error:
                # Report and propagate. Returning an error code here would
                # end up being added to the summed contact map.
                print("Unexpected error:", type(error))
                print("File is: ", filename)
                raise

            if np.random.random() < printProbability:
                print(filename)

            if data.shape[0] == 3:
                data = data.T

            if mysum is None:
                # First file of this worker: warn about sizes, create map.
                if len(data) > 6000:
                    warnings.warn(
                        UserWarning(
                            'very large contact map'
                            ' may cause errors. these may be fixed with n=1 threads.'
                        ))
                if len(data) > 20000:
                    warnings.warn(
                        UserWarning('very large contact map'
                                    ' may be difficult to visualize.'))
                mysum = pureMap(data, cutoff)
            else:
                # Use the existing map and fill in contacts in place.
                pureMap(data, cutoff, mysum)
        return mysum

    blocks = [block for block in fmap(myaction, subvalues)
              if block is not None]
    if not blocks:
        raise ValueError("no valid files found in filenames")

    total = blocks[0]
    for block in blocks[1:]:
        total = total + block
    return total
def averageBinnedContactMap(
        filenames,
        chains=None,
        binSize=None,
        cutoff=1.7,
        n=4,  # Num threads
        loadFunction=load,
        exceptionsToIgnore=None,
        printProbability=1):
    """
    Returns an average contact map of a set of conformations.
    Non-existing files are ignored if exceptionsToIgnore is set to IOError.

    An example:

    .. code-block:: python

        >>> filenames = ["myfolder/blockd%d.dat" % i for i in range(1000)]
        >>> cmap, starts = averageBinnedContactMap(filenames)  # getting cmap
        # either showing a log of a map (+1 for zeros)
        >>> plt.imshow(numpy.log(cmap + 1))
        # or truncating a map
        >>> vmax = np.percentile(cmap, 99.9)
        >>> plt.imshow(cmap, vmax=vmax)
        >>> plt.show()

    Parameters
    ----------
    filenames : list of strings
        Filenames to average map over
    chains : list of tuples or Nx2 array, optional
        (start, end+1) of each chain. By default a single chain spanning
        the first conformation that could be loaded.
    binSize : int, optional
        Size of each bin in monomers. By default 1/500 of the length of the
        first loadable conformation, but at least 1.
    cutoff : float, optional
        Cutoff to calculate contacts
    n : int, optional
        Number of threads to use.
        By default 4 to minimize RAM consumption.
    loadFunction : callable, optional
        Called as loadFunction(filename); must return an array with a
        ``shape``. Data whose first dimension is 3 is transposed.
    exceptionsToIgnore : list of Exceptions, optional
        List of exceptions to ignore when finding the contact map.
        Put IOError there if you want it to ignore missing files.
        None (the default) ignores nothing.
    printProbability : float, optional
        Probability of printing the name of each successfully loaded file.

    Returns
    -------
    tuple of two values:
    (i) MxM numpy array with the contact map binned to binSize resolution.
    (ii) chromosomeStarts, a list of start sites for the binned map.

    Raises
    ------
    ValueError
        If none of the files can be loaded.
    """
    # tuple(None) would itself raise inside the except clause below.
    if exceptionsToIgnore is None:
        ignoredExceptions = ()
    else:
        ignoredExceptions = tuple(exceptionsToIgnore)

    n = min(n, len(filenames))
    subvalues = [filenames[i::n] for i in range(n)]

    # Load the first readable file to learn the size of a conformation.
    for candidate in filenames:
        try:
            data = loadFunction(candidate)
        except Exception:
            continue
        break
    else:
        print("no valid files found in filenames")
        raise ValueError("no valid files found in filenames")

    # Same orientation as in the workers, so len(data) counts monomers.
    if data.shape[0] == 3:
        data = data.T

    if chains is None:
        chains = [[0, len(data)]]
    if binSize is None:
        # At least 1: short conformations would otherwise get binSize 0.
        binSize = max(1, int(np.floor(len(data) / 500)))

    chains = np.asarray(chains)
    chainBinNums = np.ceil((chains[:, 1] - chains[:, 0]) / (0.0 + binSize))

    # Bin starts of every chain, followed by one closing edge.
    bins = [
        binSize * np.arange(int(binNum)) + chainStart
        for binNum, chainStart in zip(chainBinNums, chains[:, 0])
    ]
    bins.append(np.array([chains[-1, 1] + 1]))
    bins = np.concatenate(bins) - .5

    Nbase = len(bins) - 1
    if Nbase > 10000:
        warnings.warn(
            UserWarning('very large contact map'
                        ' may be difficult to visualize'))

    chromosomeStarts = np.hstack((0, np.cumsum(chainBinNums)))

    def myaction(values):
        """Worker: accumulate the binned map of one group of filenames."""
        mysum = None  # future contact map
        for filename in values:
            try:
                data = loadFunction(filename)
            except ignoredExceptions:
                print("file not found", filename)
                continue

            if np.random.random() < printProbability:
                print(filename)

            if data.shape[0] == 3:
                data = data.T

            if mysum is None:
                # First file of this worker: create the map.
                mysum = rescaledMap(data, bins, cutoff)
            else:
                # Use the existing map and fill in contacts in place.
                rescaledMap(data, bins, cutoff, mysum)
        return mysum

    blocks = [block for block in fmap(myaction, subvalues)
              if block is not None]
    if not blocks:
        raise ValueError("no valid files found in filenames")

    total = blocks[0]
    for block in blocks[1:]:
        total = total + block
    # Add the transpose so the returned map is symmetric.
    total = total + total.T

    return total, chromosomeStarts
def makeMovie(fileList, imgFolder, fps=20, aviFilename='output.avi'):
    """
    Render every conformation in fileList with pymol_show and encode the
    resulting PNG frames into a movie.

    For each data file, a pickled sequence of monomer indices is read from
    a file named "coreParticles" in the same folder. Monomers within
    ``offset`` (2) of each of those indices form the "brown" group, and the
    stretch between every 20th index (j % 20 == 1) and the next index forms
    the coloured regions passed to pymol_show.do_coloring.

    Parameters
    ----------
    fileList : list of str
        Paths of the conformation files, one per frame. If empty, nothing
        is done.
    imgFolder : str
        Folder the numbered PNG frames are written to.
    fps : int, optional
        Frame rate passed to _mencoder.
    aviFilename : str, optional
        Output file name passed to _mencoder.

    Returns
    -------
    None
    """
    offset = 2
    if not fileList:
        return

    numFrames = len(fileList)
    # Zero-padding width so that frame file names sort in frame order.
    numDigits = int(np.ceil(np.log10(numFrames)))

    def smallFunction(x):
        """Render one (frame index, data path) pair to a PNG file."""
        frameIndex, dataPath = x
        savePath = imgFolder + '/{0:0{width}}.png'.format(
            frameIndex, width=numDigits)

        coreParticlesPath = os.path.join(
            os.path.split(dataPath)[0], "coreParticles")
        # Pickles are binary data; close the file as soon as it is read.
        with open(coreParticlesPath, 'rb') as coreFile:
            coreParticles = cPickle.load(coreFile)

        data = load(dataPath)
        numMonomers = len(data)

        # 0 = plain chain, 1 = around a core particle, 2 = selected stretch.
        colorArray = np.zeros(numMonomers, int)
        lastCore = len(coreParticles) - 1
        for j, core in enumerate(coreParticles):
            if (j < lastCore) and (j % 20 == 1):
                stretchEnd = min(coreParticles[j + 1] - 1, numMonomers)
                colorArray[max(core, 0):stretchEnd] = 2
            colorArray[max(core - offset, 0):
                       min(core + offset + 1, numMonomers)] = 1

        regions1 = pymol_show.createRegions(colorArray == 1)
        print(len(regions1))

        regions2 = pymol_show.createRegions(colorArray == 2)
        M = len(regions2)
        print(M)

        # Spread the M regions evenly over the ten "br0".."br9" colours.
        allColors = ["br{0}".format(k) for k in range(10)]
        S = len(allColors)
        colors2 = [
            allColors[int(float(k) * float(S) / float(M))] for k in range(M)
        ]
        transparencies2 = [0 for _ in colors2]

        # One PyMOL command per line. The backslash-newline pairs inside
        # set_view are removed by Python, leaving a single set_view line.
        support = """
create back, chain 1
set cartoon_transparency,0.000000,back
set cartoon_trace_atoms,1, back
set cartoon_tube_radius,0.280000, back
cartoon tube, back
color brown, back
set depth_cue, 0
set field_of_view, 10
set_view (\
     0.362947434,   -0.752908945,    0.548998356,\
    -0.843832374,   -0.015661031,    0.536378920,\
    -0.395252228,   -0.657936990,   -0.641013563,\
    -0.000659317,    0.000253409, -788.783874512,\
    81.561027527,   81.701515198,  121.653610229,\
   615.844299316,  961.749450684,  -10.000000000 )
png {savepath}
quit
""".format(savepath=savePath)

        pymol_show.do_coloring(
            data,
            list(regions2),
            list(colors2),
            list(transparencies2),
            chainRadius=.25,
            subchainRadius=.35,
            chainTransparency=0.9,
            showChain="worm",
            pdbGroups=colorArray,
            showGui=True,
            multiplier=.8,
            support=support)

    fmap(smallFunction, enumerate(fileList), n=8)
    _mencoder(imgFolder, fps, aviFilename)