def main(options):
    """Fold every record of a FASTA file in parallel (via dtm) and pickle the results.

    Parameters
    ----------
    options : namespace with attributes
        fasta : str  -- path to the input FASTA file; the output pickle path is
                        derived from it by replacing ".fa" with ".pickle"
        fp : str     -- fold-parameter string, with underscores standing in for
                        spaces (restored before use)

    Fixes over the original:
    - both file handles were leaked (never closed); now managed with `with`
    - the pickle was written to a text-mode file and passed via the
      non-portable `file=` keyword; now binary mode, positional argument
    """
    fafile = options.fasta
    outpick = fafile.replace(".fa", ".pickle")
    # The fold-parameter string is passed on the command line with
    # underscores in place of spaces; restore the spaces for the worker.
    foldparams = options.fp.replace("_", " ")

    with open(fafile, 'r') as handle:
        # SeqIO.parse yields records lazily, so folding must happen while
        # the handle is still open.
        records = SeqIO.parse(handle, 'fasta')
        results = dtm.map(get_folds_probabilities_single, records,
                          foldparams=foldparams)

    # Binary mode is required for pickle portability.
    with open(outpick, 'wb') as out:
        pickle.dump(results, out)
def main():
    """Run the DTM self-test suite and return 0.

    Exercises dtm's synchronous primitives (map/apply/repeat/filter), the
    asynchronous ones (map_async/apply_async/imap/imap_unordered), the
    interaction helpers (testAll/waitAll/waitAny/testAny) and finally
    parameter passing and exception propagation.  Each result is compared
    against the serial Python equivalent; failures are logged and counted.

    Fix over the original: the testAny() failure branch was the only test
    that did not increment countFailed; it now does, consistent with the
    rest of the suite.

    Returns
    -------
    int : always 0 (failures are reported via the log and countFailed)
    """
    beginTime = time.time()
    countFailed = 0      # number of tests that failed
    list1 = range(500)   # shared input for all map-style tests

    _logger.info("[%s] DTM test suite started", time.time() - beginTime)
    _logger.info("[%s] Testing worker id generation : '%s'",
                 time.time() - beginTime, dtm.getWorkerId())

    # ------------------------------------------------------------------
    # Synchronous calls
    # ------------------------------------------------------------------
    _logger.info("[%s] Testing synchronous calls...", time.time() - beginTime)

    list1r = dtm.map(mapFunc1, list1)
    list1t = list(map(mapFunc1, list1))  # serial reference result
    if list1r != list1t:
        _logger.warning("[%s] DTM synchronous map test FAILED!",
                        time.time() - beginTime)
        countFailed += 1
    else:
        _logger.info("[%s] DTM synchronous map test successful",
                     time.time() - beginTime)

    applyTestr = dtm.apply(applyFunc1, "0123456789")
    if applyTestr != 10:  # applyFunc1 is expected to return len of its arg
        _logger.warning("[%s] DTM synchronous apply test FAILED!",
                        time.time() - beginTime)
        countFailed += 1
    else:
        _logger.info("[%s] DTM synchronous apply test successful",
                     time.time() - beginTime)

    repeatTestr = dtm.repeat(applyFunc1, 20, "0123456789")
    repeatTestt = [10 for i in range(20)]
    if repeatTestr != repeatTestt:
        _logger.warning("[%s] DTM synchronous repeat test FAILED!",
                        time.time() - beginTime)
        countFailed += 1
    else:
        _logger.info("[%s] DTM synchronous repeat test successful",
                     time.time() - beginTime)

    filterTestr = dtm.filter(filterFunc1, list1)
    filterTestt = list(filter(filterFunc1, list1))
    if filterTestr != filterTestt:
        _logger.warning("[%s] DTM synchronous filter test FAILED!",
                        time.time() - beginTime)
        countFailed += 1
    else:
        _logger.info("[%s] DTM synchronous filter test successful",
                     time.time() - beginTime)

    # ------------------------------------------------------------------
    # Asynchronous calls
    # ------------------------------------------------------------------
    _logger.info("[%s] Testing asynchronous calls...", time.time() - beginTime)

    # If the async request is already done almost instantly, it probably ran
    # synchronously -- only a heuristic warning, not counted as a failure.
    timeInit = time.time()
    mapAsyncReq2 = dtm.map_async(mapFunc2, list1)
    if mapAsyncReq2.ready() and time.time() - timeInit < 0.1:
        _logger.warning(
            "[%s] DTM asynchronous map test seems to have failed by time!",
            time.time() - beginTime)

    mapAsyncReq1 = dtm.map_async(mapFunc1, list1)
    list2t = [arg * 3 for arg in list1]  # serial reference for mapFunc2
    mapAsyncReq1.wait()
    mapAsyncReq2.wait()
    if list1t == mapAsyncReq1.get() and list2t == mapAsyncReq2.get():
        _logger.info("[%s] DTM asynchronous map test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM asynchronous map test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    timeInit = time.time()
    applyAsyncReq1 = dtm.apply_async(applyFunc2, 1)
    if applyAsyncReq1.ready() and time.time() - timeInit < 0.5:
        _logger.warning(
            "[%s] DTM asynchronous apply test seems to have failed by time!",
            time.time() - beginTime)

    applyAsyncReq2 = dtm.apply_async(applyFunc2, 2)
    applyAsyncReq3 = dtm.apply_async(applyFunc2, 3)
    applyAsyncReq1.wait()
    applyAsyncReq2.wait()
    applyAsyncReq3.wait()
    # applyFunc2 is expected to triple its argument
    if applyAsyncReq1.get() == 3 and applyAsyncReq2.get() == 6 \
            and applyAsyncReq3.get() == 9:
        _logger.info("[%s] DTM asynchronous apply test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM asynchronous apply test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    imapObj = dtm.imap(mapFunc1, list1)
    list3r = [i for i in imapObj]
    if list3r == list1t:
        _logger.info("[%s] DTM (a)synchronous imap test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM (a)synchronous imap test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    imapNotOrderedObj = dtm.imap_unordered(mapFunc1, list1, 50)
    list4r = [i for i in imapNotOrderedObj]
    list4r_sorted = list(sorted(list4r))
    # imap_unordered is only *proven* unordered when the raw result differs
    # from its sorted form; an already-sorted result is indistinguishable
    # from imap(), hence the "MAY have failed" branch.
    if list4r != list4r_sorted and list4r_sorted == list1t:
        _logger.info("[%s] DTM asynchronous imap_unordered test successful",
                     time.time() - beginTime)
    elif list4r_sorted == list1t:
        _logger.info(
            "[%s] DTM asynchronous imap_unordered MAY have failed (same behavior as imap())",
            time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM asynchronous imap_unordered test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    # ------------------------------------------------------------------
    # Asynchronous interactions
    # ------------------------------------------------------------------
    _logger.info("[%s] Testing asynchronous interactions...",
                 time.time() - beginTime)

    mapAsyncReq2 = dtm.map_async(mapFunc2, list1)
    mapAsyncReq1 = dtm.map_async(mapFunc1, list1)
    # Both requests were just launched, so testAll() should report not-done.
    if dtm.testAll([mapAsyncReq2, mapAsyncReq1]):
        _logger.warning("[%s] DTM testAll() test FAILED",
                        time.time() - beginTime)
        countFailed += 1
    else:
        _logger.info("[%s] DTM testAll() test successful",
                     time.time() - beginTime)

    dtm.waitAll()
    if mapAsyncReq1.ready() and mapAsyncReq2.ready():
        _logger.info("[%s] DTM waitAll() test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM waitAll() test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    applyAsyncReq1 = dtm.apply_async(applyFunc1, "0123456789")
    mapAsyncReq2 = dtm.map_async(mapFunc2, list1)
    # The short apply should normally finish before the 500-element map.
    retVal = dtm.waitAny()
    if retVal == applyAsyncReq1 and applyAsyncReq1.get() == 10 \
            and mapAsyncReq2.ready() == False:
        _logger.info("[%s] DTM waitAny() test successful",
                     time.time() - beginTime)
    elif retVal == mapAsyncReq2 and isinstance(mapAsyncReq2.get(), list):
        _logger.info("[%s] DTM waitAny() test PROBABLY successful but weird",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM waitAny() test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    mapAsyncReq2.wait()
    if dtm.testAny() == mapAsyncReq2:
        _logger.info("[%s] DTM testAny() test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM testAny() test FAILED",
                        time.time() - beginTime)
        countFailed += 1  # fix: this failure was previously not counted

    # ------------------------------------------------------------------
    # Parameters and exception handling
    # ------------------------------------------------------------------
    _logger.info("[%s] Testing parameters and exceptions handling...",
                 time.time() - beginTime)

    applyParamPassr = dtm.apply(applyFunc4, 1, 2, "abc",
                                bb={'a': 2, 'b': 3, 'c': 4},
                                cc=range(10), dd=13.37)
    applyParamPasst = applyFunc4(1, 2, "abc",
                                 bb={'a': 2, 'b': 3, 'c': 4},
                                 cc=range(10), dd=13.37)
    if applyParamPassr == applyParamPasst:
        _logger.info("[%s] DTM parameters passing test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM parameters passing test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    # applyFunc3 divides by its argument, so mapping over [-2, -1, 0] must
    # raise ZeroDivisionError through dtm -- catching it means success.
    try:
        applyExceptTestr = dtm.map(applyFunc3, [-2, -1, 0])
    except ZeroDivisionError:
        _logger.info("[%s] DTM exception catch test successful",
                     time.time() - beginTime)
    else:
        _logger.warning("[%s] DTM exception catch test FAILED",
                        time.time() - beginTime)
        countFailed += 1

    _logger.info("[%s] DTM test suite done with %i errors",
                 time.time() - beginTime, countFailed)
    return 0
def write_all_trajectories(self, input_dir, output_dir, stride, max_rmsd,
                           min_gens, center_conformations, num_proc,
                           input_style, update=False):
    """
    Convert all of the trajectories in the FAH project in input_dir to
    lh5 trajectory files which will be placed in output dir.

    If the 'update' flag is set, then will use the memory object to check
    for previously converted data, and add to it (rather than reconverting
    everything). This functionality can be more cleanly called through the
    update_trajectories() method.

    Parameters
    ----------
    input_dir : str
        The directory to look for XTC/DCD files in.
    output_dir : str
        The place to write the converted lh5s
    stride : int
        The size of the stride to employ. E.g., if stride = 3, the script
        keeps every 3rd MD snapshot from the original data. Useful to throw
        away highly correlated data if snapshots were saved frequently.
    max_rmsd : float
        Throw away any data that is further than `max_rmsd` (in nm) from the
        pdb file associated with the project. This is used as a sanity check
        to prevent including, e.g. data from a simulation that is blowing up.
    min_gens : int
        Discard trajectories with fewer than `min_gens` generations.
    center_conformations : bool
        Whether to center the converted (lh5) conformations.
    num_proc : int or str
        Number of processors to employ, or the sentinel string
        'use_dtm_instead' to delegate to the DTM MPI map. Note that this
        function is typically I/O limited, so paralellism is unlikely to
        yield much gain.
    input_style : {'FAH', 'FILE'}
        If you use input_style = 'FAH', this code uses knowledge of the
        RUN*/CLONE* directory structure to yield all the CLONE directories.
        If you use input_style = 'FILE', this code uses os.walk() which is
        A LOT slower because it has to stat every file, but is capable of
        recursively searching for xtc files to arbitrary depths.
    update : bool
        If `True`, then tries to figure out what data has already been
        converted by reading the "memory state" in the provided ProjectInfo
        file, and only converts new data. If `False`, does a fresh
        re-convert.

    Raises
    ------
    RuntimeError
        If no conversion jobs are found under `input_dir`.

    Notes
    -----
    Since sometimes a conversion fails, we collect all trajectories at the
    end and renumber them such that they are contiguously numbered.
    """
    if update:
        assert os.path.exists(output_dir)
    else:
        try:
            os.mkdir(output_dir)
        except OSError:
            logger.error('Error: The directory %s already exists', output_dir)
            sys.exit(1)

    # dtm does not play nice with OpenMP, so disable parallel RMSD when
    # running under the DTM MPI map.
    use_parallel_rmsd = (num_proc != 'use_dtm_instead')

    # Build one self-contained job dict per CLONE directory so the mapper
    # can be dispatched by any of the three execution backends below.
    jobs = []
    for i, clone_dir in enumerate(
            self.yield_xtc_directories(input_dir, input_style)):
        job = {'clone_dir': clone_dir,
               'output_dir': output_dir,
               'pdb_file': self.pdb_topology,
               'trajectory_number': i,
               'stride': stride,
               'max_rmsd': max_rmsd,
               'min_gens': min_gens,
               'center_conformations': center_conformations,
               'memory_check': update,
               'omp_parallel_rmsd': use_parallel_rmsd}
        jobs.append(job)

    if len(jobs) == 0:
        raise RuntimeError('No conversion jobs found!')

    if num_proc == 'use_dtm_instead':
        # use DTM mpi parallel map
        dtm.map(self.write_trajectory_mapper, jobs)
    elif num_proc > 1:
        # use multiprocessing
        pool = Pool(processes=num_proc)
        pool.map(self.write_trajectory_mapper, jobs)
    else:
        # use regular serial execution; an explicit loop (rather than the
        # original map() call) also stays correct under Python 3's lazy map
        for job in jobs:
            self.write_trajectory_mapper(job)

    # Rename trajectory files such that they have contiguous numbering
    logger.info(
        "Finished Generating Trajectories. Renaming them now in contiguous order"
    )
    mapping = {}  # document the directory changes, allowing us to update memory
    for i, filename in enumerate(sorted(os.listdir(output_dir), key=keynat)):
        path = os.path.join(output_dir, filename)
        new_path = os.path.join(output_dir, "trj%d.lh5" % i)
        os.rename(path, new_path)
        mapping[path] = new_path

    # update the memory hash to account for our renumbering
    for key in self.memory.keys():
        if key not in ['convert_parameters', 'SerializerFilename']:
            logger.info("%s --> %s", self.memory[key][0],
                        mapping[self.memory[key][0]])
            self.memory[key][0] = mapping[self.memory[key][0]]

    # save the parameters used for this run in the memory file, and write to disk
    logger.info("Generating Project File: %s", self.projectinfo_file)
    if update:
        try:
            # if we are updating, just start w fresh slate
            os.remove(self.projectinfo_file)
        except OSError:
            # best-effort delete: the file may simply not exist yet
            pass

    self.memory['convert_parameters'] = (input_dir, output_dir, stride,
                                         max_rmsd, min_gens,
                                         center_conformations, num_proc,
                                         self.projectinfo_file, input_style)

    Project.CreateProjectFromDir(Filename=self.projectinfo_file,
                                 TrajFilePath=output_dir,
                                 TrajFileBaseName='trj',
                                 TrajFileType='.lh5',
                                 ConfFilename=self.pdb_topology,
                                 initial_memory=cPickle.dumps(self.memory))

    logger.info("Data converted properly.")
    return
def main(options): if options.gene is None: #options.gene = x = list(map(str.strip, open(base + "/lovci/projects/FOX2/FOX2_human_brain/CLIP/analysis_gsnap/bound_genes.slop.p05.t0.intron_only.txt").readlines()))[:-1] #if options.species == "hg19": #options.gene = x = list(map(str.strip, open(base + "/lovci/projects/conservation/hg19/mammal_cons/ultra_allIntron.genes.txt").readlines()))[:-1] #if options.species == "mm9": # options.gene = x = list(map(str.strip, open(base + "/lovci/projects/conservation/mm9/ultra_allIntron.genes.txt").readlines()))[:-1] genelist = get_names(options.db) #print "which genes to run?... this will take awhile" else: genelist = options.gene #mongoport = "8585" #mongo = Popen(["ssh", "-L", ("%s:localhost:%s" %(mongoport, mongoport)), "oolite", "-N"]) #connect to mongo db if options.max_genes is not None: if not len( genelist ) < options.max_genes: #already < max_genes genes in genelist import random #sample a random subset genelist = random.sample(genelist, options.max_genes) if not os.path.exists(options.outdir): print "Creating output directory %s" % (options.outdir) os.mkdir(options.outdir) if options.serial is True: for gene in genelist: geneLinks = fold_a_dir(gene, rewrite=options.rewrite, mfe_cutoff=options.mfe_cutoff, dir=options.dbdir, species=options.species, outdir=options.outdir) if geneLinks == None or len(geneLinks) == 0: print "There were no links found for %s" % (gene) continue if options.PET is True: if not options.species == "hg19": print "only hg19 works for PET" raise Exception tree = (base + "/lovci/projects/structure/hg19/PET_test/tree") from Bio import Phylo leaves = Phylo.read(tree, 'newick') speciesList = "-".join( [i.name for i in leaves.get_terminals()]) del leaves import conserved_structure with conserved_structure.OverlapWith( geneLinks, options.proxCons, options.distCons) as conservedOverlappers: if conservedOverlappers.enter_ok is True: conservedOverlappers = conservedOverlappers.names else: if geneLinks is None 
or len(geneLinks) == 0: print "There are no links for this gene: %s" % ( gene) else: print "errors overlapping conserved regiosn with this gene %s that has this many links: %d" % ( gene, len(geneLinks)) raise Exception print "There are %d Links overlapping conserved regionsin gene %s" % ( len(conservedOverlappers), gene) for link in geneLinks: if not link.name in conservedOverlappers: continue linkObject = conserved_structure.RNApair(link) linkObject.multiZ("hg19", "hg19_46", speciesList) linkObject.aliFasta(prefix=linkObject.name) linkObject.PETcofold(tree=tree) if linkObject.PETcofoldScore == "error": print "Error finding PETcofold score for %s" % ( linkObject.name) else: print "%s has PETcofold score %f" % ( linkObject.name, linkObject.PETcofoldScore) with conserved_structure.MongoConn( "compute-2-2", 8585, 'RNAlinkDB', 'ConsLinks') as DBcon: if DBcon.enter_ok is True: DBcon = DBcon.con DBcon.save(linkObject.__dict__, safe=True) print "Saved %s" % (linkObject.name) else: print "There was a problem storing %s" % ( linkObject.name) raise Exception else: dtm.map(fold_a_dir, genelist, rewrite=options.rewrite, mfe_cutoff=options.mfe_cutoff, dir=options.dbdir, species=options.species, outdir=options.outdir)