def run_mrbayes(self, ipyclient, force=False, quiet=False):
    """ calls the mrbayes block in each nexus file. """

    ## get all the nexus files for this object
    minidir = os.path.realpath(os.path.join(self.workdir, self.name))
    nexus_files = glob.glob(os.path.join(minidir, "*.nex"))

    ## clear existing files
    #existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex"))
    existing = glob.glob(os.path.join(minidir, "*.nex.*"))
    if any(existing):
        if force:
            for rfile in existing:
                os.remove(rfile)
        else:
            raise IPyradWarningExit(EXISTING_NEXdot_FILES.format(minidir))

    ## write new nexus files, or should users do that before this?
    #self.write_nexus_files(force=True)

    ## load balancer
    lbview = ipyclient.load_balanced_view()

    ## submit each to be processed
    asyncs = []
    for nex in nexus_files:
        async = lbview.apply(_call_mb, nex)
        asyncs.append(async)

    ## track progress
    start = time.time()
    printstr = "[mb] infer gene-tree posteriors | {} | "
    while 1:
        ready = [i.ready() for i in asyncs]
        elapsed = datetime.timedelta(seconds=int(time.time() - start))
        if not quiet:
            progressbar(len(ready), sum(ready), printstr.format(elapsed), spacer="")
        if len(ready) == sum(ready):
            if not quiet:
                print("")
            break
        else:
            time.sleep(0.1)

    ## check success
    for async in asyncs:
        if not async.successful():
            raise IPyradWarningExit(async.result())
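## Example usage (an illustrative sketch, not part of this module): assuming an
## ipcluster instance is already running and `bb` is an initialized bucky
## analysis object with its nexus files written, the mrbayes step could be
## launched like this:
##
##     import ipyparallel as ipp
##     ipyclient = ipp.Client()
##     bb.run_mrbayes(ipyclient, force=True)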
def run_mbsum(self, ipyclient, force=False, quiet=False):
    """ Sums two replicate mrbayes runs for each locus """
    minidir = os.path.realpath(os.path.join(self.workdir, self.name))
    ## sort glob results so the run1 and run2 tree files pair up by locus
    ## regardless of filesystem listing order
    trees1 = sorted(glob.glob(os.path.join(minidir, "*.run1.t")))
    trees2 = sorted(glob.glob(os.path.join(minidir, "*.run2.t")))

    ## clear existing files
    existing = glob.glob(os.path.join(self.workdir, self.name, "*.sumt"))
    if any(existing):
        if force:
            for rfile in existing:
                os.remove(rfile)
        else:
            path = os.path.join(self.workdir, self.name)
            raise IPyradWarningExit(EXISTING_SUMT_FILES.format(path))

    ## load balancer
    lbview = ipyclient.load_balanced_view()

    ## submit each to be processed
    asyncs = []
    for tidx in xrange(len(trees1)):
        rep1 = trees1[tidx]
        rep2 = trees2[tidx]
        outname = os.path.join(minidir, str(tidx) + ".sumt")
        async = lbview.apply(_call_mbsum, *(rep1, rep2, outname))
        asyncs.append(async)

    ## track progress
    start = time.time()
    printstr = "[mbsum] sum replicate runs | {} | "
    while 1:
        ready = [i.ready() for i in asyncs]
        elapsed = datetime.timedelta(seconds=int(time.time() - start))
        if not quiet:
            progressbar(len(ready), sum(ready), printstr.format(elapsed), spacer="")
        if len(ready) == sum(ready):
            if not quiet:
                print("")
            break
        else:
            time.sleep(0.1)

    ## check success
    for async in asyncs:
        if not async.successful():
            raise IPyradWarningExit(async.result())
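## File layout expected by run_mbsum (a sketch; the locus names are
## hypothetical): each locus has two replicate mrbayes tree files that are
## summed into a single .sumt file indexed by position, e.g.
##
##     {workdir}/{name}/locA.nex.run1.t + locA.nex.run2.t  ->  0.sumt
##     {workdir}/{name}/locB.nex.run1.t + locB.nex.run2.t  ->  1.sumt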
def batch(
    baba,
    ipyclient=None,
    ):
    """ distributes jobs to the parallel client """

    ## parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## if ms generator make into reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## parse taxdicts into names and lists if it is a dictionary
    #if isinstance(taxdicts, dict):
    #    names, taxdicts = taxdicts.keys(), taxdicts.values()
    #else:
    #    names = []
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## an array to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## TODO: Setup a wrapper to find and cleanup ipyclient
    ## define the function and parallelization to use,
    ## if no ipyclient then drops back to using multiprocessing.
    if not ipyclient:
        # ipyclient = ip.core.parallel.get_client(**self._ipcluster)
        raise IPyradError("you must enter an ipyparallel.Client() object")
    else:
        lbview = ipyclient.load_balanced_view()

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines
    ## if it's a str (locifile) then parse it here just once.
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")
    if isinstance(handle, list):
        pass  #sims()

    ## iterate over tests (repeats mindicts if fewer than taxdicts)
    itests = iter(taxdicts)
    imdict = itertools.cycle([mindicts])

    #for test, mindict in zip(taxdicts, itertools.cycle([mindicts])):
    for i in xrange(len(ipyclient)):

        ## next entries unless fewer than len ipyclient, skip
        try:
            test = next(itests)
            mindict = next(imdict)
        except StopIteration:
            continue

        ## if it's sim data then convert to an array
        if sims:
            loci = _msp_to_arr(handle, test)
            args = (loci, test, mindict, nboots)
            print("not yet implemented")
            #asyncs[idx] = lbview.apply_async(dstat, *args)
        else:
            args = [loci, test, mindict, nboots]
            asyncs[idx] = lbview.apply(dstat, *args)
        idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            keys = [i for (i, j) in asyncs.items() if j.ready()]

            ## check for failures
            for job in keys:
                if not asyncs[job].successful():
                    raise IPyradWarningExit(
                        " error: {}: {}".format(job, asyncs[job].exception()))
                ## enter results for successful jobs
                else:
                    _res, _bot = asyncs[job].result()

                    ## store D4 results
                    if _res.shape[0] == 1:
                        resarr[job] = _res.T.as_matrix()[:, 0]
                        bootsarr[job] = _bot

                    ## or store D5 results
                    else:
                        paneldict[job] = _res.T

                    ## remove old job
                    del asyncs[job]
                    finished += 1

                    ## submit next job if there is one.
                    try:
                        test = next(itests)
                        mindict = next(imdict)
                        if sims:
                            loci = _msp_to_arr(handle, test)
                            args = (loci, test, mindict, nboots)
                            print("not yet implemented")
                            #asyncs[idx] = lbview.apply_async(dstat, *args)
                        else:
                            args = [loci, test, mindict, nboots]
                            asyncs[idx] = lbview.apply(dstat, *args)
                        idx += 1
                    except StopIteration:
                        pass

            ## count finished and break if all are done.
            #fin = idx - len(asyncs)
            elap = datetime.timedelta(seconds=int(time.time() - start))
            printstr = " calculating D-stats | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test
    if len(test) == 4:
        if not names:
            names = range(len(taxdicts))
        #print("resarr")
        #print(resarr)
        resarr = pd.DataFrame(resarr,
            index=names,
            columns=[
                "dstat", "bootmean", "bootstd", "Z",
                "ABBA", "BABA", "nloci",
            ])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr

    else:
        ## order results dfs
        listres = []
        for key in range(len(paneldict)):
            listres.append(paneldict[key])

        ## make into a multi-index dataframe
        ntests = len(paneldict)
        multi_index = [
            np.array([[i] * 3 for i in range(ntests)]).flatten(),
            np.array(['p3', 'p4', 'shared'] * ntests),
        ]
        resarr = pd.DataFrame(
            data=pd.concat(listres).as_matrix(),
            index=multi_index,
            columns=listres[0].columns,
        )
        return resarr, None
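## Hedged usage sketch for batch() (illustrative only; the object and the
## taxon-dictionary key names below are assumptions, not taken from this code):
##
##     import ipyparallel as ipp
##     ipyclient = ipp.Client()
##     ## `bb` is a baba object whose .data is a .loci file path and whose
##     ## .tests holds one or more taxon dictionaries, e.g.
##     ## {"p1": ["a"], "p2": ["b"], "p3": ["c"], "p4": ["d"]}
##     resarr, bootsarr = batch(bb, ipyclient)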
def run_bucky(self, ipyclient, force=False, quiet=False, subname=False):
    """
    Runs bucky for a given set of parameters and stores the result to the
    ipa.bucky object. The results will be stored by default with the name
    '{name}-{alpha}' unless an argument is passed for 'subname' to customize
    the output name.

    Parameters:
    -----------
    subname (str):
        A custom name prefix for the output files produced by the bucky
        analysis and output into the {workdir}/{name} directory.
    force (bool):
        If True then existing result files with the same name prefix will
        be overwritten.
    quiet (bool):
        If True the progress bars will be suppressed.
    ipyclient (ipyparallel.Client):
        An active ipyparallel client to distribute jobs to.
    """
    ## check for existing results files
    minidir = os.path.realpath(os.path.join(self.workdir, self.name))
    infiles = glob.glob(os.path.join(minidir, "*.sumt"))
    outroot = os.path.realpath(os.path.join(self.workdir, self.name))

    ## build alpha list
    if isinstance(self.params.bucky_alpha, list):
        alphas = self.params.bucky_alpha
    else:
        alphas = [self.params.bucky_alpha]

    ## load balancer
    lbview = ipyclient.load_balanced_view()

    ## submit each to be processed
    asyncs = []
    for alpha in alphas:
        pathname = os.path.join(outroot, "CF-a" + str(alpha))
        if os.path.exists(pathname) and (not force):
            print("BUCKy results already exist for this object at alpha={}\n"
                  .format(alpha) +
                  "use force=True to overwrite existing results")
        else:
            args = [
                alpha,
                self.params.bucky_nchains,
                self.params.bucky_nreps,
                self.params.bucky_niter,
                pathname,
                infiles,
            ]
            async = lbview.apply(_call_bucky, *args)
            asyncs.append(async)

    ## track progress
    start = time.time()
    printstr = "[bucky] infer CF posteriors | {} | "
    while 1:
        ready = [i.ready() for i in asyncs]
        elapsed = datetime.timedelta(seconds=int(time.time() - start))
        if not quiet:
            progressbar(len(ready), sum(ready), printstr.format(elapsed), spacer="")
        if len(ready) == sum(ready):
            if not quiet:
                print("")
            break
        else:
            time.sleep(0.1)

    ## check success
    for async in asyncs:
        if not async.successful():
            raise IPyradWarningExit(async.result())
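## Hedged sketch of the full workflow implied by the three run_* methods above
## (the object name `bb` and an active ipcluster are assumptions):
##
##     import ipyparallel as ipp
##     ipyclient = ipp.Client()
##     bb.run_mrbayes(ipyclient)   ## gene-tree posteriors per nexus file
##     bb.run_mbsum(ipyclient)     ## sum the two mrbayes replicates per locus
##     bb.run_bucky(ipyclient)     ## infer CF posteriors across alpha values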
def _submit_jobs(self, force, ipyclient, name_fields, name_separator, dry_run):
    """
    Download the accessions into the designated workdir. If a file already
    exists it will only be overwritten if force=True. Temporary files are
    removed.
    """
    ## get Run data with default fields (1,4,6,30)
    df = self.fetch_runinfo(range(31), quiet=True)
    sys.stdout.flush()

    ## if not ipyclient then use multiprocessing
    if ipyclient:
        lb = ipyclient.load_balanced_view()

    ## if Run has samples with same name (replicates) then
    ## we need to include the accessions in the file names
    if name_fields:

        ## indexing requires -1 ints
        fields = [int(i) - 1 for i in fields_checker(name_fields)]

        ## make accession names, no spaces allowed
        df['Accession'] = pd.Series(df[df.columns[fields[0]]], index=df.index)
        for field in fields[1:]:
            df.Accession += name_separator + df[df.columns[field]]
        df.Accession = [i.replace(" ", "_") for i in df.Accession]

        ## check that names are unique
        if not df.Accession.shape[0] == df.Accession.unique().shape[0]:
            raise IPyradWarningExit("names are not unique:\n{}"
                                    .format(df.Accession))

    ## backup default naming scheme
    else:
        if len(set(df.SampleName)) != len(df.SampleName):
            accs = (i + "-" + j for i, j in zip(df.SampleName, df.Run))
            df.Accession = accs
        else:
            df.Accession = df.SampleName

    if dry_run:
        print("\rThe following files will be written to: {}".format(self.workdir))
        print("{}\n".format(df.Accession))
    else:
        ## iterate over and download
        asyncs = []
        for idx in df.index:

            ## get args for this run
            srr = df.Run[idx]
            outname = df.Accession[idx]
            paired = df.spots_with_mates.values.astype(int).nonzero()[0].any()
            fpath = os.path.join(self.workdir, outname + ".fastq.gz")

            ## skip if exists and not force
            skip = False
            if force:
                if os.path.exists(fpath):
                    os.remove(fpath)
            else:
                if os.path.exists(fpath):
                    skip = True
                    sys.stdout.flush()
                    print("[skip] file already exists: {}".format(fpath))

            ## single job progress bar
            tidx = df.Accession.shape[0]
            #if not ipyclient:

            ## submit job to run
            if not skip:
                args = (self, srr, outname, paired)
                if ipyclient:
                    async = lb.apply_async(call_fastq_dump_on_SRRs, *args)
                    asyncs.append(async)
                else:
                    print("Downloading file {}/{}: {}".format(idx + 1, tidx, fpath))
                    call_fastq_dump_on_SRRs(*args)
                    sys.stdout.flush()

        ## progress bar while blocking parallel
        if ipyclient:
            tots = df.Accession.shape[0]
            printstr = " Downloading fastq files | {} | "
            start = time.time()
            while 1:
                elapsed = datetime.timedelta(seconds=int(time.time() - start))
                ready = sum([i.ready() for i in asyncs])
                progressbar(tots, ready, printstr.format(elapsed), spacer="")
                time.sleep(0.1)
                if tots == ready:
                    print("")
                    break
            self._report(tots)

            ## check for fails
            for async in asyncs:
                if not async.successful():
                    raise IPyradWarningExit(async.result())
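## Illustration of the name_fields logic above (values are hypothetical): with
## name_fields=(30, 1) and name_separator="_", the fields are converted to
## 0-based column indices [29, 0], so each Accession becomes
## "<column30-value>_<column1-value>" with any spaces replaced by underscores.
## Duplicate Accession names raise an error, since output file names must be
## unique.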
def _prun(self, force=False, ipyclient=None):
    """
    Download the accessions into the designated workdir. If a file already
    exists it will only be overwritten if force=True. Temporary files are
    removed.
    """
    ## ensure output directory
    if not os.path.exists(self.workdir):
        os.makedirs(self.workdir)

    ## TODO: parallelize with ipyclient...
    lbview = ipyclient.load_balanced_view()

    ## wrap in a try statement to shutdown on interrupt
    try:
        ## download files
        if self.is_project:
            ## get Run data
            srrs, accs = self.fetch_runinfo()

            ## if Run has samples with same name (replicates) then
            ## we need to include the accessions in the file names
            if len(set(accs)) != len(accs):
                accs = (i + "-" + j for i, j in zip(accs, srrs))

            ## iterate over and download
            skipped = 0
            asyncs = []
            start = time.time()
            for srr, acc in zip(srrs, accs):

                ## clean up acc if it is not nicely formatted
                ## i.e., do not allow spaces, ...
                acc = acc.replace(" ", "_")\
                         .replace('"', "")\
                         .replace("'", "")

                ## print filename
                fpath = os.path.join(self.workdir, acc + ".fastq.gz")
                self._accession = srr

                ## skip if exists and not force
                skip = False
                if force:
                    if os.path.exists(fpath):
                        os.remove(fpath)
                else:
                    if os.path.exists(fpath):
                        skip = True
                        skipped += 1
                if not skip:
                    async = lbview.apply(_call_fastq_dump_on_SRRs, *(self, acc))
                    asyncs.append(async)

            if skipped:
                print("\nSkipping {} samples already present in workdir"
                      .format(skipped))

            tots = len(srrs)
            printstr = " Downloading fastq files | {} | "
            while 1:
                elapsed = datetime.timedelta(seconds=int(time.time() - start))
                ready = sum([i.ready() for i in asyncs])
                progressbar(tots, ready, printstr.format(elapsed), spacer="")
                time.sleep(0.1)
                if tots == ready:
                    print("")
                    break
            self._report(tots)

            ## check for fails
            for async in asyncs:
                if not async.successful():
                    raise IPyradWarningExit(async.result())

        else:
def inference(data, ipyclient, bidx):
    """ run inference and store results """

    ## a distributor of chunks
    njobs = sum(1 for _ in iter(xrange(data.svd.checkpoint_arr,
                                       data.svd.nquarts, data.svd.chunk)))
    jobiter = iter(xrange(data.svd.checkpoint_arr,
                          data.svd.nquarts, data.svd.chunk))
    #LOGGER.info("chunksize: %s, start: %s, total: %s, njobs: %s", \
    #    data.svd.chunk, data.svd.checkpoint_arr, data.svd.nquarts, njobs)

    ## make a distributor for engines
    lbview = ipyclient.load_balanced_view()
    #LOGGER.info("sending jobs to %s Engines", len(ipyclient))

    ## open a view to the super h5 array
    with h5py.File(data.svd.h5out, 'w') as out5:
        out5.create_dataset("quartets", (data.svd.nquarts, 4),
                            dtype=np.uint16, chunks=(data.svd.chunk, 4))
        out5.create_dataset("weights", (data.svd.nquarts,),
                            dtype=np.float16, chunks=(data.svd.chunk,))

    ## submit initial n jobs
    assert len(ipyclient) > 0, "No ipyparallel Engines found"
    res = {}
    for i in range(len(ipyclient)):
        try:
            res[i] = lbview.apply(worker, [data, jobiter.next()])
        except StopIteration:
            continue

    ## iterate over remaining jobs
    keys = res.keys()
    finished = 0

    while res.keys():
        time.sleep(1)
        if not bidx:
            progressbar(njobs, finished)
        for key in keys:
            try:
                ## query for finished results
                result = res[key].get(0)

                ## put it into the super array
                insert_to_array(data, result)

                ## delete result, update checkpoint
                del res[key]
                finished += 1

                ## update the minimum quartets finished/filled.
                with h5py.File(data.svd.h5out, 'r') as tmp5:
                    ww = tmp5["weights"][:]
                    try:
                        data.svd.checkpoint_arr = np.where(ww == 0)[0].min()
                        #LOGGER.info("arr saved at %s", data.svd.checkpoint_arr)
                    except (ValueError, AttributeError):
                        ## array is full (no zeros)
                        pass

                ## submit new jobs
                try:
                    res[key] = lbview.apply(worker, [data, jobiter.next()])
                    #LOGGER.info("new job added to Engine %s", key)
                except StopIteration:
                    continue

            except (ipp.error.TimeoutError, KeyError):
                continue

    if not bidx:
        progressbar(njobs, finished)
        print("")

    ## convert to txt file for wQMC
    dump(data)

    ## run quartet joining algorithm
    if not bidx:
        run_qmc(data, boot=0)
    else:
        run_qmc(data, boot=1)

    ## reset the checkpoint_arr
    data.svd.checkpoint_arr = 0
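## Worked example of the job chunking above (numbers are hypothetical): jobs
## cover xrange(checkpoint_arr, nquarts, chunk), so with checkpoint_arr=0,
## nquarts=10000 and chunk=500 there are 10000/500 = 20 chunk jobs, while a
## resumed run with checkpoint_arr=2500 submits only the remaining 15.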
def run(data, nboots, method, nquarts, force, ipyclient):
    """
    Run svd4tet inference on a sequence or SNP alignment for all samples in
    the Assembly. By default the job starts from 0 or where it last left off,
    unless force=True, then it starts from 0.
    """
    ## load svd attributes if they exist
    fresh = 0
    if not force:
        try:
            if data.svd.checkpoint_boot or data.svd.checkpoint_arr:
                print(" loading from svd checkpoint")
                print(" array checkpoint: {}".format(data.svd.checkpoint_arr))
                print(" boots checkpoint: {}".format(data.svd.checkpoint_boot))
                print(" sampling method: {}".format(data.svd.method))
                ## require method to be same as loaded type
                assert method == data.svd.method, (
                    "loaded object method={}; cannot change methods midstream. "
                    "Use the force argument to start a new run with a new method."
                    .format(data.svd.method))
            else:
                fresh = 1
        except (AttributeError, IOError):
            fresh = 1

    ## if svd results do not exist or force then restart
    if force or fresh:
        ## make an analysis directory if it doesn't exist
        data.dirs.svd = os.path.realpath(
            os.path.join(data.dirs.project, data.name + "_analysis_svd"))
        if not os.path.exists(data.dirs.svd):
            try:
                os.mkdir(data.dirs.svd)
            except OSError:
                ## if not there then create new svd directory
                data.dirs.svd = os.path.join(
                    os.path.curdir, data.name + "_analysis_svd")
                os.mkdir(data.dirs.svd)
        print(" output directory created at: {}".format(data.dirs.svd))

        ## init new svd ObjDict
        data = svd_obj_init(data, method)

        ## get the real seq array into hdf5 h5in
        data = get_seqarray(data, boot=False)

        ## make quartet arrays into hdf5. Allow subsetting samples eventually.
        ## and optimize chunk value given remaining quartets and ipyclient
        if method == "equal":
            ## print equal header
            print(" loading {} random quartet samples for starting tree inference"
                  .format(nquarts))

            ## grab test number for starting tree
            data = get_quartets(data, method, nquarts, ipyclient)
            print(" inferring {} x 3 quartet trees for starting tree"
                  .format(nquarts))

            ## infer starting tree
            inference(data, ipyclient, bidx=0)

            ## sample quartets from starting tree
            print(" loading {} equal-splits quartets from starting tree"
                  .format(nquarts))
            data = equal_splits(data, nquarts, ipyclient)

            ## remove starting tree tmp files
            tmps = [data.svd.tre, data.svd.wtre, data.svd.tboots,
                    data.svd.wboots, data.svd.btre, data.svd.bwtre]
            for tmp in tmps:
                try:
                    os.remove(tmp)
                except OSError:
                    continue

        ## will sample all or random set of quartets
        else:
            if method == "random":
                print(" loading {} random quartet samples".format(nquarts))
            else:
                nquarts = n_choose_k(len(data.samples), 4)
                print(" loading all {} possible quartets".format(nquarts))
            data = get_quartets(data, method, nquarts, ipyclient)

    ## run the full inference
    if not data.svd.checkpoint_boot:
        print(" inferring {} x 3 quartet trees".format(nquarts))
        inference(data, ipyclient, bidx=0)
    else:
        print(" full inference finished")
        progressbar(20, 20)

    ## run the bootstrap replicates
    if nboots:
        print(" running {} bootstrap replicates".format(nboots))

        ## get current boot
        for bidx in range(data.svd.checkpoint_boot, nboots):
            if data.svd.checkpoint_arr == 0:
                data = get_seqarray(data, boot=True)
                #LOGGER.info("  new boot array sampled")
                data.svd.checkpoint_boot = bidx
            ## start boot inference
            progressbar(nboots, bidx)
            inference(data, ipyclient, bidx=True)
        progressbar(20, 20)

        ## write outputs with bootstraps
        write_outputs(data, with_boots=1)

    else:
        ## write outputs without bootstraps
        write_outputs(data, with_boots=0)
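## Worked example for the "all" sampling method above: n_choose_k(n, 4) is the
## binomial coefficient (assuming the helper computes C(n, k)), so an Assembly
## with 13 samples yields C(13, 4) = 715 possible quartets, each of which is
## then inferred with its 3 alternative resolutions.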