import time

import dask
import dask.array as dsa
import dask.datasets
import numpy as np
import xarray as xr
from dask.distributed import Client, performance_report, wait

# `today` (a date string used in the report filenames) is assumed to be defined
# elsewhere in the original script.


def main():
    client = Client(n_workers=10, threads_per_worker=1)
    print(client)

    df = dask.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-31",
        # end="2000-12-31",
        partition_freq="1h",
        freq="60s",
    )
    df = df.persist()
    wait(df)
    iterations = 10

    with performance_report(filename=f"{today}-simple-scheduler.html"):
        simple = []
        # print('start simple: ', flush=True)
        for i in range(iterations):
            start = time.time()
            z = df.x + 1 + 2 - df.y
            z.sum().compute()
            stop = time.time()
            simple.append(stop - start)
        simple = np.array(simple)

    df2 = None
    with performance_report(filename=f"{today}-shuffle-scheduler.html"):
        shuffle_t = []
        # print('start shuffle: ', flush=True)
        for i in range(iterations):
            client.cancel(df2)
            start = time.time()
            # shuffle(df, "id", shuffle="tasks")
            df2 = df.set_index("id").persist()
            wait(df2)
            stop = time.time()
            shuffle_t.append(stop - start)
        shuffle_t = np.array(shuffle_t)

    with performance_report(filename=f"{today}-rand-access-scheduler.html"):
        rand_access = []
        for i in range(iterations):
            start = time.time()
            df2.head()
            stop = time.time()
            rand_access.append(stop - start)
        rand_access = np.array(rand_access)

    data = dsa.random.random((10000, 1000000), chunks=(1, 1000000))
    da = xr.DataArray(
        data,
        dims=['time', 'x'],
        coords={'day': ('time', np.arange(10000) % 100)},
    )
    clim = da.groupby('day').mean(dim='time')
    anom = da.groupby('day') - clim
    anom_mean = anom.mean(dim='time')

    with performance_report(filename=f"{today}-anom-mean-scheduler.html"):
        anom_mean_t = []
        for i in range(iterations):
            start = time.time()
            anom_mean.compute()
            stop = time.time()
            anom_mean_t.append(stop - start)
        anom_mean_t = np.array(anom_mean_t)

    return dict(
        simple=simple,
        shuffle=shuffle_t,
        rand_access=rand_access,
        anom_mean=anom_mean_t,
    )
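# --- Hedged usage sketch (not part of the original benchmark script) ---
# One way the timing dict returned by main() could be summarized; the helper
# name `report_timings` is hypothetical, only the dict keys come from main().
def report_timings(timings):
    for name, samples in timings.items():
        samples = np.asarray(samples)
        print(f"{name:>12}: mean={samples.mean():.3f}s  "
              f"std={samples.std():.3f}s  n={samples.size}")


# if __name__ == "__main__":
#     report_timings(main())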
import logging
import os
import signal
import sys

from dask.distributed import Client, LocalCluster, as_completed


class Fask:
    def __init__(self, **kwa):
        cfg = kwa.get('cfg')
        loglevel = dict(
            debug=logging.DEBUG,
            info=logging.INFO,
            warn=logging.WARN,
            error=logging.ERROR,
        ).get(
            cfg.get('loglevel'),
            logging.INFO,
        )

        self.reset()

        handler = logging.FileHandler(
            '%s/../log/fask.log' % os.path.dirname(os.path.realpath(__file__)))
        handler.setFormatter(
            logging.Formatter(
                fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

        self.logger = logging.getLogger('fask')
        self.logger.addHandler(handler)
        self.logger.setLevel(loglevel)

        self.cluster = LocalCluster(
            n_workers=cfg.get('processes'),
            processes=True,
            threads_per_worker=cfg.get('threads'),
            # make bokeh available outside of a docker container too
            # see: https://github.com/dask/distributed/issues/1875
            # well, that was a fun two hours (for moderate values of "fun" ;-)
            ip='',
        )
        self.client = Client(self.cluster)
        self.setup_signals(timeout=cfg.get('timeout'))

        self.logger.info('ready')

        # signalling fask to stop (SIGINT, SIGALRM) will raise SystemExit
        try:
            self.run()
        except SystemExit:
            self.logger.info('done')

    ###
    ### sysprog

    def log_status(self, caller='xxx'):
        self.logger.info(
            '[status at {caller}] F {futures} | dF {done_futures} | cF {cancelled_future} | !cdF {not_cancelled_and_done} | CS {cs} | RC {rc}'
            .format(
                caller=caller,
                futures=len(self.futures),
                done_futures=len(
                    list(filter(lambda f: f.done() == True, self.futures))),
                cancelled_future=len(
                    list(filter(lambda f: f.cancelled() == True, self.futures))),
                not_cancelled_and_done=len(
                    list(
                        filter(
                            lambda f: f.cancelled() == False and f.done() == True,
                            self.futures))),
                cs=self.calculations_submitted,
                rc=self.results_collected,
            ))

    def cleanup(self):
        self.logger.info('cleaning up')
        self.log_status('cleanup')
        self.collect_results(all=False)

        # Client.cancel()
        # This stops future tasks from being scheduled if they have not yet run
        # and deletes them if they have already run. After calling, this result
        # and all dependent results will no longer be accessible.
        self.logger.info('cancel all futures')

        # XXX
        # cancelling a future also marks it done
        self.client.cancel(self.futures)

    def setup_signals(self, **kwa):
        signal.signal(signal.SIGINT, self.handler_sigint)

        if kwa.get('timeout'):
            signal.signal(signal.SIGALRM, self.handler_sigalrm)
            signal.alarm(kwa.get('timeout'))

    def handler_sigint(self, signum, frame):
        self.logger.warning('exit because of SIGINT')
        self.cleanup()
        self.bailout()

    def handler_sigalrm(self, signum, frame):
        self.logger.warning('exit because of SIGALRM')
        self.cleanup()
        self.bailout()

    def bailout(self):
        self.logger.warning('bailing out')
        self.log_status('bailout')
        sys.exit(0)

    def reset(self):
        self.calculations_submitted = 0
        self.results_collected = 0
        self.results = []
        self.futures = []

    ###
    ### worker

    def run(self):
        """ submit all given calculations """
        self.reset()

        if not len(self.calculations()):
            raise LookupError('no calculations available')

        for ci, c in enumerate(self.calculations()):
            future = self.client.submit(c, pure=False)
            self.logger.debug('[{ci}] future {key} submitted'.format(
                ci=ci, key=future.key))
            self.futures.append(future)
            self.calculations_submitted += 1

        self.log_status('run')
        self.collect_results(all=True)

    def collect_results(self, **kwa):
        """ collect (and log) results as they become available (this will block) """
        if kwa.get('all'):
            self.logger.info('collect all results')
            futures = as_completed(self.futures)
        else:
            self.logger.info('collect already done results only')
            futures = filter(lambda f: f.done() == True, self.futures)

        # for xi, future in enumerate (as_completed (self.futures)):
        for xi, future in enumerate(futures):
            self.results_collected += 1

            result = future.result()
            key = future.key
            # future.cancel()

            self.logger.debug('[{xi}] future {key} yielded {result}'.format(
                xi=xi, key=key, result=result))

            self.results.append(dict(
                index=xi,
                result=result,
            ))

        self.log_status('collect_results')

    def calculations(self):
        """ overwrite this virtual method

        this is where your actual code goes

        OUT: a list of functions to run
        """
        raise NotImplementedError('virtual method calculations() not implemented')
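# --- Hedged usage sketch (not part of the original Fask module) ---
# Fask.calculations() is a virtual method, so callers are expected to subclass
# and return a list of callables for Client.submit(). This subclass and its
# config values are illustrative assumptions only; the config keys mirror the
# ones read in Fask.__init__ above.
import random
import time


class DemoFask(Fask):
    def calculations(self):
        def one_calculation():
            # stand-in workload: sleep a little and return a random number
            time.sleep(random.random())
            return random.random()

        return [one_calculation for _ in range(10)]


# __init__ submits the calculations and collects results immediately:
# DemoFask(cfg=dict(loglevel='info', processes=2, threads=1, timeout=30))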
def beta_parallel_disk_detection(dataset,
                                 probe,
                                 #rxmin=None, # these would allow selecting a sub section
                                 #rxmax=None,
                                 #rymin=None,
                                 #rymax=None,
                                 #qxmin=None,
                                 #qxmax=None,
                                 #qymin=None,
                                 #qymax=None,
                                 probe_type="FT",
                                 dask_client=None,
                                 dask_client_params: dict = None,
                                 restart_dask_client=True,
                                 close_dask_client=False,
                                 return_dask_client=True,
                                 *args, **kwargs):
    """
    This is not fully validated currently, so it may not work; please report bugs on the py4DSTEM GitHub page.

    This parallelises the disk detection for all probe positions. It can operate on either in-memory or out-of-memory datasets.

    It is assumed that, unless you specify otherwise, you are parallelising on a single local machine. If that is not the case, it is probably best to pass the dask_client into the function, although you can also just pass the required arguments to dask_client_params. If no dask_client arguments are passed, a dask_client for a local machine will be created.

    Note:
        Do not pass "peaks" as a kwarg, as you might in "_find_Bragg_disks_single_DP_FK"; the results will be unreliable and may cause the calculation to crash.

    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be a regular probe kernel or Fourier-transformed
        probe_type (str): "FT" or None
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to the dask client or dask cluster
        restart_dask_client (bool): if True, the function will attempt to restart the dask_client.
        close_dask_client (bool): if True, the function will attempt to close the dask_client.
        return_dask_client (bool): if True, the function will return the dask_client.
        *args, **kwargs: will be passed to "_find_Bragg_disks_single_DP_FK", e.g. corrPower, sigma, edgeBoundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlation intensities
        dask_client (optional) (distributed.client.Client): dask_client for use later.
    """
    #TODO add asserts about peaks not being passed
    # Dask Client stuff
    #TODO how to guess at default params for client, sqrt no. cores. Something to do with the size of the diffraction pattern
    # write a function which can do this.
    #TODO replace dask part with a with statement for easier clean up e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #     ... dask stuff.
    #TODO add assert statements and other checks. Think about reordering operations

    if dask_client == None:
        if dask_client_params != None:
            dask.config.set({'distributed.worker.memory.spill': False,
                             'distributed.worker.memory.target': False})
            cluster = LocalCluster(**dask_client_params)
            dask_client = Client(cluster, **dask_client_params)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK OPTIMAL VALUE?
            # psutil could be used to count cores.
            dask.config.set({'distributed.worker.memory.spill': False,  # stops spilling to disk
                             'distributed.worker.memory.target': False})  # stops spilling to disk and erroring out
            cluster = LocalCluster()
            dask_client = Client(cluster)
    else:
        assert type(dask_client) == distributed.client.Client
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client. Try manually restarting outside or passing "restart_dask_client=False"')  # WARNING STATEMENT
                return e
        else:
            pass

    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
        #TODO clean up and pull out redundant parts
        #if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        #DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT, chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA
    #TODO add another elif: if it is a dask array then pass
    if type(dataset.data) == np.ndarray:
        dask_data = da.from_array(dataset.data, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer != None:
        dask_data = da.from_array(dataset.stack_pointer, chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    else:
        print("Couldn't access the data")
        return None

    # Convert the data to delayed
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx, ry, qx, qy
    # I can pass the index values in here; I should trim the probe and diffraction pattern first

    # Into the meat of the function

    # create an empty list to which we will append the delayed functions
    res = []
    # loop over dataset_delayed and create a delayed function for each chunk
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(
            dataset_delayed[x],
            probe_kernel_FT=dask_probe_delayed[0, 0],
            #probe_kernel_FT=delayed_probe_kernel_FT,
            *args, **kwargs)  # passing through args from earlier, or should I use
            #corrPower=corrPower,
            #sigma=sigma_gaussianFilter,
            #edgeBoundary=edgeBoundary,
            #minRelativeIntensity=minRelativeIntensity,
            #minPeakSpacing=minPeakSpacing,
            #maxNumPeaks=maxNumPeaks,
            #subpixel='poly')
        res.append(temp)

    _temp_peaks = dask_client.compute(res, optimize_graph=True)  # creates futures and starts computing
    output = dask_client.gather(_temp_peaks)  # gather the future objects

    coords = [('qx', float), ('qy', float), ('intensity', float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    #temp_peaks[0][0]

    # operating over a list, so we need the size (0->count) and re-create the probe positions (0->rx, 0->ry)
    for (count, (rx, ry)) in zip([i for i in range(dataset.data[..., 0, 0].size)],
                                 np.ndindex(dataset.data.shape[:-2])):
        #peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        #peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks)  # removes from the dask workers
    del _temp_peaks  # deletes the object
    if close_dask_client:
        dask_client.close()
        return peaks
    elif close_dask_client == False and return_dask_client == True:
        return peaks, dask_client
    elif close_dask_client and return_dask_client == False:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
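# --- Hedged usage sketch (not part of py4DSTEM) ---
# Illustrative call only: `datacube` and `probe_kernel` are placeholders for a
# py4DSTEM DataCube and a probe kernel prepared elsewhere; extra keyword
# arguments would be forwarded to "_find_Bragg_disks_single_DP_FK".
# from dask.distributed import Client, LocalCluster
# cluster = LocalCluster(n_workers=4, threads_per_worker=1)
# client = Client(cluster)
# peaks, client = beta_parallel_disk_detection(
#     datacube,
#     probe_kernel,
#     probe_type="FT",
#     dask_client=client,
#     restart_dask_client=False,
#     close_dask_client=False,
#     return_dask_client=True,
# )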
from dask.datasets import timeseries
import time
from dask.dataframe.shuffle import shuffle
from dask.distributed import Client, wait

if __name__ == "__main__":
    client = Client("127.0.0.1:8786")
    ddf_h = timeseries(start='2000-01-01', end='2000-01-02', partition_freq='1min')

    for i in range(5):
        print("Iteration: ", i)
        result = shuffle(ddf_h, "id", shuffle="tasks")
        ddf = client.persist(result)
        _ = wait(ddf)
        client.cancel(ddf)
        client.cancel(result)

    client.shutdown()
    time.sleep(0.5)
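# --- Hedged variant (not part of the original script) ---
# The script above assumes a scheduler is already listening on 127.0.0.1:8786
# (the default dask-scheduler port), e.g. one started with the dask-scheduler
# and dask-worker CLIs. For a self-contained run, a LocalCluster could be used
# instead; this variant is an assumption, not the original setup.
# from dask.distributed import LocalCluster
# with LocalCluster(n_workers=4, threads_per_worker=1) as cluster, Client(cluster) as client:
#     ...  # same shuffle / persist / wait / cancel loop as above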
import pickle
from functools import reduce

import numpy as np
from scipy import sparse
from tqdm import tqdm
from dask.distributed import Client


def main(vocabSize, tokenFile, outputFileName):
    #### CREATING VOCAB
    wordCount = pickle.load(open('wordCount', 'rb'))
    vocab = wordCount.most_common(vocabSize)

    ## Creating the Word-ID dictionaries
    id_to_word = {i: x[0] for i, x in enumerate(vocab)}
    word_to_id = {value: key for key, value in id_to_word.items()}
    wordSet = set(word_to_id.keys())

    #### DASK PROCESS
    client = Client()
    print(client)

    def createCMatrix(corpus):
        windowSize = 10
        cooccurrences = sparse.lil_matrix((vocabSize, vocabSize), dtype=np.float64)
        for doc in corpus:
            for center_index, center_word in enumerate(doc):
                if center_word not in wordSet:
                    continue
                context = doc[max(0, center_index - windowSize):center_index]
                contextLen = len(context)
                for context_index, context_word in enumerate(context):
                    dist = contextLen - context_index
                    inc = 1.0 / float(dist)
                    if context_word in wordSet:
                        cooccurrences[word_to_id[center_word], word_to_id[context_word]] += inc
                        cooccurrences[word_to_id[context_word], word_to_id[center_word]] += inc
                        # center_id = word_to_id[center_word]
                        # context_id = word_to_id[context_word]
                        # if center_id < context_id:
                        #     cooccurrences[center_id, context_id] += inc
                        # else:
                        #     cooccurrences[context_id, center_id] += inc
        return cooccurrences

    def split(a, n):
        k, m = divmod(len(a), n)
        return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

    corpus = pickle.load(open(tokenFile, 'rb'))
    matrices = []
    print("Starting process")
    for sub in tqdm(split(corpus, 10)):
        a = client.map(createCMatrix, list(split(sub, 16)))
        b = client.gather(a)
        mat = reduce(lambda x, y: x + y, b)
        matrices.append(mat.copy())
        client.cancel(a)
        client.cancel(b)
        del a
        del b

    print("Creating Final Cooccurence matrix")
    finalMat = reduce(lambda x, y: x + y, matrices)
    with open(outputFileName, 'wb') as f:
        pickle.dump(finalMat, f)

    client.shutdown()
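# --- Hedged usage sketch (not part of the original script) ---
# One possible command-line entry point for main() above. The flag names and
# defaults are assumptions; only the (vocabSize, tokenFile, outputFileName)
# signature comes from the function itself.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Build a co-occurrence matrix over a tokenized corpus with dask")
    parser.add_argument("--vocab-size", type=int, default=50000)
    parser.add_argument("--token-file", default="tokens.pkl")
    parser.add_argument("--output", default="cooccurrence.pkl")
    args = parser.parse_args()

    main(args.vocab_size, args.token_file, args.output)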
import os

from dask.distributed import Client, Security, Worker


class Remote(object):
    """
    Remote.

    Args:
        address (str): Remote scheduler address formed by `ip:port`.
        tls_ca_file (str, optional): TLS CA certificate file path. Defaults to None.
        tls_client_cert (str, optional): TLS certificate file path. Defaults to None.
        tls_client_key (str, optional): TLS private key file path. Defaults to None.
        require_encryption (bool, optional): Encrypt data exchange. Defaults to False.

    Note:
        TLS will be enabled only if all three TLS arguments are provided.
        Remember to change network protocol to `tls://<address>`.
    """

    def __init__(self,
                 address: str,
                 tls_ca_file: str = None,
                 tls_client_cert: str = None,
                 tls_client_key: str = None,
                 require_encryption: bool = False):
        # authentication
        sec = None
        if tls_ca_file and tls_client_cert and tls_client_key:
            sec = Security(tls_ca_file=tls_ca_file,
                           tls_client_cert=tls_client_cert,
                           tls_client_key=tls_client_key,
                           require_encryption=require_encryption)

        # init
        self._client = Client(address=address, security=sec)
        self._client.register_worker_callbacks(Remote._worker_startup)

    @staticmethod
    def _worker_startup(dask_worker: Worker):
        os.chdir(dask_worker.local_dir)

    def add_dependencies(self, files):
        """
        Add list of dependencies, order matters.

        Args:
            files (list): List of dependent files.
        """
        # TODO: automatically resolve module dependencies
        if isinstance(files, str):
            files = [files]
        for f in files:
            self._client.upload_file(f)

    def scatter(self, *args, **kwargs):
        """
        Scatter data.
        """
        return self._client.scatter(*args, **kwargs)

    def submit(self, func, *args, **kwargs):
        """
        Submit function and data.

        Args:
            func (callable): User function.
        """
        return self._client.submit(func, *args, **kwargs)

    def fetch(self, futures_, **kwargs):
        """
        Fetch data of future objects.

        Args:
            futures_ (list): Future objects.
        """
        return self._client.gather(futures_, **kwargs)

    def cancel(self, futures_, **kwargs):
        """
        Cancel job of future objects.

        Args:
            futures_ (list): Future objects.
        """
        return self._client.cancel(futures_, **kwargs)

    def close(self, *args, **kwargs):
        """
        Close connection.
        """
        return self._client.close(*args, **kwargs)
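# --- Hedged usage sketch (not part of the original module) ---
# Minimal round trip through the Remote wrapper above; the scheduler address
# and the squaring function are illustrative only.
def _square(x):
    return x * x


# remote = Remote("127.0.0.1:8786")
# future = remote.submit(_square, 21)
# print(remote.fetch(future))   # -> 441
# remote.cancel(future)
# remote.close()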
def qPCR_performance(self,
                     deletions=0,
                     insertions=0,
                     substitutions=0,
                     fname='pyprimer_benchmark.feather',
                     csv_fname="pyprimer_summary.csv"):

    def generate_group_summary(group_df, group, col_list):
        v_stats = dict((key, []) for key in col_list)
        for fversion in group_df["F Primer Version"].unique():
            for rversion in group_df["R Primer Version"].unique():
                for pversion in group_df["P Probe Version"].unique():
                    mean_ppc = group_df.loc[
                        (group_df["F Primer Version"] == fversion) &
                        (group_df["R Primer Version"] == rversion) &
                        (group_df["P Probe Version"] == pversion), "PPC"].mean()
                    seqs_matched = len(group_df.loc[
                        (group_df["F Primer Version"] == fversion) &
                        (group_df["R Primer Version"] == rversion) &
                        (group_df["P Probe Version"] == pversion) &
                        (group_df["Amplicon Sense Length"] != 0), "Amplicon Sense Length"])
                    n_seqs = group_df.loc[
                        (group_df["F Primer Version"] == fversion) &
                        (group_df["R Primer Version"] == rversion) &
                        (group_df["P Probe Version"] == pversion), :].shape[0]
                    v_stats["Primer Group"].append(group)
                    v_stats["F Version"].append(fversion)
                    v_stats["P Version"].append(pversion)
                    v_stats["R Version"].append(rversion)
                    v_stats["Mean PPC"].append(mean_ppc)
                    try:
                        percent_matched = (seqs_matched / n_seqs) * 100
                    except:  # n_seqs may be 0
                        percent_matched = 0
                    v_stats["Sequences matched(%)"].append(percent_matched)
        group_stats = pd.DataFrame(v_stats)
        return group_stats

    def analyse(sequences_path, Fs, Rs, Ps, col_list, deletions, insertions, substitutions):
        res = []
        with open(sequences_path, "r", newline='') as csvfile:
            seqreader = csv.reader(csvfile, delimiter=',', quotechar='"')
            for sequences in seqreader:
                if sequences[0] == "Header":
                    pass
                else:
                    for f in Fs:
                        for r in Rs:
                            for p in Ps:
                                header = sequences[0]
                                f_name = f[2]
                                f_ver = f[5]
                                p_ver = p[5]
                                r_ver = r[5]
                                f_res = TOOLS.match_fuzzily(f_ver, sequences[1], deletions, insertions, substitutions)
                                r_res = TOOLS.match_fuzzily(r_ver, sequences[2], deletions, insertions, substitutions)
                                if (f_res == None) or (r_res == None):
                                    start = None
                                    end = None
                                    amplicon = ""
                                    amplicon_length = 0
                                    f_match = ""
                                    r_match = ""
                                    p_match = ""
                                    PPC = 0
                                else:
                                    Forwards = {}
                                    if type(f_res) == type(tuple()):
                                        Forwards[0] = (f_res[0], f_ver, 0)  # (start, match, distance)
                                    else:
                                        for f_i in range(len(f_res)):
                                            Forwards[f_i] = (f_res[f_i].start, f_res[f_i].matched, f_res[f_i].dist)
                                    Reverses = {}
                                    if type(r_res) == type(tuple()):
                                        Reverses[0] = (r_res[0], r_ver, 0)
                                    else:
                                        for r_i in range(len(r_res)):
                                            Reverses[r_i] = (r_res[r_i].start, r_res[r_i].matched, r_res[r_i].dist)
                                    matches = {}
                                    for k_f, v_f in Forwards.items():
                                        start = v_f[0]
                                        for k_r, v_r in Reverses.items():
                                            end = (len(sequences[1]) - 1) - v_r[0]
                                            if end < start:
                                                matches[f"{k_f}:{k_r}"] = False
                                            amplicon = sequences[1][start:end]
                                            if len(amplicon) > 850:
                                                matches[f"{k_f}:{k_r}"] = False
                                            else:
                                                p_res = TOOLS.match_fuzzily(p_ver, amplicon, deletions, insertions, substitutions)
                                                if p_res == None:
                                                    matches[f"{k_f}:{k_r}"] = False
                                                else:
                                                    matches[f"{k_f}:{k_r}"] = True
                                    target_dist = np.inf
                                    n_match = 0
                                    for k, v in matches.items():
                                        if v:
                                            n_match += 1
                                            klist = k.split(":")
                                            k_f = int(klist[0])
                                            k_r = int(klist[1])
                                            f_good = Forwards[k_f]
                                            r_good = Reverses[k_r]
                                            mean_dist = (f_good[2] + r_good[2] + 1e-6) / 2  # 1e-6 for smoothing
                                            if mean_dist < target_dist:
                                                target_dist = mean_dist
                                                start = f_good[0]
                                                f_match = f_good[1]
                                                end = (len(sequences[1]) - 1) - r_good[0]
                                                r_match = r_good[1]
                                                amplicon = sequences[1][start:end]
                                                amplicon_length = len(amplicon)
                                                if amplicon_length > 850:
                                                    n_match -= 1
                                                    start = None
                                                    end = None
                                                    amplicon = ""
                                                    amplicon_length = 0
                                                    f_match = ""
                                                    r_match = ""
                                                    PPC = 0
                                    if n_match <= 0:
                                        start = None
                                        end = None
                                        amplicon = ""
                                        amplicon_length = 0
                                        f_match = ""
                                        r_match = ""
                                        PPC = 0
                                    else:
                                        PPC = TOOLS.calculate_PPC(F_primer=f_ver,
                                                                  F_match=f_match,
                                                                  R_primer=r_ver,
                                                                  R_match=r_match)
                                res.append([f_name, f_ver, p_ver, r_ver, header, amplicon,
                                            amplicon_length, start, end, PPC])
        res_df = pd.DataFrame(res, columns=col_list)
        del res
        return res_df

    self.fname = fname
    self.csv_fname = csv_fname
    self.deletions = deletions
    self.insertions = insertions
    self.substitutions = substitutions

    unique_groups = self.primers["ID"].unique()
    summary = pd.DataFrame(columns=self.SUMMARY_qPCR_COL_LIST)
    os.makedirs(self.savedir, exist_ok=True)

    print("Running Benchmark")
    cluster = LocalCluster(n_workers=self.nCores, threads_per_worker=4, silence_logs=logging.ERROR)
    client = Client(cluster, timeout=120)

    for group in tqdm(unique_groups):
        def help_analyse(x):
            return analyse(x, Fs, Rs, Ps, self.BENCHMARK_qPCR_COL_LIST,
                           self.deletions, self.insertions, self.substitutions)

        Fs = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "F"), :].values
        Rs = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "R"), :].values
        Ps = self.primers.loc[(self.primers["ID"] == group) & (self.primers["Type"] == "P"), :].values

        print(f"Processing group {group}\n")
        futures = client.map(help_analyse, self.chunkpaths)
        progress(futures)
        result_chunks = client.gather(futures)
        group_df = pd.concat(result_chunks)
        group_df.reset_index(drop=True, inplace=True)

        print("\nPerformance computed, generating group summary\n")
        group_stats = generate_group_summary(group_df, group, self.SUMMARY_qPCR_COL_LIST)
        # note: DataFrame.append was removed in pandas 2.0; pd.concat([summary, group_stats]) is the modern equivalent
        summary = summary.append(group_stats)

        client.cancel(futures)
        del group_stats
        del result_chunks

        print("Summary generated, saving group benchmark to Feather\n")
        group_path = os.path.join(self.tmpdir, f"{group}_" + self.fname)
        group_df.to_feather(group_path, compression="uncompressed")
        print(f"Benchmark results saved to {group_path}\n")
        del group_df

    summary.to_csv(os.path.join(self.savedir, self.csv_fname), index=False)
    print(f"Benchmark summary saved to {os.path.join(self.savedir, self.csv_fname)}\n")
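# --- Hedged usage note (not part of pyprimer) ---
# qPCR_performance() relies on instance state set up elsewhere in the class:
# self.primers (a DataFrame with "ID" and "Type" columns), self.chunkpaths
# (paths to sequence CSV chunks mapped over the cluster), self.nCores,
# self.tmpdir, self.savedir, and the BENCHMARK/SUMMARY column lists.
# A call would then look like:
# benchmark.qPCR_performance(deletions=1, insertions=0, substitutions=1)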