def evaluate(self):
    dtags = set(
        list(self.dataset.partition_datasets("test").keys())
        + list(self.dataset.partition_datasets("train").keys()))
    truncated_datasets = self.dataset.sample_loader.truncated_datasets
    res = max([d.data.summary.high_res
               for dtag, d in self.dataset.datasets.items()])
    print(res)

    sample_loaders = {
        dtag: lambda d: self.dataset.sample_loader.get_sample(res, d)
        for dtag in dtags}

    gc.collect()

    # joblib alternative:
    # results = jl.Parallel(n_jobs=int(self.cpus), verbose=10)(
    #     jl.delayed(self.evaluate_single)(sample_loaders[dtag],
    #                                      truncated_datasets[dtag],
    #                                      self.dataset.sample_loader.ref_map)
    #     for dtag in dtags)

    with worker_client() as client:
        # Map over aligned argument lists so evaluate_single receives its
        # three positional arguments, then gather the results.
        futures = client.map(
            self.evaluate_single,
            [sample_loaders[dtag] for dtag in dtags],
            [truncated_datasets[dtag] for dtag in dtags],
            [self.dataset.sample_loader.ref_map] * len(dtags))
        results = client.gather(futures)

    # self.statistical_maps = {dtag: res[0]
    #                          for dtag, res
    #                          in zip(dtags, results)}
    self.clusters = {dtag: res[1] for dtag, res in zip(dtags, results)}
    self.events = {dtag: res[2] for dtag, res in zip(dtags, results)}
    self.bdcs = {dtag: res[3] for dtag, res in zip(dtags, results)}

    return self
def go(self, client=None, serial=False):
    for key in self._required_names:
        assert self._required_params[key] is not None, f"you have not set {key}"

    assert self.z > 0, f"z: {self.z} must be greater than zero"
    assert self.duration > 0, f"duration: {self.duration} must be greater than zero"

    logger.debug(f"created a GRB with name: {self.name}")
    logger.debug(f"created a GRB with ra: {self.ra} and dec: {self.dec}")
    logger.debug(f"created a GRB with redshift: {self.z}")
    logger.debug(
        f"created a GRB with duration: {self.duration} and T0: {self.T0}")

    if not serial:
        if client is not None:
            futures = client.map(process_lightcurve, self._lightcurves.values())
            results = client.gather(futures)
        else:
            with worker_client() as client:
                futures = client.map(
                    process_lightcurve, self._lightcurves.values())
                results = client.gather(futures)
        del futures
    else:
        results = [process_lightcurve(lc) for lc in self._lightcurves.values()]

    for lc in results:
        # lc = future.result()
        self._lightcurves[lc.name].set_storage(lc)
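A brief usage sketch of the three scheduling paths above; grb stands for an instance of the class that defines go(), and the scheduler address is a placeholder, not part of the original source.

from dask.distributed import Client

grb.go(serial=True)                    # process each light curve in a local loop

client = Client("tcp://scheduler:8786")
grb.go(client=client)                  # map process_lightcurve over the cluster

# With no client and serial=False, go() assumes it is already running inside
# a Dask task and opens worker_client() to reuse the surrounding cluster.
grb.go()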
def inner(*args, **kwargs):
    if use_dask:
        # dask version
        with dd.worker_client() as client:
            for try_n in range(n_tries):
                fut = client.submit(func, *args, **kwargs)
                try:
                    return fut.result(timeout=retry_freq)
                except dd.TimeoutError:
                    ...
    else:
        # non-dask version
        def this_func(q):
            args = q.get_nowait()
            kwargs = q.get_nowait()
            out = func(*args, **kwargs)
            q.put(out)

        for try_n in range(n_tries):
            q = queue.Queue()
            p = threading.Thread(target=this_func, args=(q, ))
            q.put_nowait(args)
            q.put_nowait(kwargs)
            p.start()
            p.join(timeout=retry_freq)
            if p.is_alive():
                del p, q
                continue
            elif q.qsize() == 0:
                raise RuntimeError(
                    "Queue is empty. Something malfunctioned in ``func``"
                )
            return q.get()

    raise dd.TimeoutError(
        "Func did not complete successfully in allowed time/number of retries."
    )
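The inner closure above refers to func, use_dask, n_tries and retry_freq from an enclosing scope, which suggests a decorator factory along the following lines. This is a minimal sketch under that assumption; the names and defaults are illustrative, not the original implementation.

import functools
import queue
import threading

import dask.distributed as dd


def retry(n_tries=3, retry_freq=60.0, use_dask=True):
    """Hypothetical wrapper: retry ``func`` up to ``n_tries`` times,
    giving up on each attempt after ``retry_freq`` seconds."""

    def decorator(func):
        @functools.wraps(func)
        def inner(*args, **kwargs):
            ...  # body as in the ``inner`` function shown above
        return inner

    return decorator


@retry(n_tries=5, retry_freq=30.0, use_dask=False)
def flaky_io():
    ...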
def __call__(self, datasets, ref_map, map_resolution, grid):
    """Create map from miller arrays. Transform map into the reference
    frame by sampling at the given points."""

    assert ref_map.is_sparse(), 'Reference map is not in sparse form'

    # ==============================>
    # Create holder for the output map objects
    # ==============================>
    sample = {}
    # ==============================>
    # Return empty list if no datasets
    # ==============================>
    if not datasets:
        return sample
    # ==============================>
    # Load maps in parallel
    # ==============================>
    print('Loading maps (using {!s} cores)'.format(self.cpus))

    arg_list = [
        MapLoader(dataset=d,
                  grid=grid,
                  reference_map=ref_map,
                  verbose=self.verbose,
                  map_resolution=map_resolution,
                  resolution_factor=self.resolution_factor,
                  density_scaling=self.density_scaling)
        for dtag, d in datasets.items()]

    res = arg_list[0].run()

    # Print a sort of progress bar
    print('1' + ''.join(['{:<5}'.format(i)
                         for i in range(0, len(arg_list) + 5, 5)])[2:])
    print(' ' * len(arg_list) + '|\r', end='')
    sys.stdout.flush()

    gc.collect()
    if self.multiprocessing == "dask":
        with worker_client(timeout=120, separate_thread=False) as client:
            dataset_maps_futures = client.map(wrapper_run, arg_list)
            dataset_maps = client.gather(dataset_maps_futures)
        # results = []
        # for arg in arg_list:
        #     y = dask.delayed(wrapper_run)(arg)
        #     results.append(y)
        # dataset_maps = dask.compute(results)
        # print(dask.distributed.get_worker())
        #
        # client = dask.distributed.get_client()
        # map_futures = client.map(wrapper_run, arg_list)
        # dask.distributed.secede()
        # dataset_maps = client.gather(map_futures)
        # dask.distributed.rejoin()
    else:
        dataset_maps = jl.Parallel(n_jobs=self.cpus,
                                   verbose=5)(jl.delayed(wrapper_run)(arg)
                                              for arg in arg_list)
    # ==============================>
    # Managed
    # ==============================>
    print('|')
    sample = {m.meta.tag: m for m in dataset_maps}
    # ==============================>
    # Clear fft map data to save memory
    # ==============================>
    # for dtag, m in sample.items():
    #     # TODO: is this the best way of handling this now?
    #     map_dataset = datasets[m.meta.tag]
    #     map_dataset.data.fft_maps['truncated'] = None

    return sample
def fit_rates_weighted_average(hdxm, bounds=None, chisq_thd=20,
                               model_type='association', client=None,
                               pbar=None):
    """
    Fit a model specified by 'model_type' to D-uptake kinetics. D-uptake is
    weighted-averaged across peptides per timepoint to obtain residue-level
    D-uptake.

    Parameters
    ----------
    hdxm : :class:`~pyhdx.models.HDXMeasurement`
    bounds : :obj:`tuple`, optional
        Tuple of lower and upper bounds of rate constants in the model used.
    chisq_thd : :obj:`float`
        Threshold of chi squared result, values above will trigger a second
        round of fitting using DifferentialEvolution.
    model_type : :obj:`str`
        Type of model to fit to the D-uptake kinetics (default 'association').
    client : :class:`~distributed.Client` or :obj:`str`, optional
        Controls delegation of fitting tasks to Dask clusters. Options are:
        `None`: Do not use Dask, fitting is done in the local thread in a for
        loop. Dask ``Client``: Uses the supplied Dask client to schedule the
        fitting tasks. `'worker_client'`: The function was run by a Dask
        worker and the additional fitting tasks created are scheduled on the
        same cluster.
    pbar :
        Not implemented.

    Returns
    -------
    fit_result : :class:`~pyhdx.fitting.KineticsFitResult`
    """

    d_list, intervals, models = _prepare_wt_avg_fit(
        hdxm, model_type=model_type, bounds=bounds)
    if pbar:
        raise NotImplementedError()
    else:
        inc = lambda: None

    results = []

    if client is None:
        for d, model in zip(d_list, models):
            result = fit_kinetics(hdxm.timepoints, d, model, chisq_thd=chisq_thd)
            results.append(result)
    else:
        iterables = [[hdxm.timepoints] * len(d_list), d_list, models]
        if isinstance(client, Client):
            futures = client.map(fit_kinetics, *iterables, chisq_thd=chisq_thd)
            results = client.gather(futures)
        elif client == 'worker_client':
            with worker_client() as client:
                futures = client.map(fit_kinetics, *iterables,
                                     chisq_thd=chisq_thd)
                results = client.gather(futures)

    fit_result = KineticsFitResult(hdxm, intervals, results, models)

    return fit_result
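A hedged usage sketch of the three client modes described in the docstring above; hdxm is assumed to be an existing HDXMeasurement and the scheduler address is a placeholder.

from dask.distributed import Client

# serial fitting in the local thread
fit_result = fit_rates_weighted_average(hdxm)

# schedule the per-residue fits on an existing Dask cluster
client = Client("tcp://scheduler:8786")
fit_result = fit_rates_weighted_average(hdxm, client=client)

# when this call itself runs inside a Dask task, reuse the same cluster
fit_result = fit_rates_weighted_average(hdxm, client='worker_client')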
def generate_neuroglancer_multires_mesh(output_path, num_workers, id, lods,
                                        original_ext, lod_0_box_size):
    """Dask delayed function to generate multiresolution mesh in neuroglancer
    mesh format using prewritten meshes at different levels of detail.

    This function generates the neuroglancer mesh for a single mesh, and
    parallelizes the mesh creation over `num_workers` by splitting the mesh in
    the x-direction into `num_workers` fragments, each of which is sent to a
    worker to be further subdivided.

    Args:
        output_path (`str`): Output path to write out neuroglancer mesh
        num_workers (`int`): Number of workers for dask
        id (`int`): Mesh id
        lods (`list`): List of levels of detail
        original_ext (`str`): Original mesh file extension
        lod_0_box_size (`int`): Box size in lod 0 coordinates
    """

    with ExitStack() as stack:
        if num_workers > 1:
            # The worker client context really slows things down a lot, so
            # only enter it if we will actually parallelize
            client = stack.enter_context(worker_client())

        os.makedirs(f"{output_path}/multires", exist_ok=True)
        os.system(
            f"rm -rf {output_path}/multires/{id} {output_path}/multires/{id}.index"
        )

        results = []
        for idx, current_lod in enumerate(lods):
            if current_lod == 0:
                mesh_path = f"{output_path}/mesh_lods/s{current_lod}/{id}{original_ext}"
            else:
                mesh_path = f"{output_path}/mesh_lods/s{current_lod}/{id}.ply"

            vertices, _ = mesh_util.mesh_loader(mesh_path)

            if current_lod == 0:
                max_box_size = lod_0_box_size * 2**lods[-1]
                grid_origin = (vertices.min(axis=0) // max_box_size - 1) * max_box_size
            vertices -= grid_origin

            current_box_size = lod_0_box_size * 2**current_lod
            start_fragment = np.maximum(
                vertices.min(axis=0) // current_box_size - 1,
                np.array([0, 0, 0])).astype(int)
            end_fragment = (vertices.max(axis=0) // current_box_size + 1).astype(int)

            del vertices

            # Want to divide the mesh up into up to num_workers chunks. We do
            # that by first subdividing the largest dimension as much as
            # possible, followed by the next largest dimension etc so long
            # as we don't exceed num_workers slices. If we instead sliced each
            # dimension once, before slicing any dimension twice etc, it would
            # increase the number of mesh slice operations we perform, which
            # seems slow.
            max_number_of_chunks = (end_fragment - start_fragment)
            dimensions_sorted = np.argsort(-max_number_of_chunks)
            num_chunks = np.array([1, 1, 1])

            for _ in range(num_workers + 1):
                for d in dimensions_sorted:
                    if num_chunks[d] < max_number_of_chunks[d]:
                        num_chunks[d] += 1
                        if np.prod(num_chunks) > num_workers:
                            num_chunks[d] -= 1
                            break

            stride = np.ceil(
                1.0 * (end_fragment - start_fragment) / num_chunks).astype(int)

            # Scattering here, unless broadcast=True, causes this issue:
            # https://github.com/dask/distributed/issues/4612. But that is
            # slow so we are currently electing to read the meshes each time
            # within generate_mesh_decomposition.
            # vertices_to_send = client.scatter(vertices, broadcast=True)
            # faces_to_send = client.scatter(faces, broadcast=True)

            decomposition_results = []
            for x in range(start_fragment[0], end_fragment[0], stride[0]):
                for y in range(start_fragment[1], end_fragment[1], stride[1]):
                    for z in range(start_fragment[2], end_fragment[2], stride[2]):
                        current_start_fragment = np.array([x, y, z])
                        current_end_fragment = current_start_fragment + stride
                        if num_workers == 1:
                            # then we aren't parallelizing again
                            decomposition_results.append(
                                generate_mesh_decomposition(
                                    mesh_path, lod_0_box_size, grid_origin,
                                    current_start_fragment,
                                    current_end_fragment, current_lod,
                                    num_chunks))
                        else:
                            results.append(
                                dask.delayed(generate_mesh_decomposition)(
                                    mesh_path, lod_0_box_size, grid_origin,
                                    current_start_fragment,
                                    current_end_fragment, current_lod,
                                    num_chunks))

            if num_workers > 1:
                client.rebalance()
                decomposition_results = dask.compute(*results)
                results = []

            # Remove empty slabs
            decomposition_results = [
                fragments for fragments in decomposition_results if fragments
            ]

            fragments = [
                fragment for fragments in decomposition_results
                for fragment in fragments
            ]

            del decomposition_results

            mesh_util.write_mesh_files(
                f"{output_path}/multires", f"{id}", grid_origin, fragments,
                current_lod, lods[:idx + 1],
                np.asarray([lod_0_box_size, lod_0_box_size, lod_0_box_size]))

            del fragments
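Since the docstring describes this as a Dask delayed function, a hedged sketch of driving it from a client follows; the output path, mesh ids, and parameter values are illustrative only.

import dask
from dask.distributed import Client

client = Client()
tasks = [
    dask.delayed(generate_neuroglancer_multires_mesh)(
        "/path/to/output", num_workers=4, id=mesh_id,
        lods=[0, 1, 2], original_ext=".obj", lod_0_box_size=64)
    for mesh_id in (1, 2, 3)
]
dask.compute(*tasks)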
def run(self,
        data: np.ndarray,
        weights: np.ndarray,
        new_data: np.ndarray,
        coalition_depth: int = 1,
        num_workers: int = 1) -> np.ndarray:
    """
    Generates KernelSHAP values for data points in new_data using the
    KernelShap algorithm. This version of KernelShap distributes the Shapley
    value calculation over N cpus, N = min(max_cpus, number of cpus on your
    machine).

    Parameters
    ----------
    data : numpy.ndarray
        Matrix of training data samples (# samples x # features). Can be the
        original data or summarized data (output of running kmeans on the
        original data).
    weights : numpy.ndarray
        Weights for each data point, typically returned by running kmeans on
        the original data. If the original data is being passed, use
        1/(number of data points) as the weights.
    new_data : numpy.ndarray
        Data for which Shapley values should be computed (# samples x # features).
    coalition_depth : int
        Coalition depth. This parameter controls the number of coalitions
        considered during Shapley value computation. E.g., if
        coalition_depth = 2 and the number of features is 4, then the number
        of coalitions considered is C(4,1) + C(4,2) = 10.
    num_workers : int
        Number of chunks the input data is split into; each chunk is
        submitted as a separate task to the Dask cluster.

    Returns
    -------
    numpy.ndarray
        Matrix of Shapley values for new_data points
        [new_data samples x num_features + 1]. The + 1 is because phi0
        (no features present) is also returned.
    """
    if (isinstance(data, np.ndarray) and isinstance(weights, np.ndarray)
            and isinstance(new_data, np.ndarray)):
        # assert len(X.shape) == 1 or len(X.shape) == 2, "Instance must have 1 or 2 dimensions!"
        # data = X.reshape((1, X.shape[0]))
        self.num_features = data.shape[1]
        coalitions = generate_coalitions(self.num_features, coalition_depth)
        # num_coalitions = len(self.coalitions)

        if len(new_data.shape) == 1:
            new_data = new_data.reshape(1, -1)

        fx = self.predictor(new_data)
        Ef = np.average(self.predictor(data), weights=weights)
        pi = self._generate_pi(coalitions)

        futures = []
        shap_vals = []

        # new_data_dask = da.from_array(new_data)
        # new_data_dask = new_data_dask.rechunk({0: 5, 1: None})
        # fx_dask = da.from_array(fx)
        # fx_data_dask = fx_dask.rechunk({0: 5})

        instance_chunks = np.array_split(new_data, num_workers, axis=0)
        fx_chunks = np.array_split(fx, num_workers, axis=0)
        num_splits = len(instance_chunks)

        # scatter common data across workers in advance
        # Ef_, data_, weights_, coalitions_, pi_ = client.scatter(
        #     [Ef, data, weights, coalitions, pi], broadcast=True)
        with worker_client() as client:
            for i in range(0, num_splits):
                print("Submitting:", i)
                future = client.submit(self._shap, Ef, fx_chunks[i], data,
                                       weights, coalitions, pi,
                                       instance_chunks[i])
                futures.append(future)

            # for future in as_completed(futures): This is sometimes more
            # efficient but may result in out-of-order computation of shap
            # values, and hence requires implementation of re-ordering logic
            # so the shap values line up with the order of data instances.
            for future in futures:
                result = future.result()
                for s in result:
                    shap_vals.append(s)

        return np.array(shap_vals)
    else:
        raise ValueError('input variables must be np.ndarray')
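Because run() opens worker_client() internally, it has to execute inside a Dask task. A hedged sketch of driving it from a scheduler-side client follows; the class name, constructor arguments, and data variables are assumptions, not the original API, and only the submit-then-result call pattern is the point.

from dask.distributed import Client

client = Client()

# Placeholder class name for whatever object defines run() above.
explainer = DistributedKernelShap(predictor=model.predict)
future = client.submit(explainer.run, background_data, weights, new_data,
                       coalition_depth=1, num_workers=4)
shap_values = future.result()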
def calculate_statistical_maps(self,
                               dataset_maps,
                               uncertainties,
                               map_data_size,
                               mask_name=None,
                               ignore_warnings=True,
                               cpus=1):
    """Take the sampled maps and calculate statistics for each grid point
    across the datasets"""

    # Extract the maps to be used for averaging
    if len(dataset_maps) == 1:
        self._set_statistical_maps_from_array(
            template_map=self.mu,
            map_array=numpy.zeros((map_data_size, 5)))
        return self.statistical_maps
    else:
        # Create statistics objects for each grid point
        if ignore_warnings:
            warnings.simplefilter('ignore', category=RuntimeWarning)

        # Extract the map uncertainties
        # uncertainties = [m.meta.map_uncertainty for m in dataset_maps]
        assert uncertainties.count(None) == 0, \
            'some maps have not got associated uncertainties'

        # Chunk the points into groups - compromise between cpu time and
        # memory usage - 1000 per cpu at 50 datasets
        chunk_size = iceil(1000.0 * cpus * 50.0 / len(dataset_maps))
        chunk_idxs = [i for i in range(0, map_data_size, chunk_size)]
        num_chunks = len(chunk_idxs)

        # Second level of iteration - split the first chunk level between the cpus
        chunk_size_2 = iceil(1.0 * chunk_size / cpus)
        chunk_idxs_2 = [i for i in range(0, chunk_size, chunk_size_2)]
        num_chunks_2 = len(chunk_idxs_2)

        t1 = time.time()

        # Output array of the 5 statistics for each map point
        point_statistics = numpy.zeros((map_data_size, 5))

        tot = 0
        for i_chunk, chunk_start in enumerate(chunk_idxs):
            status_bar_2(n=i_chunk, n_max=num_chunks)

            # Argument list for multiprocessing
            arg_list = []

            # Loop through the secondary chunks and send for multi-core processing
            for i_chunk_2, chunk_start_2 in enumerate(chunk_idxs_2):
                # Lower limit - always the beginning of the chunk
                l1 = chunk_start + chunk_start_2
                # Upper limit - full chunk size, limited by the larger chunk
                # size, or by map size
                l2 = min(chunk_start + chunk_start_2 + chunk_size_2,
                         chunk_start + chunk_size,
                         map_data_size)
                if l1 >= l2:
                    continue
                # Extract map values from the maps
                map_vals = [m.data[l1:l2] for m in dataset_maps]
                # Want to iterate over grid points not datasets
                map_vals = numpy.transpose(map_vals)
                assert map_vals.shape[1] == len(dataset_maps)
                # Create DensityStatistics object for analysis of the density variation
                arg_list.append(
                    DensityStatistics(observations_array=map_vals,
                                      uncertainties=uncertainties))

            if not arg_list:
                continue

            # Calculate the statistics of the grid points
            # TODO: use joblib instead
            # tmp_point_statistics = easy_mp.pool_map(func=wrapper_run, args=arg_list, processes=cpus)
            # tmp_point_statistics = jl.Parallel(n_jobs=self.cpus)(jl.delayed(wrapper_run)(arg)
            #                                                      for arg in arg_list)
            with worker_client() as client:
                tmp_point_statistics_futures = client.map(wrapper_run, arg_list)
                tmp_point_statistics = client.gather(
                    tmp_point_statistics_futures)

            # Put values into the output array
            offset = 0
            for point_vals in tmp_point_statistics:
                assert point_vals.shape[1] == 5
                l1 = chunk_start + offset
                l2 = l1 + point_vals.shape[0]
                if not (point_statistics[l1:l2, :] == 0.0).all():
                    print('Overwriting data?!')
                    print(point_statistics[l1 - 10:l2 + 10, :])
                assert point_statistics[l1:l2, :].shape == point_vals.shape, \
                    '{} != {}'.format(point_statistics[l1:l2, :].shape,
                                      point_vals.shape)
                point_statistics[l1:l2, :] = point_vals
                offset += point_vals.shape[0]
            tot += offset

        status_bar_2(n=num_chunks, n_max=num_chunks)

        # Check that we've calculated the right number of things
        assert tot == map_data_size, 'tot {}, map size {}'.format(
            tot, map_data_size)

        t2 = time.time()

        self._set_statistical_maps_from_array(template_map=self.mu,
                                              map_array=point_statistics,
                                              map_data_size=map_data_size)
def fit_sigma_uncertainty(self,
                          analysis_maps,
                          map_data_size,
                          masked_idxs=None,
                          mask_name=None,
                          q_cut=1.5,
                          cpus=1):
    """Calculate the uncertainty in each of the different maps"""

    print("\t### Fitting sigma_uncertainty!")

    if masked_idxs is None:
        masked_idxs = flex.size_t(range(0, map_data_size))
    else:
        assert max(masked_idxs) < map_data_size, 'masked_idxs out of range of map'
        masked_idxs = flex.size_t(masked_idxs)

    # Extract masked map values from the average map... and sort them
    comp_vals = self.mu.data.select(masked_idxs)

    arg_list = []

    # for i_m, m in enumerate(self.dataset_maps.mask(mask_name=mask_name)):
    for i_m, m in enumerate(analysis_maps):
        if m.meta.map_uncertainty is not None:
            arg_list.append(None)
            continue

        u = UncertaintyCalculator(query_values=m.data.select(masked_idxs),
                                  ref_values=comp_vals)
        arg_list.append(u)

    t1 = time.time()
    num_to_process = len(arg_list) - arg_list.count(None)
    print('1' + ''.join(
        ['{:<5}'.format(i) for i in range(0, num_to_process + 5, 5)])[2:])
    print(' ' * num_to_process + '|\r', end='')
    sys.stdout.flush()

    # TODO: use joblib instead
    # map_uncertainties = easy_mp.pool_map(func=wrapper_run, args=arg_list, processes=cpus, chunksize=1)
    # map_uncertainties = jl.Parallel(n_jobs=self.cpus,
    #                                 verbose=5)(jl.delayed(wrapper_run)(arg)
    #                                            for arg in arg_list)
    with worker_client() as client:
        map_uncertainties_futures = client.map(wrapper_run, arg_list)
        map_uncertainties = client.gather(map_uncertainties_futures)

    print('|')

    for i_m, m in enumerate(analysis_maps):
        map_unc = map_uncertainties[i_m]
        if m.meta.map_uncertainty is not None:
            assert map_unc is None
        else:
            # TODO: remove this print
            # print("Adding map uncertainty for {}".format(m.meta.tag))
            assert map_unc is not None
            m.meta.map_uncertainty = map_unc

    # TODO: Not sure why this is breaking - probably to do with futures print
    # return [m.meta.map_uncertainty for m in self.dataset_maps.mask(mask_name=mask_name)]
    return {m.meta.tag: m.meta.map_uncertainty for m in analysis_maps}