def evaluate(self):
        dtags = set(self.dataset.partition_datasets("test").keys()) | \
            set(self.dataset.partition_datasets("train").keys())
        truncated_datasets = self.dataset.sample_loader.truncated_datasets
        res = max([
            d.data.summary.high_res
            for dtag, d in self.dataset.datasets.items()
        ])
        print(res)
        sample_loaders = {
            dtag: lambda d: self.dataset.sample_loader.get_sample(res, d)
            for dtag in dtags
        }
        gc.collect()
        # joblib alternative:
        # results = jl.Parallel(n_jobs=int(self.cpus), verbose=10)(
        #     jl.delayed(self.evaluate_single)(
        #         sample_loaders[dtag], truncated_datasets[dtag],
        #         self.dataset.sample_loader.ref_map) for dtag in dtags)
        # Pin an iteration order so the gathered results align with dtags in the zips below
        dtags = list(dtags)
        with worker_client() as client:
            futures = client.map(
                self.evaluate_single,
                [sample_loaders[dtag] for dtag in dtags],
                [truncated_datasets[dtag] for dtag in dtags],
                [self.dataset.sample_loader.ref_map] * len(dtags))
            results = client.gather(futures)

        # self.statistical_maps = {dtag: res[0]
        #                          for dtag, res
        #                          in zip(dtags, results)}
        self.clusters = {dtag: res[1] for dtag, res in zip(dtags, results)}
        self.events = {dtag: res[2] for dtag, res in zip(dtags, results)}

        self.bdcs = {dtag: res[3] for dtag, res in zip(dtags, results)}

        return self
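
A minimal, self-contained sketch of the pattern above (worker_client plus client.map over per-dataset argument lists, then gather), assuming a local dask.distributed cluster; evaluate_single and the string arguments here are illustrative stand-ins, not part of the original code.

from dask.distributed import Client, worker_client

def evaluate_single(sample_loader, dataset, ref_map):
    # stand-in for the real per-dataset evaluation
    return (sample_loader, dataset, ref_map)

def evaluate_all(dtags):
    # worker_client() is only valid inside a task already running on a worker
    with worker_client() as client:
        futures = client.map(
            evaluate_single,
            ["loader-" + dtag for dtag in dtags],
            ["dataset-" + dtag for dtag in dtags],
            ["ref_map"] * len(dtags))
        return dict(zip(dtags, client.gather(futures)))

if __name__ == "__main__":
    with Client(processes=False) as client:
        print(client.submit(evaluate_all, ["a", "b", "c"]).result())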
Example #2
    def go(self, client=None, serial=False):

        

        for key in self._required_names:
            assert self._required_params[key] is not None, f"you have not set {key}"

        assert self.z > 0, f"z: {self.z} must be greater than zero"
        assert self.duration > 0, f"duration: {self.duration} must be greater than zero"

        logger.debug(f"created a GRB with name: {self.name}")
        logger.debug(f"created a GRB with ra: {self.ra} and dec: {self.dec}")
        logger.debug(f"created a GRB with redshift: {self.z}")
        logger.debug(
            f"created a GRB with duration: {self.duration} and T0: {self.T0}")

        if not serial:

            if client is not None:

                futures = client.map(process_lightcurve,
                                     self._lightcurves.values())

                results = client.gather(futures)

            else:

                with worker_client() as client:

                    futures = client.map(
                        process_lightcurve, self._lightcurves.values())

                    results = client.gather(futures)

            del futures

        else:

            results = [process_lightcurve(lc)
                       for lc in self._lightcurves.values()]

        for lc in results:

            #            lc = future.result()

            self._lightcurves[lc.name].set_storage(lc)
Example #3
    def inner(*args, **kwargs):
        if use_dask:
            # dask version
            with dd.worker_client() as client:
                for try_n in range(n_tries):
                    fut = client.submit(func, *args, **kwargs)
                    try:
                        return fut.result(timeout=retry_freq)
                    except dd.TimeoutError:
                        ...
        else:
            # non-dask version
            def this_func(q):
                args = q.get_nowait()
                kwargs = q.get_nowait()
                out = func(*args, **kwargs)
                q.put(out)

            for try_n in range(n_tries):
                q = queue.Queue()
                p = threading.Thread(target=this_func, args=(q, ))
                q.put_nowait(args)
                q.put_nowait(kwargs)
                p.start()
                p.join(timeout=retry_freq)
                if p.is_alive():
                    del p, q
                    continue
                elif q.qsize() == 0:
                    raise RuntimeError(
                        "Queue is empty. Something malfunctioned in ``func``"
                    )
                return q.get()
        raise dd.TimeoutError(
            "Func did not complete successfully in allowed time/number of retries."
        )
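
The non-dask branch above can be exercised on its own; below is a compact, runnable sketch of the same thread-plus-queue timeout/retry idea (the retry_with_timeout wrapper and its defaults are assumptions, not taken from the source).

import queue
import threading
import time

def retry_with_timeout(func, n_tries=3, retry_freq=1.0):
    """Run func in a worker thread, retrying if it does not finish within retry_freq seconds."""
    def this_func(q, args, kwargs):
        q.put(func(*args, **kwargs))

    def inner(*args, **kwargs):
        for _ in range(n_tries):
            q = queue.Queue()
            p = threading.Thread(target=this_func, args=(q, args, kwargs))
            p.start()
            p.join(timeout=retry_freq)
            if p.is_alive():
                # this attempt timed out; abandon the thread and try again
                continue
            if q.qsize() == 0:
                raise RuntimeError("Queue is empty. Something malfunctioned in ``func``")
            return q.get()
        raise TimeoutError("func did not complete in the allowed number of retries")
    return inner

quick = retry_with_timeout(lambda: time.sleep(0.1) or "done")
print(quick())  # -> done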
Example #4
    def __call__(self, datasets, ref_map, map_resolution, grid):
        """Create map from miller arrays. Transform map into the reference frame by sampling at the given points."""

        assert ref_map.is_sparse(), 'Reference map is not in sparse form'

        # ==============================>
        # Create holder for the output map objects
        # ==============================>
        sample = {}
        # ==============================>
        # Return empty list if no datasets
        # ==============================>
        if not datasets: return sample

        # ==============================>
        # Load maps in parallel
        # ==============================>
        print('Loading maps (using {!s} cores)'.format(self.cpus))
        arg_list = [
            MapLoader(dataset=d, grid=grid, reference_map=ref_map, verbose=self.verbose,
                      map_resolution=map_resolution, resolution_factor=self.resolution_factor,
                      density_scaling=self.density_scaling)
            for dtag, d in datasets.items()]
        res = arg_list[0].run()

        # Print a sort of progress bar
        print('1' + ''.join(['{:<5}'.format(i) for i in range(0, len(arg_list) + 5, 5)])[2:])
        print(' ' * len(arg_list) + '|\r', end='')
        sys.stdout.flush()
        gc.collect()
        if self.multiprocessing == "dask":
            with worker_client(timeout=120, separate_thread=False) as client:
                dataset_maps_futures = client.map(wrapper_run, arg_list)
                dataset_maps = client.gather(dataset_maps_futures)

            # results = []
            # for arg in arg_list:
            #     y = dask.delayed(wrapper_run)(arg)
            #     results.append(y)
            # dataset_maps = dask.compute(results)
            # print(dask.distributed.get_worker())
            #
            # client = dask.distributed.get_client()
            # map_futures = client.map(wrapper_run, arg_list)
            # dask.distributed.secede()
            # dataset_maps = client.gather(map_futures)
            # dask.distributed.rejoin()

        else:
            dataset_maps = jl.Parallel(n_jobs=self.cpus, verbose=5)(
                jl.delayed(wrapper_run)(arg) for arg in arg_list)
        # ==============================>
        # Collect the loaded maps, keyed by dataset tag
        # ==============================>
        print('|')
        sample = {m.meta.tag: m
                  for m
                  in dataset_maps}
        # ==============================>
        # Clear fft map data to save memory
        # ==============================>
        # for dtag, m in sample.items():
        #     # TODO: is this the best way of handling this now?
        #     map_dataset = datasets[m.meta.tag]
        #     map_dataset.data.fft_maps['truncated'] = None

        return sample
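
A hedged sketch of the dask-vs-joblib dispatch used above, with a trivial stand-in for the MapLoader-style argument objects; the worker_client branch is only valid when this runs inside a task on a dask worker, and the demo call below stays on the in-process joblib path.

import joblib as jl
from dask.distributed import worker_client

class EchoTask:
    # trivial stand-in for an argument object exposing a .run() method
    def __init__(self, value):
        self.value = value

    def run(self):
        return self.value

def wrapper_run(arg):
    return arg.run()

def run_all(arg_list, multiprocessing="joblib", cpus=2):
    if multiprocessing == "dask":
        # schedules the work on the same cluster as the worker executing this task
        with worker_client(timeout=120) as client:
            return client.gather(client.map(wrapper_run, arg_list))
    return jl.Parallel(n_jobs=cpus)(jl.delayed(wrapper_run)(arg) for arg in arg_list)

print(run_all([EchoTask(i) for i in range(4)], cpus=1))  # -> [0, 1, 2, 3]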
Example #5
File: fitting.py Project: sajetan/PyHDX
def fit_rates_weighted_average(hdxm,
                               bounds=None,
                               chisq_thd=20,
                               model_type='association',
                               client=None,
                               pbar=None):
    """
    Fit a model specified by 'model_type' to D-uptake kinetics. D-uptake is weight-averaged across peptides per
    timepoint to obtain residue-level D-uptake.

    Parameters
    ----------
    hdxm : :class:`~pyhdx.models.HDXMeasurement`
    bounds : :obj:`tuple`, optional
        Tuple of lower and upper bounds of rate constants in the model used.
    chisq_thd : :obj:`float`
        Threshold of chi squared result, values above will trigger a second round of fitting using DifferentialEvolution
    model_type : :obj:`str`
        Type of kinetics model to fit (default: 'association').
    client : :class:`~distributed.Client`, :obj:`str` or :obj:`None`
        Controls delegation of fitting tasks to a Dask cluster (see the usage sketch after this function).
        Options are: `None`: do not use Dask, fitting is done in the local thread in a for loop;
        a Dask ``Client``: the supplied client is used to schedule the fitting tasks;
        `'worker_client'`: the function is running inside a Dask task and the additional fitting
        tasks are scheduled on the same cluster.
    pbar:
        Not implemented

    Returns
    -------

    fit_result : :class:`~pyhdx.fitting.KineticsFitResult`

    """
    d_list, intervals, models = _prepare_wt_avg_fit(hdxm,
                                                    model_type=model_type,
                                                    bounds=bounds)
    if pbar:
        raise NotImplementedError()
    else:
        inc = lambda: None

    results = []

    if client is None:
        for d, model in zip(d_list, models):
            result = fit_kinetics(hdxm.timepoints,
                                  d,
                                  model,
                                  chisq_thd=chisq_thd)
            results.append(result)
    else:
        iterables = [[hdxm.timepoints] * len(d_list), d_list, models]

        if isinstance(client, Client):
            futures = client.map(fit_kinetics, *iterables, chisq_thd=chisq_thd)
            results = client.gather(futures)
        elif client == 'worker_client':
            with worker_client() as client:
                futures = client.map(fit_kinetics,
                                     *iterables,
                                     chisq_thd=chisq_thd)
                results = client.gather(futures)

    fit_result = KineticsFitResult(hdxm, intervals, results, models)

    return fit_result
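
A hedged usage sketch of the three client modes described in the docstring above; hdxm is assumed to be an already-loaded HDXMeasurement, scheduler_address an existing Dask scheduler, and the import path is assumed to match the PyHDX project listed above.

from dask.distributed import Client
from pyhdx.fitting import fit_rates_weighted_average

def fit_all_modes(hdxm, scheduler_address):
    # 1) client=None: fitting runs serially in the local thread
    serial_result = fit_rates_weighted_average(hdxm, client=None)

    # 2) a Client object: fitting tasks are scheduled on the connected cluster
    client = Client(scheduler_address)
    cluster_result = fit_rates_weighted_average(hdxm, client=client)

    # 3) 'worker_client': for when this call itself runs inside a Dask task,
    #    so the additional fitting tasks land on the same cluster
    future = client.submit(fit_rates_weighted_average, hdxm, client='worker_client')
    worker_result = future.result()

    return serial_result, cluster_result, worker_result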
Example #6
def generate_neuroglancer_multires_mesh(output_path, num_workers, id, lods,
                                        original_ext, lod_0_box_size):
    """Dask delayed function to generate multiresolution mesh in neuroglancer
    mesh format using prewritten meshes at different levels of detail.

    This function generates the neuroglancer mesh for a single mesh, and
    parallelizes the mesh creation over `num_workers` by splitting the mesh in
    the x-direction into `num_workers` fragments, each of which is sent to a
    worker to be further subdivided.

    Args:
        output_path (`str`): Output path to write out the neuroglancer mesh
        num_workers (`int`): Number of workers for dask
        id (`int`): Mesh id
        lods (`list`): List of levels of detail
        original_ext (`str`): Original mesh file extension
        lod_0_box_size (`int`): Box size in lod 0 coordinates
    """

    with ExitStack() as stack:
        if num_workers > 1:
            # Entering a worker client context slows things down a lot, so only do it if we will actually parallelize
            client = stack.enter_context(worker_client())

        os.makedirs(f"{output_path}/multires", exist_ok=True)
        os.system(
            f"rm -rf {output_path}/multires/{id} {output_path}/multires/{id}.index"
        )

        results = []
        for idx, current_lod in enumerate(lods):
            if current_lod == 0:
                mesh_path = f"{output_path}/mesh_lods/s{current_lod}/{id}{original_ext}"
            else:
                mesh_path = f"{output_path}/mesh_lods/s{current_lod}/{id}.ply"

            vertices, _ = mesh_util.mesh_loader(mesh_path)

            if current_lod == 0:
                max_box_size = lod_0_box_size * 2**lods[-1]
                grid_origin = (vertices.min(axis=0) // max_box_size -
                               1) * max_box_size
            vertices -= grid_origin

            current_box_size = lod_0_box_size * 2**current_lod
            start_fragment = np.maximum(
                vertices.min(axis=0) // current_box_size - 1,
                np.array([0, 0, 0])).astype(int)
            end_fragment = (vertices.max(axis=0) // current_box_size +
                            1).astype(int)

            del vertices

            # Want to divide the mesh up into up to num_workers chunks. We do
            # that by first subdividing the largest dimension as much as
            # possible, followed by the next largest dimension and so on, so long
            # as we don't exceed num_workers slices. If we instead sliced each
            # dimension once before slicing any dimension twice etc., it would
            # increase the number of mesh slice operations we perform, which
            # seems slow. (A standalone sketch of this heuristic follows the
            # function.)

            max_number_of_chunks = (end_fragment - start_fragment)
            dimensions_sorted = np.argsort(-max_number_of_chunks)
            num_chunks = np.array([1, 1, 1])

            for _ in range(num_workers + 1):
                for d in dimensions_sorted:
                    if num_chunks[d] < max_number_of_chunks[d]:
                        num_chunks[d] += 1
                        if np.prod(num_chunks) > num_workers:
                            num_chunks[d] -= 1
                        break

            stride = np.ceil(1.0 * (end_fragment - start_fragment) /
                             num_chunks).astype(int)

            # Scattering here, unless broadcast=True, causes this issue:
            # https://github.com/dask/distributed/issues/4612. But that is
            # slow so we are currently electing to read the meshes each time
            # within generate_mesh_decomposition.
            # vertices_to_send = client.scatter(vertices, broadcast=True)
            # faces_to_send = client.scatter(faces, broadcast=True)

            decomposition_results = []
            for x in range(start_fragment[0], end_fragment[0], stride[0]):
                for y in range(start_fragment[1], end_fragment[1], stride[1]):
                    for z in range(start_fragment[2], end_fragment[2],
                                   stride[2]):
                        current_start_fragment = np.array([x, y, z])
                        current_end_fragment = current_start_fragment + stride
                        if num_workers == 1:
                            # then we aren't parallelizing again
                            decomposition_results.append(
                                generate_mesh_decomposition(
                                    mesh_path, lod_0_box_size, grid_origin,
                                    current_start_fragment,
                                    current_end_fragment, current_lod,
                                    num_chunks))
                        else:
                            results.append(
                                dask.delayed(generate_mesh_decomposition)(
                                    mesh_path, lod_0_box_size, grid_origin,
                                    current_start_fragment,
                                    current_end_fragment, current_lod,
                                    num_chunks))

            if num_workers > 1:
                client.rebalance()
                decomposition_results = dask.compute(*results)

            results = []

            # Remove empty slabs
            decomposition_results = [
                fragments for fragments in decomposition_results if fragments
            ]

            fragments = [
                fragment for fragments in decomposition_results
                for fragment in fragments
            ]

            del decomposition_results

            mesh_util.write_mesh_files(
                f"{output_path}/multires", f"{id}", grid_origin, fragments,
                current_lod, lods[:idx + 1],
                np.asarray([lod_0_box_size, lod_0_box_size, lod_0_box_size]))

            del fragments
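
A standalone sketch of the chunk-subdivision heuristic described in the comment above: grow the per-dimension chunk counts, largest dimension first, without letting the total number of chunks exceed num_workers. The input values are illustrative, not taken from the source.

import numpy as np

def compute_num_chunks(start_fragment, end_fragment, num_workers):
    max_number_of_chunks = end_fragment - start_fragment
    dimensions_sorted = np.argsort(-max_number_of_chunks)
    num_chunks = np.array([1, 1, 1])

    for _ in range(num_workers + 1):
        for d in dimensions_sorted:
            if num_chunks[d] < max_number_of_chunks[d]:
                num_chunks[d] += 1
                if np.prod(num_chunks) > num_workers:
                    num_chunks[d] -= 1
                break
    return num_chunks

# e.g. an 8 x 4 x 2 fragment grid split across 6 workers:
print(compute_num_chunks(np.array([0, 0, 0]), np.array([8, 4, 2]), num_workers=6))  # -> [6 1 1]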
Example #7
    def run(self,
            data: np.ndarray,
            weights: np.ndarray,
            new_data: np.ndarray,
            coalition_depth: int = 1,
            num_workers: int = 1) -> np.ndarray:
        """
        Generates KernelSHAP values for the points in ``new_data`` using the KernelSHAP algorithm.
        This version distributes the Shapley value calculation by splitting the data into
        ``num_workers`` chunks and submitting each chunk as a separate task to the Dask cluster
        that the calling worker belongs to.

        Parameters
        ----------
        data : numpy.array
            Matrix of training data samples (# samples x # features). Can be original data
            or summarized data (output of running kmeans on original data)

        weights : numpy.array
            weights for each data point. Typically returned by running kmeans on the original data.
            If original data is being passed, use 1/(num of data points) as the weights

        new_data : numpy.array
            data for which shapley values should be computed (# samples x # features)

        coalition_depth : int
            coalition depth. This parameter controls the number of coalitions considered
            during shapley value computation. E.g., if coalition_depth = 2 and the number of features
            is 4, then the number of coalitions considered = C(4,1) + C(4,2) = 10 (see the quick check
            after this method).

        num_workers : int
            number of chunks the input data is split into; each chunk is submitted as a separate
            task via the worker's Dask client

        Returns
        -------
        Matrix of shapley values for new_data points [new_data samples x num_features + 1]. The
        + 1 is because phi0 (no features present) is also returned
        """
        if (isinstance(data, np.ndarray) and isinstance(weights, np.ndarray)
                and isinstance(new_data, np.ndarray)):
            # assert len(X.shape) == 1 or len(X.shape) == 2, "Instance must have 1 or 2 dimensions!"
            # data = X.reshape((1, X.shape[0]))
            self.num_features = data.shape[1]
            coalitions = generate_coalitions(self.num_features,
                                             coalition_depth)
            # num_coalitions = len(self.coalitions)
            if len(new_data.shape) == 1:
                new_data = new_data.reshape(1, -1)

            fx = self.predictor(new_data)
            Ef = np.average(self.predictor(data), weights=weights)
            pi = self._generate_pi(coalitions)
            futures = []
            shap_vals = []

            # new_data_dask = da.from_array(new_data)
            # new_data_dask = new_data_dask.rechunk({0: 5, 1: None})
            # fx_dask = da.from_array(fx)
            # fx_data_dask = fx_dask.rechunk({0: 5})
            instance_chunks = np.array_split(new_data, num_workers, axis=0)
            fx_chunks = np.array_split(fx, num_workers, axis=0)
            num_splits = len(instance_chunks)
            # scatter common data across workers in advance
            # Ef_, data_, weights_, coalitions_, pi_ = client.scatter([Ef, data, weights, coalitions, pi], broadcast=True)
            with worker_client() as client:
                for i in range(0, num_splits):
                    print("Submitting:", i)
                    future = client.submit(self._shap, Ef, fx_chunks[i], data,
                                           weights, coalitions, pi,
                                           instance_chunks[i])
                    futures.append(future)

                # for future in as_completed(futures): This is sometimes more efficient but may result in out-of-order
                # computation of shap values, and hence requires implementation of re-ordering logic so the shap values
                # line up with the order of data instances.
                for future in futures:
                    result = future.result()
                    for s in result:
                        shap_vals.append(s)

            return np.array(shap_vals)
        else:
            raise ValueError('input variables must be np.ndarray')
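
A quick check of the coalition-count arithmetic in the docstring above, assuming the count is the sum of C(n, k) for k = 1..coalition_depth.

from math import comb

def n_coalitions(num_features, coalition_depth):
    return sum(comb(num_features, k) for k in range(1, coalition_depth + 1))

print(n_coalitions(4, 2))  # C(4,1) + C(4,2) = 4 + 6 = 10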
Example #8
    def calculate_statistical_maps(self,
                                   dataset_maps,
                                   uncertainties,
                                   map_data_size,
                                   mask_name=None,
                                   ignore_warnings=True,
                                   cpus=1):
        """Take the sampled maps and calculate statistics for each grid point across the datasets"""

        # Extract the maps to be used for averaging

        if len(dataset_maps) == 1:

            self._set_statistical_maps_from_array(template_map=self.mu,
                                                  map_array=numpy.zeros(
                                                      (map_data_size, 5)))

            return self.statistical_maps
        else:

            # Create statistics objects for each grid point
            if ignore_warnings:
                warnings.simplefilter('ignore', category=RuntimeWarning)

            # Extract the map uncertainties
            # uncertainties = [m.meta.map_uncertainty for m in dataset_maps]
            assert uncertainties.count(None) == 0, 'some maps do not have associated uncertainties'

            # Chunk the points into groups - a compromise between cpu time and memory usage - 1000 per cpu at 50
            # datasets (the arithmetic is worked through after this method)
            chunk_size = iceil(1000.0 * cpus * 50.0 / len(dataset_maps))
            chunk_idxs = [i for i in range(0, map_data_size, chunk_size)]
            num_chunks = len(chunk_idxs)

            # Second level of iteration - split the first chunk level between the cpus
            chunk_size_2 = iceil(1.0 * chunk_size / cpus)
            chunk_idxs_2 = [i for i in range(0, chunk_size, chunk_size_2)]
            num_chunks_2 = len(chunk_idxs_2)

            t1 = time.time()

            # Output array of the 5 statistics for each map point
            point_statistics = numpy.zeros((map_data_size, 5))

            tot = 0
            for i_chunk, chunk_start in enumerate(chunk_idxs):
                status_bar_2(n=i_chunk, n_max=num_chunks)

                # Argument list for multiprocessing
                arg_list = []

                # Loop through the secondary chunks and send for multi-core processing
                for i_chunk_2, chunk_start_2 in enumerate(chunk_idxs_2):

                    # Lower limit - always the beginning of the chunk
                    l1 = chunk_start + chunk_start_2
                    # Upper limit - full chunk size, limited by the larger chunk size, or by map size
                    l2 = min(chunk_start + chunk_start_2 + chunk_size_2,
                             chunk_start + chunk_size, map_data_size)

                    if l1 >= l2:
                        continue

                    # Extract map values from the maps
                    map_vals = [m.data[l1:l2] for m in dataset_maps]
                    # Want to iterate over grid points not datasets
                    map_vals = numpy.transpose(map_vals)
                    assert map_vals.shape[1] == len(dataset_maps)

                    # Create DensityStatistics object for analysis of the density variation
                    arg_list.append(
                        DensityStatistics(observations_array=map_vals,
                                          uncertainties=uncertainties))

                if not arg_list: continue

                # Calculate the statistics of the grid points
                # TODO: use joblib instead
                # tmp_point_statistics = easy_mp.pool_map(func=wrapper_run, args=arg_list, processes=cpus)
                # tmp_point_statistics = jl.Parallel(n_jobs=self.cpus)(jl.delayed(wrapper_run)(arg)
                #                                                      for arg
                #                                                      in arg_list)
                with worker_client() as client:
                    tmp_point_statistics_futures = client.map(
                        wrapper_run, arg_list)
                    tmp_point_statistics = client.gather(
                        tmp_point_statistics_futures)

                # Put values into the output array
                offset = 0
                for point_vals in tmp_point_statistics:
                    assert point_vals.shape[1] == 5
                    l1 = chunk_start + offset
                    l2 = l1 + point_vals.shape[0]
                    if not (point_statistics[l1:l2, :] == 0.0).all():
                        print('Overwriting data?!')
                        print(point_statistics[l1 - 10:l2 + 10, :])
                        assert point_statistics[l1:l2, :].shape == point_vals.shape, \
                            '{} != {}'.format(point_statistics[l1:l2, :].shape, point_vals.shape)
                    point_statistics[l1:l2, :] = point_vals
                    offset += point_vals.shape[0]
                tot += offset

            status_bar_2(n=num_chunks, n_max=num_chunks)

            # Check that we've calculated the right number of things
            assert tot == map_data_size, 'tot {}, map size {}'.format(
                tot, map_data_size)

            t2 = time.time()

        self._set_statistical_maps_from_array(template_map=self.mu,
                                              map_array=point_statistics,
                                              map_data_size=map_data_size)
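
A worked example of the two-level chunking arithmetic above ("1000 per cpu at 50 datasets"), using math.ceil in place of iceil; the numbers are illustrative.

import math

cpus, n_datasets, map_data_size = 4, 100, 100000
chunk_size = math.ceil(1000.0 * cpus * 50.0 / n_datasets)   # 2000 points per outer chunk
chunk_idxs = list(range(0, map_data_size, chunk_size))      # 50 outer chunks
chunk_size_2 = math.ceil(chunk_size / cpus)                 # 500 points per inner chunk, one per cpu
chunk_idxs_2 = list(range(0, chunk_size, chunk_size_2))     # 4 inner chunks
print(len(chunk_idxs), len(chunk_idxs_2))                   # -> 50 4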
Example #9
    def fit_sigma_uncertainty(self,
                              analysis_maps,
                              map_data_size,
                              masked_idxs=None,
                              mask_name=None,
                              q_cut=1.5,
                              cpus=1):
        """Calculate the uncertainty in each of the different maps"""

        print("\t### Fitting sigma_uncertainty!")

        if masked_idxs is None:
            masked_idxs = flex.size_t(range(0, map_data_size))
        else:
            assert max(masked_idxs) < map_data_size, 'masked_idxs out of range of map'
            masked_idxs = flex.size_t(masked_idxs)

        # Extract masked map values from the average map... and sort them
        comp_vals = self.mu.data.select(masked_idxs)

        arg_list = []

        # for i_m, m in enumerate(self.dataset_maps.mask(mask_name=mask_name)):
        for i_m, m in enumerate(analysis_maps):

            if m.meta.map_uncertainty is not None:
                arg_list.append(None)
                continue

            u = UncertaintyCalculator(query_values=m.data.select(masked_idxs),
                                      ref_values=comp_vals)
            arg_list.append(u)

        t1 = time.time()
        num_to_process = len(arg_list) - arg_list.count(None)
        print('1' + ''.join(
            ['{:<5}'.format(i) for i in range(0, num_to_process + 5, 5)])[2:])
        print(' ' * num_to_process + '|\r', end='')
        sys.stdout.flush()
        # TODO: use joblib instead
        # map_uncertainties = easy_mp.pool_map(func=wrapper_run, args=arg_list, processes=cpus, chunksize=1)
        # map_uncertainties = jl.Parallel(n_jobs=self.cpus,
        #                                 verbose=5)(jl.delayed(wrapper_run)(arg)
        #                                             for arg
        #                                             in arg_list)
        with worker_client() as client:
            map_uncertainties_futures = client.map(wrapper_run, arg_list)
            map_uncertainties = client.gather(map_uncertainties_futures)
        print('|')

        for i_m, m in enumerate(analysis_maps):

            map_unc = map_uncertainties[i_m]

            if m.meta.map_uncertainty is not None:
                assert map_unc is None
            else:
                # TODO: remove this print
                # print("Adding map uncertainty for {}".format(m.meta.tag))
                assert map_unc is not None
                m.meta.map_uncertainty = map_unc
                # TODO: Not sure why this is breaking - probably to do with futures print

        # return [m.meta.map_uncertainty for m in self.dataset_maps.mask(mask_name=mask_name)]
        return {m.meta.tag: m.meta.map_uncertainty for m in analysis_maps}