def par_tsne(self, param_list, store_res=True, nprocs=1):
    """
    Run t-SNE on several parameter sets in parallel.

    Parameters
    ----------
    param_list: list of dict
        Keyword arguments of the individual t-SNE runs. Each dict is
        unpacked into one call of `self.tsne`.
    store_res: bool
        If true, every result is saved with `self.put_tsne` once all runs
        have returned, keyed by `str()` of its parameter dict.
    nprocs: int
        Number of processes. Capped at `len(param_list)`.

    Returns
    -------
    tsne_res_list: list of float arrays
        t-SNE results, in the order of `param_list`.

    Notes
    -----
    Results are never stored by the parallel runs themselves, because
    racing conditions may happen. Storing is done afterwards by the
    caller's process.
    """
    n_workers = min(int(nprocs), len(param_list))

    def run_single_tsne(tsne_kwargs):
        # storing is switched off inside the run, see Notes
        return self.tsne(store_res=False, **tsne_kwargs)

    tsne_res_list = utils.parmap(run_single_tsne, param_list, n_workers)

    if store_res:
        for i, tsne_kwargs in enumerate(param_list):
            self.put_tsne(str(tsne_kwargs), tsne_res_list[i])
    return tsne_res_list
def detect_rare_samples(self, k, d_cutoff, n_iter, nprocs=1):
    """
    KNN rare sample detection with multiple parameter combinations

    Assuming that there are at least k samples look similar in this
    dataset, the samples with less than k similar neighbors may be rare.
    The rare samples can either be really distinct from the general
    population or caused by technical errors.

    This procedure iteratively detects samples according to their k-th
    nearest neighbors. The samples most distinct from its k-th nearest
    neighbors are detected first. Then, the left samples are detected
    by less stringent distance cutoff. The distance cutoff decreases
    linearly from maximum distance to d_cutoff with n_iter iterations.

    Parameters
    ----------
    k: int list or scalar
        K nearest neighbors to detect rare samples.
    d_cutoff: float list or scalar
        Samples with >= d_cutoff distances are distinct from each other.
        Minimum (>=) distance to be called as rare.
    n_iter: int list or scalar
        N progressive KNN detections on the dataset.
    nprocs: int
        N processes to run all parameter tuples. Capped at the number of
        parameter tuples.

    Returns
    -------
    res_list
        First element of the runner result of each corresponding
        parameter tuple. Documented as the indices of non-rare samples;
        confirm against `_rare_sample_detection_runner`.

    Raises
    ------
    ValueError
        If the parameter lists differ in length, or any `k` is outside
        `[1, n_samples - 1]`, any `d_cutoff` is `<= 0`, or any `n_iter`
        is `< 1`.
    Exception
        If `utils.parmap` hands back an exception object in place of a
        result, the first such exception is re-raised.

    Notes
    -----
    If parameters are provided as lists of equal length n, the n
    corresponding parameter tuples will be executed parallely.

    Example:

    `k = [10, 15, 20]`

    `d_cutoff = [1, 2, 3]`

    `n_iter = [10, 20, 30]`

    `(k, d_cutoff, n_iter)` tuples `(10, 1, 10), (15, 2, 20), (20, 3, 30)`
    will be tried parallely with nprocs.

    The full runner result of every parameter tuple is saved in
    `self._res_lut` unless an entry for that tuple already exists.
    """
    # Convert scalar to list
    k_list = [k] if np.isscalar(k) else list(k)
    d_cutoff_list = [d_cutoff] if np.isscalar(d_cutoff) else list(d_cutoff)
    n_iter_list = [n_iter] if np.isscalar(n_iter) else list(n_iter)

    # Check all param lists have the same length
    if not (len(k_list) == len(d_cutoff_list) == len(n_iter_list)):
        raise ValueError("Parameter should have the same length. "
                         "k: {}, d_cutoff: {}, n_iter: {}.".format(
                             k, d_cutoff, n_iter))

    # Validate and type-cast one parameter tuple at a time
    param_tups = []
    for k_i, d_cutoff_i, n_iter_i in zip(k_list, d_cutoff_list,
                                         n_iter_list):
        if k_i < 1 or k_i > self._sdm._x.shape[0] - 1:
            raise ValueError("k should be >= 1 and <= n_samples-1. "
                             "k: {}".format(k))
        k_i = int(k_i)

        if d_cutoff_i <= 0:
            raise ValueError("d_cutoff should be > 0. "
                             "d_cutoff: {}".format(d_cutoff))
        d_cutoff_i = float(d_cutoff_i)

        if n_iter_i < 1:
            raise ValueError("n_iter should be >= 1. "
                             "n_iter: {}".format(n_iter))
        n_iter_i = int(n_iter_i)

        param_tups.append((k_i, d_cutoff_i, n_iter_i))

    n_param_tups = len(param_tups)
    nprocs = min(int(nprocs), n_param_tups)

    # NOTE(review): runner presumably returns
    # (filtered_sdm, progress_list (list of kept indices)) -- confirm.
    res_list = utils.parmap(
        lambda ptup: self._rare_sample_detection_runner(*ptup),
        param_tups, nprocs)

    # A failed run must neither be cached nor surface later as an
    # unrelated TypeError when the result is subscripted.
    for res in res_list:
        if isinstance(res, Exception):
            raise res

    for i, ptup in enumerate(param_tups):
        if ptup not in self._res_lut:
            self._res_lut[ptup] = res_list[i]

    return [res[0] for res in res_list]
def knn_pickup_features(self, k, n_do, min_present_val, n_iter,
                        nprocs=1, statistic_fun=np.median):
    """
    Runs KNN pick-up on multiple parameter sets parallely.

    Each parameter set will be executed in one process. Parameter sets
    whose results are already in `self._res_lut` are not run again, and
    identical parameter sets within one call are run only once.

    Parameters
    ----------
    k: int list or scalar
        Look at k nearest neighbors to decide whether to pickup or not.
    n_do: int list or scalar
        Minimum (`>=`) number of above min_present_val neighbors among
        KNN to be called as drop-out, so that pick-up will be performed.
    min_present_val: float list or scalar
        Minimum (`>=`) values of a feature to be called as present.
    n_iter: int list or scalar
        The number of iterations to run.
    nprocs: int
        Number of processes. Capped at the number of parameter tuples.
    statistic_fun: callable
        The summary statistic used to correct gene dropouts. Default is
        median. Must accept a list of numbers and return a scalar.

    Returns
    -------
    kpu_sdm_list: list of SampleDistanceMatrix
        One SampleDistanceMatrix per parameter tuple, built from the
        picked-up data matrix with the metric, sids, fids and nprocs of
        `self._sdm`.

    Raises
    ------
    ValueError
        If `statistic_fun` fails on `[0, 1, 2]` or does not return a
        scalar, if the parameter lists differ in length, or if any `k`
        is outside `[1, n_samples)`, any `n_do` is outside `[1, k]`, or
        any `n_iter` is `< 1`.
    NotImplementedError
        If a parameter tuple that was not cached before the run is found
        in the cache after the run.
    Exception
        If `utils.parmap` hands back an exception object in place of a
        result, the first such exception is re-raised.

    Notes
    -----
    If parameters are provided as lists of equal length n, the n
    corresponding parameter tuples will be executed parallely.

    For every executed tuple `(k, n_do, min_present_val, n_iter,
    statistic_fun)`, the triple `(pu_x, pu_idc_arr, stats)` unpacked from
    the runner result is saved in `self._res_lut`.

    Example
    -------
    If `k = [10, 15]`, `n_do = [1, 2]`, `min_present_val = [5, 6]`, and
    `n_iter = [10, 20]`, `(k, n_do, min_present_val, n_iter)` tuples
    `(10, 1, 5, 10) and (15, 2, 6, 20)` will be tried parallely with
    nprocs.
    """
    stat_fun_err_msg = ("statistic_fun should be a function of a "
                        "list of numbers that returns a scalar.")
    try:
        # make sure that the function runs on list of numbers
        returns_scalar = np.isscalar(np.isreal(statistic_fun([0, 1, 2])))
    except Exception as err:
        raise ValueError(stat_fun_err_msg) from err
    if not returns_scalar:
        raise ValueError(stat_fun_err_msg)

    # Convert scalar to list
    k_list = [k] if np.isscalar(k) else list(k)
    n_do_list = [n_do] if np.isscalar(n_do) else list(n_do)
    if np.isscalar(min_present_val):
        min_present_val_list = [min_present_val]
    else:
        min_present_val_list = list(min_present_val)
    n_iter_list = [n_iter] if np.isscalar(n_iter) else list(n_iter)

    # Check all param lists have the same length
    if not (len(k_list) == len(n_do_list)
            == len(min_present_val_list) == len(n_iter_list)):
        raise ValueError("Parameter should have the same length. "
                         "k: {}, n_do: {}, min_present_val: {}, "
                         "n_iter: {}.".format(k, n_do, min_present_val,
                                              n_iter))

    # Validate and type-cast one parameter tuple at a time
    param_tups = []
    for k_i, n_do_i, min_present_val_i, n_iter_i in zip(
            k_list, n_do_list, min_present_val_list, n_iter_list):
        if k_i < 1 or k_i >= self._sdm._x.shape[0]:
            raise ValueError("k should be >= 1 and < n_samples. "
                             "k: {}".format(k))
        k_i = int(k_i)

        if n_do_i > k_i or n_do_i < 1:
            raise ValueError("n_do should be <= k and >= 1. "
                             "n_do: {}".format(n_do))
        n_do_i = int(n_do_i)

        min_present_val_i = float(min_present_val_i)

        if n_iter_i < 1:
            raise ValueError("n_iter should be >= 1. "
                             "n_iter: {}".format(n_iter))
        n_iter_i = int(n_iter_i)

        param_tups.append((k_i, n_do_i, min_present_val_i, n_iter_i,
                           statistic_fun))
    n_param_tups = len(param_tups)

    # use cached results with the following procedure
    # 1. put cached results to res_list, with not cached ones as None
    # 2. run not cached ones, each distinct parameter tuple only once
    # 3. after running, cache the results and fill res_list
    # TODO: abstract the running pattern into a function
    res_list = [None] * n_param_tups
    # not cached parameter tuple -> positions in res_list to be filled
    pending_inds = {}
    for i, ptup in enumerate(param_tups):
        if ptup in self._res_lut:
            res_list[i] = self._res_lut[ptup]
        else:
            pending_inds.setdefault(ptup, []).append(i)
    run_param_tups = list(pending_inds)

    # set up parameters for running
    run_param_setup_tups = []
    if run_param_tups:
        # use gzipped pickle bytecode to save space, because python
        # multiprocessing has a limit of sharing memory through pipe.
        # Skipped entirely when every parameter tuple is cached.
        gz_pb_x = gzip.compress(pickle.dumps(self._sdm._x))
        for ptup in run_param_tups:
            # assumes that the first element of the ptup is k
            run_param_setup_tups.append(
                (gz_pb_x, self._sdm.s_knn_ind_lut(ptup[0])) + ptup)

    nprocs = min(int(nprocs), n_param_tups)
    run_res_list = utils.parmap(
        lambda ptup: self._knn_pickup_features_runner(*ptup),
        run_param_setup_tups, nprocs)

    # A failed run must surface as its own exception rather than as an
    # unrelated TypeError when the result is subscripted below.
    for run_res in run_res_list:
        if isinstance(run_res, Exception):
            raise run_res

    for i, param_tup in enumerate(run_param_tups):
        # cache results
        if param_tup in self._res_lut:
            raise NotImplementedError("Unexpected scenario encountered")
        run_res = run_res_list[i]
        # results come back from this object's own runner as gzipped
        # pickles, mirroring how the input matrix was sent
        res_x = pickle.loads(gzip.decompress(run_res[0]))
        res_idc = pickle.loads(gzip.decompress(run_res[1]))
        res_tup = (res_x, res_idc, run_res[2])
        self._res_lut[param_tup] = res_tup
        # fill every position that asked for this parameter tuple
        for res_ind in pending_inds[param_tup]:
            res_list[res_ind] = res_tup

    kpu_sdm_list = []
    for res in res_list:
        kpu_sdm = eda.SampleDistanceMatrix(res[0],
                                           metric=self._sdm._metric,
                                           sids=self._sdm.sids,
                                           fids=self._sdm.fids,
                                           nprocs=self._sdm._nprocs)
        kpu_sdm_list.append(kpu_sdm)
    return kpu_sdm_list
def detect_rare_samples(self, k, d_cutoff, n_iter, nprocs=1, metric=None,
                        use_pca=False, use_hnsw=False, index_params=None,
                        query_params=None):
    """
    KNN rare sample detection with multiple parameter combinations

    Assuming that there are at least k samples look similar in this
    dataset, the samples with less than k similar neighbors may be rare.
    The rare samples can either be really distinct from the general
    population or caused by technical errors.

    This procedure iteratively detects samples according to their k-th
    nearest neighbors. The samples most distinct from its k-th nearest
    neighbors are detected first. Then, the left samples are detected
    by less stringent distance cutoff. The distance cutoff decreases
    linearly from maximum distance to d_cutoff with n_iter iterations.

    Parameters
    ----------
    k: int list or scalar
        K nearest neighbors to detect rare samples.
    d_cutoff: float list or scalar
        Samples with >= d_cutoff distances are distinct from each other.
        Minimum (>=) distance to be called as rare.
    n_iter: int list or scalar
        N progressive KNN detections on the dataset.
    nprocs: int
        N processes to run all parameter tuples. Capped at the number of
        parameter tuples.
    metric: {'cosine', 'euclidean', None}
        If none, self._sdm._metric is used.
    use_pca: bool
        Use PCA for nearest neighbors or not.
    use_hnsw: bool
        Use Hierarchical Navigable Small World graph to compute
        approximate nearest neighbor.
    index_params: dict
        Parameters used by HNSW in indexing.

        efConstruction: int
            Default 100. Higher value improves the quality of a
            constructed graph and leads to higher accuracy of search.
            However this also leads to longer indexing times. The
            reasonable range of values is 100-2000.
        M: int
            Default 5. Higher value leads to better recall and shorter
            retrieval times, at the expense of longer indexing time. The
            reasonable range of values is 5-100.
        delaunay_type: {0, 1, 2, 3}
            Default 2. Pruning heuristic, which affects the trade-off
            between retrieval performance and indexing time. The default
            is usually quite good.
        post: {0, 1, 2}
            Default 0. The amount and type of postprocessing applied to
            the constructed graph. 0 means no processing. 2 means more
            processing.
        indexThreadQty: int
            Default self._nprocs. The number of threads used.
    query_params: dict
        Parameters used by HNSW in querying.

        efSearch: int
            Default 100. Higher value improves recall at the expense of
            longer retrieval time. The reasonable range of values is
            100-2000.

    Returns
    -------
    res_list
        First element of the runner result of each corresponding
        parameter tuple. Documented as the indices of non-rare samples;
        confirm against the runner methods.

    Raises
    ------
    ValueError
        If the parameter lists differ in length, or any `k` is outside
        `[1, n_samples - 1]`, any `d_cutoff` is `<= 0`, or any `n_iter`
        is `< 1`.
    Exception
        If `utils.parmap` hands back an exception object in place of a
        result, the first such exception is re-raised.

    Notes
    -----
    If parameters are provided as lists of equal length n, the n
    corresponding parameter tuples will be executed parallely.

    Example:

    `k = [10, 15, 20]`

    `d_cutoff = [1, 2, 3]`

    `n_iter = [10, 20, 30]`

    `(k, d_cutoff, n_iter)` tuples `(10, 1, 10), (15, 2, 20), (20, 3, 30)`
    will be tried parallely with nprocs.

    The full runner result of every parameter tuple is saved in
    `self._res_lut` under the key `(k, d_cutoff, n_iter)` unless that key
    already exists. `metric`, `use_pca`, `use_hnsw`, `index_params` and
    `query_params` are not part of the key.
    """
    # Convert scalar to list
    k_list = [k] if np.isscalar(k) else list(k)
    d_cutoff_list = [d_cutoff] if np.isscalar(d_cutoff) else list(d_cutoff)
    n_iter_list = [n_iter] if np.isscalar(n_iter) else list(n_iter)

    # Check all param lists have the same length
    if not (len(k_list) == len(d_cutoff_list) == len(n_iter_list)):
        raise ValueError("Parameter should have the same length. "
                         "k: {}, d_cutoff: {}, n_iter: {}.".format(
                             k, d_cutoff, n_iter))

    # Validate and type-cast one parameter tuple at a time
    param_tups = []
    for k_i, d_cutoff_i, n_iter_i in zip(k_list, d_cutoff_list,
                                         n_iter_list):
        if k_i < 1 or k_i > self._sdm._x.shape[0] - 1:
            raise ValueError("k should be >= 1 and <= n_samples-1. "
                             "k: {}".format(k))
        k_i = int(k_i)

        if d_cutoff_i <= 0:
            raise ValueError("d_cutoff should be > 0. "
                             "d_cutoff: {}".format(d_cutoff))
        d_cutoff_i = float(d_cutoff_i)

        if n_iter_i < 1:
            raise ValueError("n_iter should be >= 1. "
                             "n_iter: {}".format(n_iter))
        n_iter_i = int(n_iter_i)

        param_tups.append((k_i, d_cutoff_i, n_iter_i, metric, use_pca,
                           use_hnsw, index_params, query_params))

    n_param_tups = len(param_tups)
    nprocs = min(int(nprocs), n_param_tups)

    # The runner depends on whether the sample distance matrix uses
    # pairwise distances.
    # NOTE(review): runners presumably return
    # (filtered_sdm, progress_list (list of kept indices)) -- confirm.
    if self._sdm._use_pdist:
        runner = self._pdist_rare_s_detect
    else:
        runner = self._no_pdist_rare_s_detect
    res_list = utils.parmap(lambda ptup: runner(*ptup), param_tups, nprocs)

    # A failed run must neither be cached nor surface later as an
    # unrelated TypeError when the result is subscripted.
    for res in res_list:
        if isinstance(res, Exception):
            raise res

    for i, ptup in enumerate(param_tups):
        # only use k, d, and n_iter for res saving
        param_key = ptup[:3]
        if param_key not in self._res_lut:
            self._res_lut[param_key] = res_list[i]

    return [res[0] for res in res_list]
def test_parmap_tup():
    """parmap over a tuple gives the mapped values back as a list."""
    squares = utils.parmap(lambda val: val ** 2, (1, 2, 3))
    expected = [1, 4, 9]
    assert isinstance(squares, list)
    assert squares == expected
def test_parmap_exception_mp():
    """With 10 processes, failing calls warn and yield their exception."""
    n_items = 1000
    with pytest.warns(UserWarning, match='division by zero'):
        results = utils.parmap(lambda val: val / 0, range(n_items),
                               nprocs=10)
    # every entry is the exception of its call, not a value
    assert all(isinstance(res, ZeroDivisionError) for res in results)
def test_parmap_gen_mp():
    """With 10 processes, parmap over a range matches the serial result."""
    n_items = 1000
    squares = utils.parmap(lambda val: val ** 2, range(n_items), nprocs=10)
    expected = [val ** 2 for val in range(n_items)]
    assert isinstance(squares, list)
    assert squares == expected
def test_parmap_invalid_nprocs():
    """parmap raises ValueError when called with nprocs=0.5."""
    arr = np.array([[1, 2], [3, 4]])
    with pytest.raises(ValueError):
        utils.parmap(lambda row: row ** 2, arr, nprocs=0.5)
def test_parmap_arr2d():
    """parmap over a 2x2 array maps each row and returns a list."""
    row_squares = utils.parmap(lambda row: row ** 2,
                               np.array([[1, 2], [3, 4]]))
    assert isinstance(row_squares, list)
    first_row, second_row = row_squares[0], row_squares[1]
    assert np.all(first_row == np.array([1, 4]))
    assert np.all(second_row == np.array([9, 16]))
def test_parmap_arr2d_single_column():
    """parmap over a 3x1 array maps each single-element row.

    Carries its own name: a second `test_parmap_arr2d` in the same module
    would replace the first one, so only one of the two tests would be
    collected and run.
    """
    pm_res = utils.parmap(lambda x: x**2, np.array([1, 2, 3]).reshape(3, 1))
    assert isinstance(pm_res, list)
    # each element is a length-1 array, which compares equal to its scalar
    assert pm_res == [1, 4, 9]
def test_parmap_gen():
    """parmap over a range gives the mapped values back as a list."""
    squares = utils.parmap(lambda val: val ** 2, range(1, 4))
    expected = [1, 4, 9]
    assert isinstance(squares, list)
    assert squares == expected
def test_parmap_lst():
    """parmap over a list gives the mapped values back as a list."""
    squares = utils.parmap(lambda val: val ** 2, [1, 2, 3])
    expected = [1, 4, 9]
    assert isinstance(squares, list)
    assert squares == expected