def test_pairwise_distances_unsuppored_metrics(metric):
    """Expect pairwise_distances to raise ValueError for the given metric."""
    # A fixed seed keeps the small 5x4 input reproducible between runs.
    samples = np.random.RandomState(3).random_sample((5, 4))

    with pytest.raises(ValueError):
        pairwise_distances(samples, metric=metric)
def test_pairwise_distances_sklearn_comparison(metric: str, matrix_size):
    """Compare pairwise_distances with sklearn on fp64 and fp32 inputs.

    The distance computation always runs; the comparison against the sklearn
    reference is only made when the input holds at most 2,000,000 elements.

    Parameters
    ----------
    metric : str
        Metric name forwarded to both implementations.
    matrix_size : sequence of two ints
        Shape used for both X and Y.
    """
    rng = np.random.RandomState(1)

    element_count = matrix_size[0] * matrix_size[1]

    X = rng.random_sample(matrix_size)
    Y = rng.random_sample(matrix_size)

    # For fp64, compare at 10 decimals (5 places less than the ~15 max)
    compare_precision = 10

    # Compare to sklearn, fp64
    S = pairwise_distances(X, Y, metric=metric)

    if element_count <= 2000000:
        S2 = sklearn_pairwise_distances(X, Y, metric=metric)
        cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # For fp32, compare at 4 decimals (3 places less than the ~7 max)
    compare_precision = 4

    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit
    # float dtype performs the same conversion.
    X = np.asarray(X, dtype=np.float32)
    Y = np.asarray(Y, dtype=np.float32)

    # Compare to sklearn, fp32
    S = pairwise_distances(X, Y, metric=metric)

    if element_count <= 2000000:
        S2 = sklearn_pairwise_distances(X, Y, metric=metric)
        cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)
def mmr(
    doc_embedding,
    word_embeddings,
    words,
    top_n=5,
    diversity=0.8,
):
    """
    Calculate Maximal Marginal Relevance (MMR) between candidate keywords
    and the document.

    MMR considers the similarity of keywords/keyphrases with the document,
    along with the similarity to the keywords and keyphrases that were
    already selected. Each step picks the candidate maximising
    ``(1 - diversity) * similarity_to_document
    - diversity * max_similarity_to_selected``.

    Arguments:
        doc_embedding: The document embeddings
            (assumed to be a single row - TODO confirm against callers)
        word_embeddings: The embeddings of the selected candidate
            keywords/phrases
        words: The selected candidate keywords/keyphrases
        top_n: The number of keywords/keyphrases to return. At most
            ``len(words)`` keywords are returned; the best-matching keyword
            is always returned, even for ``top_n < 1``.
        diversity: How diverse the selected keywords/keyphrases are.
            Values between 0 and 1 with 0 being not diverse at all
            and 1 being most diverse.

    Returns:
        List[str]: The selected keywords/keyphrases
    """
    # Extract similarity within words, and between words and the document
    word_doc_similarity = 1 - pairwise_distances(
        word_embeddings, doc_embedding, metric="cosine")
    word_similarity = 1 - pairwise_distances(word_embeddings, metric="cosine")

    # Start from the keyword closest to the document. Plain Python ints are
    # kept in the index lists so membership/removal is unambiguous and the
    # final lookup in `words` works for a single keyword too.
    best_idx = int(cp.argmax(word_doc_similarity))
    keywords_idx = [best_idx]
    candidates_idx = [i for i in range(len(words)) if i != best_idx]

    # Never iterate past the number of remaining candidates; otherwise the
    # argmax below would run on an empty selection.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        candidate_similarities = word_doc_similarity[candidates_idx, :]

        # Highest similarity of each candidate to any selected keyword.
        target_similarities = cp.max(
            word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate MMR
        mmr_scores = (
            (1 - diversity) * candidate_similarities
            - diversity * target_similarities.reshape(-1, 1)
        )
        chosen_idx = candidates_idx[int(cp.argmax(mmr_scores))]

        # Update keywords & candidates
        keywords_idx.append(chosen_idx)
        candidates_idx.remove(chosen_idx)

    return [words[idx] for idx in keywords_idx]
def test_pairwise_distances(metric: str, matrix_size, is_col_major):
    """Test the pairwise_distances helper against sklearn.

    Parameters
    ----------
    metric : str
        Metric name forwarded to both implementations.
    matrix_size : sequence of two ints
        Shape of the primary input X.
    is_col_major : bool
        When true, inputs are converted to Fortran (column-major) order.
    """
    rng = np.random.RandomState(0)

    def prep_array(array):
        return np.asfortranarray(array) if is_col_major else array

    # For fp64, compare at 10 decimals (5 places less than the ~15 max)
    compare_precision = 10

    # Compare to sklearn, single input
    X = prep_array(rng.random_sample(matrix_size))
    S = pairwise_distances(X, metric=metric)
    S2 = sklearn_pairwise_distances(X, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, double input with same dimensions
    Y = X
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare single and double inputs to each other
    S = pairwise_distances(X, metric=metric)
    S2 = pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Compare to sklearn, with Y dim != X dim
    Y = prep_array(rng.random_sample((2, matrix_size[1])))
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Change precision of one parameter.
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit
    # float dtype performs the same conversion.
    Y = np.asarray(Y, dtype=np.float32)
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # For fp32, compare at 2 decimals (5 places less than the ~7 max)
    compare_precision = 2

    # Change precision of both parameters to float
    X = np.asarray(X, dtype=np.float32)
    Y = np.asarray(Y, dtype=np.float32)
    S = pairwise_distances(X, Y, metric=metric)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Test sending an int type with convert_dtype=True
    Y = prep_array(rng.randint(10, size=Y.shape))
    S = pairwise_distances(X, Y, metric=metric, convert_dtype=True)
    S2 = sklearn_pairwise_distances(X, Y, metric=metric)
    cp.testing.assert_array_almost_equal(S, S2, decimal=compare_precision)

    # Test that a capitalized metric name throws an error.
    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric=metric.capitalize())
def test_pairwise_distances_one_dimension_order(metric: str):
    """Test pairwise_distances when one input has a single row.

    These cases can break down when a size of 1 is used for either
    dimension, so every C/Fortran memory-order combination is compared with
    sklearn, first with a single-row X and then with a single-row Y.
    """
    rng = np.random.RandomState(2)

    # For fp64, compare at 13 decimals (2 places less than the ~15 max)
    compare_precision = 13

    def check_all_orders(x_shape, y_shape):
        # Draw fresh inputs and compare C/C, C/F, F/C and F/F layouts.
        Xc = rng.random_sample(x_shape)
        Yc = rng.random_sample(y_shape)
        Xf = np.asfortranarray(Xc)
        Yf = np.asfortranarray(Yc)

        for X, Y in ((Xc, Yc), (Xc, Yf), (Xf, Yc), (Xf, Yf)):
            S = pairwise_distances(X, Y, metric=metric)
            S2 = sklearn_pairwise_distances(X, Y, metric=metric)
            cp.testing.assert_array_almost_equal(S,
                                                 S2,
                                                 decimal=compare_precision)

    # X has the single row
    check_all_orders((1, 4), (10, 4))

    # Switch which input has the single row. The shapes were previously
    # repeated unchanged here, so this case was never exercised.
    check_all_orders((10, 4), (1, 4))
def laplacian_kernel(X, Y, gamma=None):
    """Return ``exp(-gamma * d)`` where ``d`` is the pairwise manhattan distance.

    ``gamma`` defaults to ``1.0 / X.shape[1]`` when it is None.
    """
    scale = 1.0 / X.shape[1] if gamma is None else gamma

    manhattan = cp.asarray(pairwise_distances(X, Y, metric='manhattan'))
    kernel = -scale * manhattan

    # Exponentiate in place so no second matrix is allocated.
    cp.exp(kernel, out=kernel)
    return kernel
def test_pairwise_distances_output_types(input_type, output_type, use_global):
    """Check that pairwise_distances returns the requested container type.

    Parameters
    ----------
    input_type : str
        "cudf" or "cupy" convert the inputs accordingly; any other value
        leaves them as the generated NumPy arrays.
    output_type : str
        Output type under test ("input", "cudf", "numpy" or "cupy").
    use_global : bool
        When true the output type is supplied only through
        ``cuml.using_output_type``; otherwise it is also passed as the
        ``output_type`` argument.
    """
    rng = np.random.RandomState(5)

    X = rng.random_sample((100, 100))
    Y = rng.random_sample((100, 100))

    if input_type == "cudf":
        X = cudf.DataFrame(X)
        Y = cudf.DataFrame(Y)
    elif input_type == "cupy":
        X = cp.asarray(X)
        Y = cp.asarray(Y)

    # Set to None if we are using the global object
    output_type_param = None if use_global else output_type

    # Use the global manager object. Should do nothing unless use_global is set
    with cuml.using_output_type(output_type):
        S = pairwise_distances(X,
                               Y,
                               metric="euclidean",
                               output_type=output_type_param)

        if output_type == "input":
            assert isinstance(S, type(X))
        elif output_type == "cudf":
            assert isinstance(S, cudf.DataFrame)
        elif output_type == "numpy":
            assert isinstance(S, np.ndarray)
        elif output_type == "cupy":
            # Use the public alias: the `cupy.core` module path is gone in
            # newer CuPy releases.
            assert isinstance(S, cp.ndarray)
def _compute_spearman_rho(self, fp_sample, Xt_sample, top_k=100):
    """Return the NaN-ignoring mean of the Spearman correlation between the
    Tanimoto distances of ``fp_sample`` and the pairwise distances of
    ``Xt_sample``.

    ``fp_sample`` is unwrapped through its ``values`` attribute when it has
    one. ``top_k`` is forwarded to ``spearmanr``.
    """
    fingerprints = getattr(fp_sample, 'values', fp_sample)

    tanimoto_dist = tanimoto_calculate(fingerprints, calc_distance=True)
    # Default metric of pairwise_distances (presumably euclidean - confirm).
    embedding_dist = pairwise_distances(Xt_sample)

    rho = spearmanr(tanimoto_dist, embedding_dist, top_k=top_k)
    return cupy.nanmean(rho)
def rbf_kernel(X, Y, gamma=None):
    """Return ``exp(-gamma * d)`` where ``d`` is the pairwise squared
    euclidean distance.

    ``gamma`` defaults to ``1.0 / X.shape[1]`` when it is None.
    """
    scale = 1.0 / X.shape[1] if gamma is None else gamma

    kernel = cp.asarray(pairwise_distances(X, Y, metric='sqeuclidean'))

    # Scale and exponentiate in place to avoid temporaries.
    kernel *= -scale
    cp.exp(kernel, out=kernel)
    return kernel
def _calculate_metric(self, embeddings, fingerprints, top_k=None):
    """Correlate fingerprint Tanimoto distances with embedding distances.

    Returns whatever ``spearmanr`` produces for the two distance matrices,
    with ``top_k`` passed through as its third positional argument.
    """
    dist_from_embeddings = pairwise_distances(embeddings)
    # Drop the local reference as soon as the input is no longer needed
    # (presumably to let the memory be reclaimed early - confirm).
    del embeddings

    dist_from_fingerprints = tanimoto_calculate(fingerprints,
                                                calc_distance=True)
    del fingerprints

    return spearmanr(dist_from_fingerprints, dist_from_embeddings, top_k)
def test_run_spearman_rho(pca_approved_drugs_csv,
                          fingerprint_approved_drugs_csv, cluster_column,
                          n_dims_eucl_data, top_k):
    """Validate the spearman rho scoring against the CPU implementation.

    Two comparisons are made, each with an absolute tolerance of 0.005 and
    NaNs treated as equal: one on the full distance matrices and one
    restricted to the ``top_k`` closest Tanimoto neighbours per row.
    """
    # Load PCA data to use as Euclidean distances
    pca_data = pd.read_csv(pca_approved_drugs_csv).set_index('molregno').drop(
        cluster_column, axis=1)
    float_data = pca_data[pca_data.columns[:n_dims_eucl_data]]
    euclidean_dist = pairwise_distances(cupy.array(float_data))

    # Load fingerprints and calculate tanimoto distance
    fp_data = pd.read_csv(fingerprint_approved_drugs_csv).set_index('molregno')
    tanimoto_dist = tanimoto_calculate(cupy.array(fp_data), calc_distance=True)

    # Check all data compared to the CPU version
    all_data_gpu = spearmanr(tanimoto_dist, euclidean_dist)

    euclidean_dist_cpu = cupy.asnumpy(euclidean_dist)
    tanimoto_dist_cpu = cupy.asnumpy(tanimoto_dist)
    all_data_cpu = _rowwise_numpy_corr(tanimoto_dist_cpu, euclidean_dist_cpu,
                                       spearmanr_cpu)

    # The result of allclose must be asserted; a bare call checks nothing.
    assert cupy.allclose(cupy.array(all_data_cpu),
                         all_data_gpu,
                         atol=0.005,
                         equal_nan=True)

    # Check using top k calculation compared to the CPU version
    top_k_data_gpu = spearmanr(tanimoto_dist,
                               euclidean_dist,
                               top_k=top_k,
                               axis=1)

    # Reproduce the top-k masking on the inputs of the CPU reference:
    # blank the diagonal, then every entry beyond the k-th unique
    # Tanimoto distance of its row.
    cupy.fill_diagonal(tanimoto_dist, cupy.nan)
    kth_lim = get_kth_unique_value(tanimoto_dist, top_k, axis=1)
    mask = tanimoto_dist > kth_lim
    tanimoto_dist[mask] = cupy.nan
    euclidean_dist[mask] = cupy.nan

    euclidean_dist_cpu = cupy.asnumpy(euclidean_dist)
    tanimoto_dist_cpu = cupy.asnumpy(tanimoto_dist)
    top_k_data_cpu = _rowwise_numpy_corr(tanimoto_dist_cpu,
                                         euclidean_dist_cpu, spearmanr_cpu)

    assert cupy.allclose(cupy.array(top_k_data_cpu),
                         top_k_data_gpu,
                         atol=0.005,
                         equal_nan=True)
def test_pairwise_distances_exceptions():
    """Check the errors pairwise_distances raises for invalid inputs."""
    rng = np.random.RandomState(4)

    X_int = rng.randint(10, size=(5, 4))
    X_double = rng.random_sample((5, 4))
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit
    # float dtype performs the same conversion.
    X_float = np.asarray(X_double, dtype=np.float32)
    X_bool = rng.choice([True, False], size=(5, 4))

    # Test int inputs (only float/double accepted at this time)
    with pytest.raises(TypeError):
        pairwise_distances(X_int, metric="euclidean")

    # Test second int inputs (should not have an exception with
    # convert_dtype=True)
    pairwise_distances(X_double, X_int, metric="euclidean")

    # Test bool inputs (only float/double accepted at this time)
    with pytest.raises(TypeError):
        pairwise_distances(X_bool, metric="euclidean")

    # Test sending different types with convert_dtype=False
    with pytest.raises(TypeError):
        pairwise_distances(X_double,
                           X_float,
                           metric="euclidean",
                           convert_dtype=False)

    # Invalid metric name
    with pytest.raises(ValueError):
        pairwise_distances(X_double, metric="Not a metric")

    # Invalid dimensions
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((5, 7))

    with pytest.raises(ValueError):
        pairwise_distances(X, Y, metric="euclidean")
def tree_epg(
    X,
    Nodes: int = None,
    init: Optional[DataFrame] = None,
    lam: Optional[Union[float, int]] = 0.01,
    mu: Optional[Union[float, int]] = 0.1,
    trimmingradius: Optional = np.inf,
    initnodes: int = None,
    device: str = "cpu",
    seed: Optional[int] = None,
    verbose: bool = True,
):
    """Fit an elastic principal tree with ElPiGraph and build its graph dict.

    Parameters
    ----------
    X
        Data to fit; accessed through ``X.values`` and ``X.index``.
    Nodes
        Number of nodes, passed to ElPiGraph as ``NumNodes``.
    init
        Not used by this function.
    lam
        Passed to ElPiGraph as ``Lambda``.
    mu
        Passed to ElPiGraph as ``Mu``.
    trimmingradius
        Passed to ElPiGraph as ``TrimmingRadius``.
    initnodes
        Passed to ElPiGraph as ``InitNodes``.
    device
        ``"gpu"`` runs ElPiGraph with ``GPU=True`` and computes distances
        with cuml/cupy; any other value uses sklearn on the CPU.
    seed
        When not None, seeds NumPy's global random generator.
    verbose
        Passed to ElPiGraph as ``verbose``.

    Returns
    -------
    graph : dict
        Keys ``B`` (adjacency matrix), ``R`` (one-hot assignment of each
        row of X to its closest node), ``F`` (transposed node positions),
        ``tips``, ``forks``, ``cells_fitted`` and ``metrics``.
    tree : dict
        First element of the ElPiGraph result, with ``Edges`` converted
        to a list.

    Raises
    ------
    ImportError
        If the ``elpigraph`` package cannot be imported.
    """
    try:
        import elpigraph
    except ImportError as err:
        # A warning alone would let execution continue and fail with an
        # unbound-name error at the first use of `elpigraph`.
        raise ImportError(
            'ElPiGraph package is not installed\n'
            'Please use "pip install '
            'git+https://github.com/j-bac/elpigraph-python.git" '
            'to install it') from err

    logg.hint(f"parameters used \n {Nodes} principal points, "
              f"mu = {mu}, lambda = {lam}")

    if seed is not None:
        np.random.seed(seed)

    # Arguments shared by the CPU and GPU fits.
    fit_kwargs = dict(
        NumNodes=Nodes,
        Do_PCA=False,
        InitNodes=initnodes,
        Lambda=lam,
        Mu=mu,
        TrimmingRadius=trimmingradius,
        verbose=verbose,
    )
    data = X.values.astype(np.float64)

    if device == "gpu":
        import cupy as cp
        from cuml.metrics import pairwise_distances

        Tree = elpigraph.computeElasticPrincipalTree(data,
                                                     GPU=True,
                                                     **fit_kwargs)
        R = pairwise_distances(cp.asarray(X.values),
                               cp.asarray(Tree[0]["NodePositions"]))
        R = cp.asnumpy(R)
    else:
        from sklearn.metrics import pairwise_distances

        Tree = elpigraph.computeElasticPrincipalTree(data, **fit_kwargs)
        R = pairwise_distances(X.values, Tree[0]["NodePositions"])

    # Hard assignment: a single 1 per row, in the column of the closest
    # node. `.toarray()` replaces the `.A` shorthand, which recent SciPy
    # releases no longer provide.
    n_cells = R.shape[0]
    R = sparse.csr_matrix(
        (np.repeat(1, n_cells), (np.arange(n_cells), R.argmin(axis=1))),
        R.shape,
    ).toarray()

    edges = Tree[0]["Edges"][0]
    g = igraph.Graph(directed=False)
    g.add_vertices(np.unique(edges.flatten().astype(int)))
    g.add_edges(pd.DataFrame(edges).astype(int).apply(tuple, axis=1).values)

    B = np.asarray(g.get_adjacency().data)

    # Tips have exactly one neighbour, forks more than two.
    degrees = np.array(g.degree())
    tips = np.argwhere(degrees == 1).flatten()
    forks = np.argwhere(degrees > 2).flatten()

    graph = {
        "B": B,
        "R": R,
        "F": Tree[0]["NodePositions"].T,
        "tips": tips,
        "forks": forks,
        "cells_fitted": X.index.tolist(),
        "metrics": "euclidean",
    }

    Tree[0]["Edges"] = list(Tree[0]["Edges"])

    return graph, Tree[0]
def cosine_similarity(X, Y):
    """Return ``1 - cosine distance`` between X and Y, with NaNs (and
    infinities) replaced by ``cp.nan_to_num``."""
    cosine_dist = cp.asarray(pairwise_distances(X, Y, metric='cosine'))
    similarity = 1.0 - cosine_dist

    # copy=False: clean the freshly created matrix in place.
    return cp.nan_to_num(similarity, copy=False)
def dbscan_gpu(model,
               counts_per_word,
               embeddings=None,
               sim_thresh=0.8,
               min_samples=5,
               min_occs=1000,
               verbose=False,
               s2v=False):
    """Cluster word vectors with DBSCAN on precomputed cosine distances.

    Parameters
    ----------
    model
        Source of the word vectors when ``embeddings`` is None. With
        ``s2v`` it is used as a mapping (``model.keys()``, ``model[w]``),
        otherwise through ``model.wv``.
    counts_per_word
        2-D array indexed as ``[:, 1]`` for the counts and unpacked as
        ``(word, count)`` rows. Presumably sorted by decreasing count:
        everything before the first row under ``min_occs`` is kept.
    embeddings
        Optional precomputed vectors, one row per item. When given,
        ``model`` and the count filter are not used.
    sim_thresh
        Cosine similarity threshold; DBSCAN runs with
        ``eps = 1 - sim_thresh``.
    min_samples
        Forwarded to DBSCAN.
    min_occs
        Minimum count for a word to be kept.
    verbose
        Print the cluster sizes and the rows of each cluster.
    s2v
        Selects how ``model`` is accessed (see ``model``).

    Returns
    -------
    clust_labels
        Cluster label of every clustered item.
    words_kept
        The clustered words, or stringified row indices when
        ``embeddings`` was given.

    Raises
    ------
    Exception
        If no word reaches ``min_occs``.
    """
    if embeddings is None:
        # Keep only words with at least min_occs occurences.
        below_threshold = counts_per_word[:, 1].astype(int) < min_occs

        # np.argmax returns 0 for an all-False mask, which would wrongly
        # look like "nothing to keep" when every word qualifies.
        if below_threshold.any():
            nb_to_keep = int(np.argmax(below_threshold))
        else:
            nb_to_keep = len(below_threshold)

        if nb_to_keep == 0:
            raise Exception(
                f'dbscan : No word with more than {min_occs} occurences')

        if s2v:
            model_words = set(model.keys())
            vector_source = model
        else:
            model_words = set(model.wv.index_to_key)
            vector_source = model.wv

        words_kept = np.array([
            word for word, count in counts_per_word[:nb_to_keep]
            if word in model_words
        ])
        vectors = np.array([vector_source[w] for w in words_kept])
    else:
        vectors = embeddings
        words_kept = np.arange(len(embeddings)).astype(str)

    # Create and populate a GPU DataFrame, one column per vector dimension.
    X = cudf.DataFrame()
    for col, values in enumerate(vectors.transpose()):
        X[col] = values

    X = pairwise_distances(X, metric='cosine')

    # Setup and fit clusters on the precomputed distance matrix.
    clustering = DBSCAN(eps=1 - sim_thresh,
                        min_samples=min_samples,
                        metric="precomputed").fit(X)
    # NOTE(review): `to_array` may be missing on newer cudf releases -
    # confirm against the pinned version.
    clust_labels = clustering.labels_.to_array()

    if verbose:
        print(np.bincount(clust_labels + 1)[1:])
        for e in range(clust_labels.max() + 1):
            print(f"Topic {e} :")
            # NOTE(review): labels follow `words_kept`, which can be shorter
            # than the leading rows of counts_per_word when words are
            # missing from the model - the printed rows may then be
            # misaligned. Confirm the intended output before changing.
            tags = np.array(counts_per_word)[:len(clust_labels)][clust_labels
                                                                 == e]
            for tag in tags:
                print(f"\t{tag}")

    return clust_labels, words_kept
def curve_epg(
    adata: AnnData,
    Nodes: int = None,
    use_rep: str = None,
    ndims_rep: Optional[int] = None,
    init: Optional[DataFrame] = None,
    lam: Optional[Union[float, int]] = 0.01,
    mu: Optional[Union[float, int]] = 0.1,
    trimmingradius: Optional = np.inf,
    initnodes: int = None,
    device: str = "cpu",
    seed: Optional[int] = None,
    verbose: bool = True,
):
    """Fit an elastic principal curve with ElPiGraph and store it in adata.

    Parameters
    ----------
    adata
        Annotated data object; the representation to fit is obtained via
        ``get_data(adata, use_rep, ndims_rep)``.
    Nodes
        Number of nodes, passed to ElPiGraph as ``NumNodes``.
    use_rep
        Forwarded to ``get_data``.
    ndims_rep
        Forwarded to ``get_data``.
    init
        Not used by this function.
    lam
        Passed to ElPiGraph as ``Lambda``.
    mu
        Passed to ElPiGraph as ``Mu``.
    trimmingradius
        Passed to ElPiGraph as ``TrimmingRadius``.
    initnodes
        Passed to ElPiGraph as ``InitNodes``.
    device
        ``"gpu"`` runs ElPiGraph with ``GPU=True`` and computes distances
        with cuml/cupy; any other value uses sklearn on the CPU.
    seed
        When not None, seeds NumPy's global random generator.
    verbose
        Passed to ElPiGraph as ``verbose``.

    Returns
    -------
    adata
        The same object, with ``.uns['graph']`` (keys ``B``, ``R``, ``F``,
        ``tips``, ``forks``, ``cells_fitted``, ``metrics``) and
        ``.uns['epg']`` (first element of the ElPiGraph result) set.

    Raises
    ------
    ImportError
        If the ``elpigraph`` package cannot be imported.
    """
    try:
        import elpigraph
    except ImportError as err:
        # A warning alone would let execution continue and fail with an
        # unbound-name error at the first use of `elpigraph`.
        raise ImportError(
            'ElPiGraph package is not installed\n'
            'Please use "pip install '
            'git+https://github.com/j-bac/elpigraph-python.git" '
            'to install it') from err

    X = get_data(adata, use_rep, ndims_rep)

    if seed is not None:
        np.random.seed(seed)

    # Arguments shared by the CPU and GPU fits.
    fit_kwargs = dict(
        NumNodes=Nodes,
        Do_PCA=False,
        InitNodes=initnodes,
        Lambda=lam,
        Mu=mu,
        TrimmingRadius=trimmingradius,
        verbose=verbose,
    )
    data = X.values.astype(np.float64)

    if device == "gpu":
        import cupy as cp
        from cuml.metrics import pairwise_distances

        Curve = elpigraph.computeElasticPrincipalCurve(data,
                                                       GPU=True,
                                                       **fit_kwargs)
        R = pairwise_distances(cp.asarray(X.values),
                               cp.asarray(Curve[0]["NodePositions"]))
        R = cp.asnumpy(R)
    else:
        from sklearn.metrics import pairwise_distances

        Curve = elpigraph.computeElasticPrincipalCurve(data, **fit_kwargs)
        R = pairwise_distances(X.values, Curve[0]["NodePositions"])

    # Hard assignment: a single 1 per row, in the column of the closest
    # node. `.toarray()` replaces the `.A` shorthand, which recent SciPy
    # releases no longer provide.
    n_cells = R.shape[0]
    R = sparse.csr_matrix(
        (np.repeat(1, n_cells), (np.arange(n_cells), R.argmin(axis=1))),
        R.shape,
    ).toarray()

    edges = Curve[0]["Edges"][0]
    g = igraph.Graph(directed=False)
    g.add_vertices(np.unique(edges.flatten().astype(int)))
    g.add_edges(pd.DataFrame(edges).astype(int).apply(tuple, axis=1).values)

    B = np.asarray(g.get_adjacency().data)

    # Tips have exactly one neighbour, forks more than two.
    degrees = np.array(g.degree())
    tips = np.argwhere(degrees == 1).flatten()
    forks = np.argwhere(degrees > 2).flatten()

    graph = {
        "B": B,
        "R": R,
        "F": Curve[0]["NodePositions"].T,
        "tips": tips,
        "forks": forks,
        "cells_fitted": X.index.tolist(),
        "metrics": "euclidean",
    }

    Curve[0]["Edges"] = list(Curve[0]["Edges"])

    adata.uns["graph"] = graph
    adata.uns["epg"] = Curve[0]

    logg.info(" finished",
              time=True,
              end=" " if settings.verbosity > 2 else "\n")
    logg.hint(
        "added \n"
        " .uns['epg'] dictionnary containing inferred elastic curve generated from elpigraph.\n"
        " .uns['graph']['B'] adjacency matrix of the principal points.\n"
        " .uns['graph']['R'] hard assignment of cells to principal point in representation space.\n"
        " .uns['graph']['F'], coordinates of principal points in representation space."
    )

    return adata
def score_samples(self, X):
    """Compute the log-likelihood of each sample under the model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        An array of points to query. Last dimension should match
        dimension of training data (n_features).

    Returns
    -------
    density : ndarray of shape (n_samples,)
        Log-likelihood of each sample in `X`. These are normalized to be
        probability densities, so values will be low for high-dimensional
        data.

    Raises
    ------
    NotFittedError
        If the estimator has no ``X_`` attribute.
    ValueError
        If ``metric_params`` holds more than one entry, or the kernel is
        not one of ``log_probability_kernels_``.
    """
    if not hasattr(self, "X_"):
        raise NotFittedError()

    X_cuml = input_to_cuml_array(X)
    query = X_cuml.array

    # Only a single extra metric argument can be forwarded.
    extra_metric_kwargs = {}
    if self.metric_params:
        if len(self.metric_params) != 1:
            raise ValueError(
                "Cuml only supports metrics with a single arg.")
        extra_metric_kwargs["metric_arg"] = next(
            iter(self.metric_params.values()))

    distances = cp.asarray(
        pairwise_distances(query,
                           self.X_,
                           metric=self.metric,
                           **extra_metric_kwargs))

    h = self.bandwidth
    if self.kernel not in log_probability_kernels_:
        raise ValueError("Unsupported kernel.")
    distances = log_probability_kernels_[self.kernel](distances, h)

    weights = self.sample_weight_
    if weights is not None:
        # Weights multiply in probability space, so they add in log space.
        distances += cp.log(weights)

    log_probabilities = cp.zeros(distances.shape[0])
    # Presumably reduces each row of `distances` with log-sum-exp into
    # `log_probabilities` - confirm against the kernel definition.
    logsumexp_kernel.forall(log_probabilities.size)(distances,
                                                    log_probabilities)

    # Note that sklearn's user guide is wrong.
    # It says the (unnormalised) probability output for
    # the kernel density is sum(K(x,h)).
    # In fact what they implement is (1/n)*sum(K(x,h)).
    # Here we divide by n in normal probability space,
    # which becomes -log(n) in log probability space.
    if weights is not None:
        sum_weights = cp.sum(weights)
    else:
        sum_weights = distances.shape[1]
    log_probabilities -= np.log(sum_weights)

    # norm: a one dimensional X has a single feature
    dimension = 1 if len(query.shape) == 1 else query.shape[1]
    log_probabilities = norm_log_probabilities(log_probabilities,
                                               self.kernel, h, dimension)
    return log_probabilities