def test_parallel(): np.random.seed(42) X = np.random.random((10**3, 10)) distances = ['euclidean', 'cosine', 'correlation', 'inner_product'] n_threads = [1, 2] for distance in distances: print("Distance:", distance) times = {} for n_th in n_threads: vis = ncvis.NCVis(n_neighbors=15, M=16, ef_construction=200, random_seed=42, n_init_epochs=20, n_epochs=50, min_dist=0.4, n_threads=n_th, distance=distance) start = time.time() Y = vis.fit_transform(X) stop = time.time() times[n_th] = stop - start print("n_threads = {}, time = {:.2f}s".format(n_th, times[n_th])) if n_th > 1: eff = times[1] / (times[n_th] * n_th) assert eff > 0.3, "Parallelization efficiency is too low"
def test_1d_clustering(): np.random.seed(42) n = 50 X = np.concatenate( (np.random.normal(-1, 1.5, (n, 1)), np.random.normal(1, 1.5, (n, 1)))) vis = ncvis.NCVis(n_neighbors=15, M=16, ef_construction=200, d=1, n_init_epochs=20, n_epochs=50, min_dist=0.4, n_threads=-1, distance="euclidean", random_seed=42) Y = vis.fit_transform(X).ravel() n_pos = np.count_nonzero(Y - Y.mean() > 0) assert np.abs(n_pos - n) < 5, "Clustering quality is too poor"
def test_distances(): np.random.seed(42) X = np.random.random((5, 3)) distances = ['euclidean', 'cosine', 'correlation', 'inner_product'] for distance in distances: vis = ncvis.NCVis(n_neighbors=15, M=16, ef_construction=200, random_seed=42, n_init_epochs=20, n_epochs=50, min_dist=0.4, n_threads=-1, distance=distance) Y = vis.fit_transform(X) all_finite = np.all(np.isfinite(Y)) print("Distance:", distance) print("Input:") print(X) print("Output:") print(Y) assert all_finite, "All entries must be finite"
def __init__(self, d: int = 2, random_state: int = 0, **kwargs): import ncvis super().__init__(d, random_state) self._main = ncvis.NCVis(d=d, random_seed=random_state, **kwargs)
def NCVis( data, n_components=2, n_jobs=-1, n_neighbors=15, distance="cosine", M=15, efC=30, random_seed=42, n_epochs=50, n_init_epochs=20, spread=1.0, min_dist=0.4, alpha=1.0, a=None, b=None, alpha_Q=1., n_noise=None, ): """ Runs Noise Contrastive Visualization ((NCVis)[https://dl.acm.org/doi/abs/10.1145/3366423.3380061]) for dimensionality reduction and graph layout . Parameters ---------- n_components : int Desired dimensionality of the embedding. n_jobs : int The maximum number of threads to use. In case n_threads < 1, it defaults to the number of available CPUs. n_neighbors : int Number of nearest neighbours in the high dimensional space to consider. M : int The number of bi-directional links created for every new element during construction of HNSW. See https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md efC : int The size of the dynamic list for the nearest neighbors (used during the search) in HNSW. See https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md random_seed : int Random seed to initialize the generators. Notice, however, that the result may still depend on the number of threads. n_epochs : int The total number of epochs to run. During one epoch the positions of each nearest neighbors pair are updated. n_init_epochs : int The number of epochs used for initialization. During one epoch the positions of each nearest neighbors pair are updated. spread : float The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are. See https://github.com/lmcinnes/umap/blob/834184f9c0455f26db13ab148c0abd2d3767d968/umap/umap_.py#L1143 min_dist : float The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. See https://github.com/lmcinnes/umap/blob/834184f9c0455f26db13ab148c0abd2d3767d968/umap/umap_.py#L1135 a : (optional, default None) More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``. See https://github.com/lmcinnes/umap/blob/834184f9c0455f26db13ab148c0abd2d3767d968/umap/umap_.py#L1179 b : (optional, default None) More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``. See https://github.com/lmcinnes/umap/blob/834184f9c0455f26db13ab148c0abd2d3767d968/umap/umap_.py#L1183 alpha : float Learning rate for the embedding positions. alpha_Q : float Learning rate for the normalization constant. n_noise : int or ndarray of ints Number of noise samples to use per data sample. If ndarray is provided, n_epochs is set to its length. If n_noise is None, it is set to dynamic sampling with noise level gradually increasing from 0 to fixed value. distance : str {'euclidean', 'cosine', 'correlation', 'inner_product'} Distance to use for nearest neighbors search. """ try: import ncvis except ImportError( 'NCVis is needed for this embedding. Install it with `pip install ncvis`' ): return print( 'NCVis is needed for this embedding. Install it with `pip install ncvis`' ) ncvis_emb = ncvis.NCVis(d=n_components, n_threads=n_jobs, n_neighbors=n_neighbors, M=M, ef_construction=efC, random_seed=random_seed, n_epochs=n_epochs, n_init_epochs=n_init_epochs, spread=spread, min_dist=min_dist, a=a, b=b, alpha=alpha, alpha_Q=alpha_Q, n_noise=n_noise, distance=distance).fit_transform(data) return ncvis_emb
import ncvis vis = ncvis.NCVis(n_neighbors=15, M=16, ef_construction=200, n_init_epochs=20, n_epochs=50, min_dist=0.4, n_threads=-1, distance='euclidean')