def build_ann_index(self, fingerprints, nthreads=1, overWrite=False, *,
                    index_dir="../processed_data"):
    """Build an n2 HNSW index over ``fingerprints`` and save it to disk.

    WARNING: set ``nthreads`` correctly! It defaults to 1 so you don't run
    out of memory.

    The saved approximate-nearest-neighbours index is meant to be used for
    building a kNN graph. Author's rationale for n2: it is fast, allows
    streaming upload, and does well on ann_benchmarks. It does not offer
    dice, jaccard, or tanimoto; in practice cosine ("angular") works fine.

    Args:
        fingerprints: Either a ``numpy.ndarray`` or an object exposing
            ``toarray()`` (presumably a scipy sparse matrix -- confirm with
            callers). Non-ndarray input is densified before indexing.
        nthreads: Number of threads handed to ``index.build``.
        overWrite: When false, an existing index file is never replaced.
        index_dir: Directory holding ``<fingerprint_kind>_n2_index.hnsw``.
            The default keeps the historical ``../processed_data`` location,
            which is relative to the current working directory.

    Raises:
        FileExistsError: The index file already exists and ``overWrite`` is
            false. (Subclass of ``Exception``, so existing handlers that
            catch ``Exception`` keep working.)
    """
    # Single source of truth for the path: the existence check and the final
    # save can no longer drift apart.
    index_file = Path(index_dir) / (self.fingerprint_kind + '_n2_index.hnsw')
    if index_file.is_file() and not overWrite:
        raise FileExistsError(
            'Index file exists already. Set `overWrite` to true to re-write it'
        )

    if not isinstance(fingerprints, np.ndarray):
        if self.verbose:
            print('converting to numpy')
        fingerprints = fingerprints.toarray()

    if self.verbose:
        print('adding vector data to n2')
    index = n2.HnswIndex(self.dimension, "angular")
    for fp in tqdm.tqdm(fingerprints, smoothing=0):
        index.add_data(fp)

    if self.verbose:
        print(f'building index with {nthreads}')
    index.build(n_threads=nthreads)
    index.save(str(index_file))
def fit(self, X):
    """Index every row of ``X`` in a fresh n2 HNSW index and build it.

    The index is created with ``X.shape[1]`` as its dimension and
    ``self._metric`` as its metric, and is stored on ``self._n2``. Build
    parameters come from ``self._m``, ``self._m0``,
    ``self._ef_construction`` and ``self._n_threads``; graph merging is
    fixed to ``'merge_level0'``.
    """
    n_dims = X.shape[1]
    self._n2 = n2.HnswIndex(n_dims, self._metric)

    hnsw = self._n2
    for row in X:
        hnsw.add_data(row)

    build_options = {
        'm': self._m,
        'max_m0': self._m0,
        'ef_construction': self._ef_construction,
        'n_threads': self._n_threads,
        'graph_merging': 'merge_level0',
    }
    hnsw.build(**build_options)
def build_ann_index(self, nthreads=1):
    """Build an n2 HNSW index from ``self.fingerprints`` and save it.

    WARNING: pick ``nthreads`` with care -- it defaults to 1 so you don't
    run out of memory.

    The resulting approximate-nearest-neighbours index is written to
    ``../processed_data/n2_index.hnsw`` (relative to the current working
    directory) and is intended for building a kNN graph. Author's
    rationale for n2: it is fast, supports streaming upload and scores well
    on ann_benchmarks. It lacks dice, jaccard and tanimoto metrics; in
    practice cosine ("angular") works fine.

    Args:
        nthreads: Number of threads handed to ``index.build``.
    """
    chatty = self.verbose

    if chatty:
        print('adding vector data to n2')
    ann_index = n2.HnswIndex(self.fpsize, "angular")
    for vector in self.fingerprints:
        ann_index.add_data(vector)

    if chatty:
        print(f'building index with {nthreads}')
    ann_index.build(n_threads=nthreads)

    ann_index.save('../processed_data/n2_index.hnsw')
def build_knn_graph(self, k):
    """Build a sparse kNN graph from the saved n2 index into ``self.adj``.

    The index is loaded from ``../processed_data/n2_index.hnsw``. For every
    id in ``range(self.num_ligs)`` the index is queried for ``k`` results;
    the first result is dropped (presumably the query item itself --
    confirm) and each remaining neighbour is stored with weight
    ``1 - distance``. The result is a ``(num_ligs, num_ligs)`` CSR matrix
    of dtype ``float16``.

    Author's note: in most nearest-neighbour settings going above k=25
    doesn't really add any benefit.

    Args:
        k: Number of results requested per query (one of which is dropped).

    Raises:
        AssertionError: If the spot check of 50 random rows finds a
            mismatch between the matrix and a fresh index query.
    """
    if self.verbose:
        print(f'constructing kNN graph with k={k}')

    ann = n2.HnswIndex(self.fpsize, "angular")
    ann.load('../processed_data/n2_index.hnsw')

    # CSR triplet built row by row: values, column ids, and row offsets.
    similarities = []
    columns = []
    row_starts = [0]
    for row in tqdm(range(self.num_ligs)):
        # Third positional argument (100) is forwarded unchanged; in n2's
        # API this looks like ef_search -- TODO confirm.
        hits = ann.search_by_id(row, k, 100, include_distances=True)[1:]
        for neighbor, dist in hits:
            similarities.append(1 - dist)
            columns.append(neighbor)
        # Offset of the next row equals the number of entries so far.
        row_starts.append(len(columns))

    self.adj = sparse.csr_matrix(
        (similarities, columns, row_starts),
        shape=(self.num_ligs, self.num_ligs),
        dtype=np.float16,
    )

    # Spot-check that the adjacency values landed in the right place.
    for _ in range(50):
        sample = np.random.choice(self.num_ligs)
        stored_row = self.adj[sample]
        stored_columns = stored_row.indices
        stored_distances = 1 - stored_row.data

        expected = ann.search_by_id(sample, k, 100, include_distances=True)[1:]
        expected_columns = [hit[0] for hit in expected]
        expected_distances = [hit[1] for hit in expected]

        # Loose tolerance because the matrix stores np.float16 values.
        assert np.allclose(expected_distances, stored_distances, atol=1e-3)
        assert np.allclose(stored_columns, expected_columns)
def fit(self, X):
    """Index every row of ``X`` in a fresh n2 HNSW index and build it.

    The index is created with ``X.shape[1]`` as its dimension and
    ``self._metric`` as its metric, and is stored on ``self._n2``. Each row
    is converted with ``tolist()`` before being added.

    The build step passes ``self._max_m0`` explicitly as ``max_m0``.
    Previously it was passed positionally, and the first positional
    parameter of ``HnswIndex.build`` is ``m``, so the value was silently
    applied to the wrong setting.
    """
    self._n2 = n2.HnswIndex(X.shape[1], self._metric)
    for x in X:
        self._n2.add_data(x.tolist())
    # Keyword argument on purpose: build()'s first positional slot is `m`.
    self._n2.build(max_m0=self._max_m0)