예제 #1
0
    def build_ann_index(self, fingerprints, nthreads=1, overWrite=False):
        """Build an n2 HNSW index over ``fingerprints`` and save it to disk.

        WARNING: set threads correctly! The default is 1 so you don't run out
        of memory.

        This builds an approximate nearest neighbors index, used to build a
        kNN graph. Author's rationale: n2 is fast, allows streaming upload and
        does well in ann_benchmarks. It does not, however, offer dice, jaccard,
        or tanimoto; in practice cosine ("angular") works fine.

        The index is written to
        ``../processed_data/<self.fingerprint_kind>_n2_index.hnsw`` (relative
        to the current working directory).

        Args:
            fingerprints: an ``np.ndarray`` that is iterated row by row, or any
                object exposing ``toarray()`` (presumably a scipy sparse
                matrix -- confirm against callers), which is densified first.
            nthreads: forwarded to ``index.build`` as ``n_threads``.
            overWrite: when falsy, refuse to replace an existing index file.

        Raises:
            FileExistsError: if the index file already exists and
                ``overWrite`` is falsy. This is a subclass of ``Exception``,
                so callers catching ``Exception`` keep working.
        """

        index_file = Path("../processed_data/" + self.fingerprint_kind +
                          '_n2_index.hnsw')

        # Fail before doing any expensive work if the target is already there.
        if index_file.is_file() and not overWrite:
            raise FileExistsError(
                'Index file exists already. Set `overWrite` to true to re-write it'
            )

        if not isinstance(fingerprints, np.ndarray):
            if self.verbose:
                print('converting to numpy')
            fingerprints = fingerprints.toarray()

        if self.verbose:
            print('adding vector data to n2')

        index = n2.HnswIndex(self.dimension, "angular")
        for fp in tqdm.tqdm(fingerprints, smoothing=0):
            index.add_data(fp)

        if self.verbose:
            print(f'building index with {nthreads}')

        index.build(n_threads=nthreads)
        # Save to the very path that was checked above, so the existence guard
        # and the write target cannot drift apart.
        index.save(str(index_file))
예제 #2
0
 def fit(self, X):
     """Create ``self._n2`` from the rows of ``X`` and build the HNSW graph.

     The index dimension is taken from ``X.shape[1]`` and the metric from
     ``self._metric``. Build parameters come from ``self._m``, ``self._m0``,
     ``self._ef_construction`` and ``self._n_threads``; graph merging is
     fixed to ``'merge_level0'``.
     """
     n_features = X.shape[1]
     hnsw = n2.HnswIndex(n_features, self._metric)
     self._n2 = hnsw

     # Feed the vectors one at a time.
     for row in X:
         hnsw.add_data(row)

     build_options = {
         'm': self._m,
         'max_m0': self._m0,
         'ef_construction': self._ef_construction,
         'n_threads': self._n_threads,
         'graph_merging': 'merge_level0',
     }
     hnsw.build(**build_options)
예제 #3
0
    def build_ann_index(self, nthreads=1):
        """Index ``self.fingerprints`` with n2 and save the result to disk.

        WARNING: choose ``nthreads`` with care! It defaults to 1 so the build
        does not run out of memory.

        An ``"angular"`` HNSW index of dimension ``self.fpsize`` is filled one
        fingerprint at a time, built, and written to
        ``../processed_data/n2_index.hnsw``. The resulting approximate nearest
        neighbors index is what the kNN graph is later built from.

        Author's rationale: n2 is fast, allows streaming upload and does well
        in ann_benchmarks. It offers no dice, jaccard or tanimoto metric, but
        in practice cosine works fine.
        """
        if self.verbose:
            print('adding vector data to n2')

        hnsw = n2.HnswIndex(self.fpsize, "angular")
        for fingerprint in self.fingerprints:
            hnsw.add_data(fingerprint)

        if self.verbose:
            print(f'building index with {nthreads}')

        hnsw.build(n_threads=nthreads)

        target = '../processed_data/n2_index.hnsw'
        hnsw.save(target)
예제 #4
0
    def build_knn_graph(self, k, ef_search=100, n_checks=50):
        """Build a sparse kNN similarity graph from the saved n2 index.

        Loads the HNSW index from ``../processed_data/n2_index.hnsw``, queries
        it once per ligand id in ``range(self.num_ligs)`` and stores
        ``1 - distance`` for every kept neighbour in ``self.adj``, a CSR matrix
        of shape ``(self.num_ligs, self.num_ligs)`` with dtype ``np.float16``.

        The first hit of every query is discarded (presumably the query item
        itself -- confirm against the n2 documentation), so each row holds at
        most ``k - 1`` entries. In practice, in most nearest neighbor settings
        going above k=25 doesn't really add any benefit.

        Args:
            k: number of results requested from the index per query.
            ef_search: third positional argument handed to
                ``index.search_by_id``; the default keeps the previously
                hard-coded value of 100. The same value is used for building
                and for the spot check so the two queries are comparable.
            n_checks: number of randomly chosen rows that are re-queried to
                verify the matrix layout; the default keeps the previously
                hard-coded value of 50.

        Raises:
            AssertionError: if a spot-checked row of ``self.adj`` disagrees
                with a fresh query against the index.
        """

        if self.verbose:
            print(f'constructing kNN graph with k={k}')

        index = n2.HnswIndex(self.fpsize, "angular")
        index.load('../processed_data/n2_index.hnsw')

        # CSR triplet: values, column ids, and row start offsets.
        data = []
        indices = []
        indptr = [0]

        for i in tqdm(range(self.num_ligs)):
            neighbors = index.search_by_id(i, k, ef_search, include_distances=True)[1:]
            for nidx, distance in neighbors:
                data.append(1 - distance)
                indices.append(nidx)
            # Each row ends where the running number of stored entries stands.
            indptr.append(len(indices))

        self.adj = sparse.csr_matrix((data, indices, indptr), shape=(self.num_ligs, self.num_ligs), dtype=np.float16)

        # Do a check that the values in the adjacency matrix are in the right place:
        for _ in range(n_checks):
            idx = np.random.choice(self.num_ligs)
            row = self.adj[idx]  # slice the row once and reuse it
            adjacency_indices = row.indices
            adjacency_distances = 1 - row.data
            query = index.search_by_id(idx, k, ef_search, include_distances=True)[1:]
            index_indices = [hit[0] for hit in query]
            index_distances = [hit[1] for hit in query]
            # High tolerance because of the np.float16 conversion.
            assert np.allclose(index_distances, adjacency_distances, atol=1e-3)
            assert np.allclose(adjacency_indices, index_indices)
예제 #5
0
 def fit(self, X):
     """Create ``self._n2`` from the rows of ``X`` and build the index.

     The index dimension is ``X.shape[1]`` and the metric is
     ``self._metric``. Each row is converted with ``tolist()`` before being
     added. ``self._max_m0`` is passed to ``build`` as its only positional
     argument.

     NOTE(review): confirm that the first positional parameter of ``build``
     in the installed n2 version is the one ``_max_m0`` is meant for.
     """
     n_features = X.shape[1]
     hnsw = n2.HnswIndex(n_features, self._metric)
     self._n2 = hnsw

     for row in X:
         hnsw.add_data(row.tolist())

     hnsw.build(self._max_m0)