Пример #1
0
    def build(self, data, k, cp):
        """Build an Annoy index over ``data`` and query it with every item.

        Args:
            data: 2-D array exposing ``.shape``; row ``i`` is stored in the
                index under id ``i``.
            k: Number of neighbours requested for each stored item.
            cp: Forwarded unchanged to ``init_method_param``.

        Returns:
            A ``(neighbors, distances)`` tuple of arrays, each with
            ``data.shape[0]`` rows and ``k`` columns. ``neighbors`` has an
            integer dtype.
        """
        n_items, vector_length = data.shape

        # Resolve the method parameters and keep them on the instance.
        self.method_param = init_method_param("annoy", data=data, cp=cp)
        tree_count = self.method_param["n_trees"]

        # Populate the forest: one entry per row, keyed by its row number.
        forest = annoy.AnnoyIndex(vector_length, metric=self.metric)
        self.index = forest
        for item_id in range(n_items):
            forest.add_item(item_id, data[item_id])
        forest.build(tree_count)

        # Query phase: look up each stored item by id and collect the results
        # into one row per item.
        neighbors = np.empty((n_items, k), dtype=int)
        distances = np.empty((n_items, k))
        for item_id in range(n_items):
            # search_k is the speed/accuracy knob; -1 is passed through as-is
            # (presumably the library default -- see the Annoy docs).
            hits = self.index.get_nns_by_item(
                item_id, n=k, search_k=-1, include_distances=True)
            ids_row, dists_row = np.asarray(hits)
            neighbors[item_id] = ids_row
            distances[item_id] = dists_row
        return neighbors, distances
Пример #2
0
    def build(self, data, k, cp):
        """Build an nmslib "napp" index over ``data`` and batch-query it.

        Args:
            data: 2-D array exposing ``.shape``; it is both indexed and used
                as the query batch.
            k: Number of neighbours requested for each row.
            cp: Forwarded unchanged to ``init_method_param``.

        Returns:
            A ``(neighbors, distances)`` tuple of arrays, each with
            ``data.shape[0]`` rows and ``k`` columns. ``neighbors`` has an
            integer dtype.

        Raises:
            KeyError: If ``self.metric`` is neither ``'angular'`` nor
                ``'euclidean'``.
        """
        # The unpacked values are not used; the unpack itself fails unless
        # ``data.shape`` has exactly two entries.
        _row_count, _dim = data.shape

        self._method_name = "napp"
        params = init_method_param(self._method_name, data=data, cp=cp)
        self._index_param = params["index_param"]
        self._index_param["indexThreadQty"] = self.n_jobs

        # Translate this class's metric name into the nmslib space name.
        space_by_metric = {'angular': 'cosinesimil', 'euclidean': 'l2'}
        self._metric = space_by_metric[self.metric]

        self.index = nmslib.init(
            space=self._metric,
            method=self._method_name,
            data_type=nmslib.DataType.DENSE_VECTOR,
            dtype=nmslib.DistType.FLOAT,
        )
        self.index.addDataPointBatch(data)
        self.index.createIndex(self._index_param)
        # Called without arguments: no query-time parameters are supplied.
        self.index.setQueryTimeParams()

        # Query phase: each batch entry is indexed as (ids, distances).
        result = np.asarray(self.index.knnQueryBatch(data, k))
        query_count = data.shape[0]
        neighbors = np.empty((query_count, k), dtype=int)
        distances = np.empty((query_count, k))
        for row in range(len(data)):
            entry = result[row]
            neighbors[row] = entry[0]
            distances[row] = entry[1]
        return neighbors, distances
Пример #3
0
    def build(self, data, k, cp):
        """Build a NearPy LSH engine over ``data`` and query it with each row.

        Args:
            data: 2-D array exposing ``.shape``; row ``i`` is stored in the
                engine with ``i`` as its attached payload.
            k: Number of neighbours requested for each row.
            cp: Forwarded unchanged to ``init_method_param``.

        Returns:
            A ``(neighbors, distances)`` tuple of arrays, each with
            ``data.shape[0]`` rows and ``k`` columns. ``neighbors`` has an
            integer dtype.
        """
        _, vector_length = data.shape

        method_param = init_method_param("nearpy", data=data, cp=cp)
        hash_counts = method_param["hash_counts"]
        n_bits = method_param["n_bits"]

        self.filter = NearestFilter(10)

        # The loop variable must not be called ``k``: the original code did
        # that and clobbered the requested neighbour count, so the filter size
        # and the output width ended up as ``hash_counts - 1``.
        hashes = [
            nearpy.hashes.RandomBinaryProjections('rbp_%d' % hash_idx, n_bits)
            for hash_idx in range(hash_counts)
        ]

        engine_kwargs = {"lshashes": hashes, "vector_filters": [self.filter]}
        if self.metric == 'euclidean':
            engine_kwargs["distance"] = nearpy.distances.EuclideanDistance()
        # Otherwise the engine's default distance is used (cosine, according
        # to the original author's note for the angular metric).
        self.index = nearpy.Engine(vector_length, **engine_kwargs)

        for i, x in enumerate(data):
            self.index.store_vector(x, i)

        # Query phase: cap the filter at the requested neighbour count.
        self.filter.N = k

        neighbors = np.empty((data.shape[0], k), dtype=int)
        distances = np.empty((data.shape[0], k))

        # NOTE(review): the row assignments assume each query yields exactly
        # ``k`` hits; a different count raises (or broadcasts, for one hit).
        for i, query in enumerate(data):
            hits = self.index.neighbours(query)
            # Each hit is indexed as (..., stored payload, distance).
            neighbors[i] = np.asarray([hit[1] for hit in hits])
            distances[i] = np.asarray([hit[2] for hit in hits])

        return neighbors, distances