示例#1
0
    def build(self, data, k, cp):
        """Build a NearPy LSH index over ``data`` and query it for the
        ``k`` nearest neighbours of every row.

        Parameters
        ----------
        data : np.ndarray, shape (n_items, vector_length)
            Vectors to index; each row is also used as a query.
        k : int
            Number of neighbours to return per item.
        cp : dict-like
            Configuration forwarded to ``init_method_param``.

        Returns
        -------
        neighbors : np.ndarray, shape (n_items, k), dtype int
            Neighbour ids for each row of ``data``.
        distances : np.ndarray, shape (n_items, k)
            Corresponding distances.
        """
        n_items, vector_length = data.shape

        # Hash configuration comes from the shared parameter helper.
        method_param = init_method_param("nearpy", data=data, cp=cp)
        hash_counts = method_param["hash_counts"]
        n_bits = method_param["n_bits"]

        self.filter = NearestFilter(10)

        # BUG FIX: the loop variable used to be named ``k``, shadowing the
        # ``k`` parameter; the code below then sized the result arrays with
        # ``hash_counts - 1`` instead of the requested neighbour count.
        hashes = []
        for h in range(hash_counts):
            nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
                'rbp_%d' % h, n_bits)
            hashes.append(nearpy_rbp)

        if self.metric == 'euclidean':
            dist = nearpy.distances.EuclideanDistance()
            self.index = nearpy.Engine(
                vector_length,
                lshashes=hashes,
                distance=dist,
                vector_filters=[self.filter])
        else:  # Default (angular) = Cosine distance
            self.index = nearpy.Engine(
                vector_length,
                lshashes=hashes,
                vector_filters=[self.filter])

        for i, x in enumerate(data):
            self.index.store_vector(x, i)

        # Limit every query to the requested neighbour count.
        self.filter.N = k

        neighbors = np.empty((n_items, k), dtype=int)
        distances = np.empty((n_items, k))

        for i in range(n_items):
            # Each hit from NearPy is a (vector, id, distance) triple.
            hits = self.index.neighbours(data[i])
            neighbors[i] = np.asarray([hit[1] for hit in hits])
            distances[i] = np.asarray([hit[2] for hit in hits])

        return neighbors, distances
示例#2
0
    def fit(self, X):
        """Index the rows of ``X`` in a NearPy LSH engine.

        Builds ``self._hash_counts`` random binary projection hashes of
        ``self._n_bits`` bits each, then stores every row of ``X`` under
        its row index.

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)
            Vectors to index.
        """
        import nearpy

        # BUG FIX: ``xrange`` is Python-2-only and raises NameError on
        # Python 3; ``range`` is semantically identical here.
        hashes = []
        for k in range(self._hash_counts):
            nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % k, self._n_bits)
            hashes.append(nearpy_rbp)

        if self._metric == 'euclidean':
            dist = nearpy.distances.EuclideanDistance()
            self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes, distance=dist)
        else: # Default (angular) = Cosine distance
            self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)

        if self._metric == 'angular':
            # Cosine distance on normalized vectors == angular distance.
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
        for i, x in enumerate(X):
            self._nearpy_engine.store_vector(x.tolist(), i)
    def fit(self, X):
        """Index the rows of ``X`` in a NearPy LSH engine.

        Uses ``self._hash_counts`` random binary projections of
        ``self._n_bits`` bits each; each row is stored under its index.

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)
            Vectors to index.
        """
        # BUG FIX: ``xrange`` is Python-2-only and raises NameError on
        # Python 3; ``range`` is semantically identical here.
        hashes = []
        for k in range(self._hash_counts):
            nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
                'rbp_%d' % k, self._n_bits)
            hashes.append(nearpy_rbp)

        self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)

        for i, x in enumerate(X):
            self._nearpy_engine.store_vector(x.tolist(), i)
 def __LSH__(self, global_features, query_global_desc, buckets=5, n_neighbors=3):
     """Approximate nearest-neighbour lookup for each query descriptor.

     Indexes ``global_features`` in a NearPy engine using a random binary
     projection of ``buckets`` bits. If a query yields too few hits, the
     search is retried with progressively coarser hashes (halving the
     projection bits, which enlarges the buckets and returns more
     candidates); coarser engines are built lazily and cached.

     Parameters
     ----------
     global_features : np.ndarray, shape (n_items, dim)
         Database vectors to index.
     query_global_desc : np.ndarray, shape (n_queries, dim)
         Query vectors.
     buckets : int, optional
         Initial number of projection bits.
     n_neighbors : int, optional
         Minimum hit count before the coarser-hash fallback stops.

     Returns
     -------
     np.ndarray
         One array of neighbour ids per query, in query order.
     """
     engines = {}
     engines[buckets] = nearpy.Engine(
         global_features.shape[1],
         lshashes=[nearpy.hashes.RandomBinaryProjections('rbp', buckets)])
     for i, v in enumerate(global_features):
         engines[buckets].store_vector(v, '%d'%i)

     indices = []
     for d in tqdm.tqdm(query_global_desc, total=query_global_desc.shape[0]):
         nbr = engines[buckets].neighbours(d)
         if len(nbr) <= n_neighbors:
             # Too few hits: fall back to coarser hashes until we have
             # enough candidates or run out of resolution.
             b = buckets
             while len(nbr) <= n_neighbors and b > 1:
                 b = b // 2
                 if b not in engines:
                     print('Create new engine with {:d} buckets'.format(b))
                     engines[b] = nearpy.Engine(
                         global_features.shape[1],
                         lshashes=[nearpy.hashes.RandomBinaryProjections('rbp', b)])
                     for i, v in enumerate(global_features):
                         engines[b].store_vector(v, '%d'%i)
                 nbr = engines[b].neighbours(d)
         # BUG FIX: the append used to sit inside the while-loop above,
         # adding one row per retry (or none at all when the loop body
         # never ran), which misaligned the result with the queries.
         # Exactly one row per query is appended here.
         indices.append(np.array([int(n[1]) for n in nbr]))
     return np.array(indices)
示例#5
0
    def fit(self, X):
        """Index the rows of ``X`` in a NearPy LSH engine.

        Parameters
        ----------
        X : np.ndarray, shape (n_samples, n_features)
            Vectors to index; each row is stored under its row index.
        """
        import nearpy, nearpy.hashes, nearpy.distances

        # TODO: doesn't seem like the NearPy code is using the metric??
        # BUG FIX: ``xrange`` is Python-2-only and raises NameError on
        # Python 3; ``range`` is semantically identical here.
        hashes = []
        for k in range(self._hash_counts):
            nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
                'rbp_%d' % k, self._n_bits)
            hashes.append(nearpy_rbp)

        self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)

        for i, x in enumerate(X):
            self._nearpy_engine.store_vector(x.tolist(), i)
def build_lsh_engine(orig, window_size, number_of_hashes, hash_dimensions):
    """Return a NearPy engine indexed with rolling-window vectors of ``orig``.

    Each row of the window matrix is the flattened concatenation of the
    vectors for ``window_size`` consecutive words, so successive rows
    overlap at all words except the first and last. Every window is
    stored in the engine under ``(index, window_text)`` so matches can be
    traced back to the original script.

    Note: we index the original-script windows (rather than the fan text)
    because match quality degrades when too many values are added to the
    engine's index; callers search fan-work windows against this engine.
    """
    base_vectors = mk_vectors(orig)
    n_windows = base_vectors.shape[0] - window_size + 1

    # One flattened row per rolling window.
    win_vectors = numpy.array([
        base_vectors[j:j + window_size, :].ravel()
        for j in range(n_windows)
    ])

    # Independent random binary projections, one per requested hash.
    projections = [
        nearpy.hashes.RandomBinaryProjections('rbp{}'.format(j),
                                              hash_dimensions)
        for j in range(number_of_hashes)
    ]

    engine = nearpy.Engine(win_vectors.shape[1],
                           lshashes=projections,
                           distance=nearpy.distances.CosineDistance())

    for ix, row in enumerate(win_vectors):
        engine.store_vector(row, (ix, str(orig[ix:ix + window_size])))
    return engine