Example #1
  def init_lsh(self):
    """
    Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.
    """
    # Contains the object that can be queried to find nearest neighbors at each layer.
    self.query_objects = {}
    # mean of the training data representation per layer (subtracted before LSH).
    self.centers = {}
    for layer in self.layers:
      assert self.nb_tables >= self.neighbors

      # Normalize all the lengths, since we care about cosine similarity.
      self.train_activations_lsh[layer] /= np.linalg.norm(
          self.train_activations_lsh[layer], axis=1).reshape(-1, 1)

      # Center the dataset and the queries: this improves the performance of LSH quite a bit.
      center = np.mean(self.train_activations_lsh[layer], axis=0)
      self.train_activations_lsh[layer] -= center
      self.centers[layer] = center

      # LSH parameters
      params_cp = falconn.LSHConstructionParameters()
      params_cp.dimension = len(self.train_activations_lsh[layer][1])
      params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
      params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
      params_cp.l = self.nb_tables
      params_cp.num_rotations = 2  # for dense data set it to 1; for sparse data set it to 2
      params_cp.seed = 5721840
      # we want to use all the available threads to set up
      params_cp.num_setup_threads = 0
      params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

      # we build number_bits-bit hashes so that each table has
      # 2^number_bits bins; a rule of thumb is to have the number of bins
      # be of the same order of magnitude as the number of data points
      falconn.compute_number_of_hash_functions(self.number_bits, params_cp)

      print('Constructing the LSH table')
      table = falconn.LSHIndex(params_cp)
      table.setup(self.train_activations_lsh[layer])

      # Parse test feature vectors and find k nearest neighbors
      query_object = table.construct_query_object()
      query_object.set_num_probes(self.nb_tables)
      self.query_objects[layer] = query_object
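A minimal usage sketch (not part of the source class) of how the per-layer query objects built above might be queried; find_train_knns, query_activation, and k are illustrative names, and the query is preprocessed the same way as the training activations:

  def find_train_knns(self, query_activation, layer, k):
    # Illustrative sketch: mirror the preprocessing applied to the training
    # activations (unit-normalize, then subtract the stored per-layer center).
    query = query_activation / np.linalg.norm(query_activation)
    query = query - self.centers[layer]
    # The query dtype must match the dtype of the indexed activations.
    return self.query_objects[layer].find_k_nearest_neighbors(query, k)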
Example #2
def init_hash():
    # load the array
    train=np.array(load_all_beOne(path))
    # number of training points
    trainNum=len(train)
    # get default LSH parameters
    p=falconn.get_default_parameters(trainNum, dim)
    t=falconn.LSHIndex(p)
    dataset=[np.ravel(x[0]).astype(np.float32) for x in train]
    dataset=np.array(dataset)
    # build the hash tables
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q=t.construct_query_pool()
    else:
        q=t.construct_query_object()
    return (q, train)
Example #3
    def __init__(self, feature_file, label_file, id_feature_file,
                 id_label_file):
        self.idfeature = np.load(id_feature_file)
        self.idlabel = np.load(id_label_file)

        self.label = np.load(label_file)
        print "start load feature data"
        t1 = time.time()
        feature = np.load(feature_file)
        t2 = time.time()
        print("load cost time:%f" % (t2 - t1))
        dp = fc.get_default_parameters(feature.shape[0], feature.shape[1],
                                       fc.DistanceFunction.EuclideanSquared)
        ds = fc.LSHIndex(dp)
        train_st = time.time()
        ds.setup(feature)
        train_et = time.time()
        print("train cost time:%f" % (train_et - train_st))
        self.qo = ds.construct_query_object()
Example #4
    def fit(self, X: np.ndarray, y: np.ndarray = None):
        """ Setup the LSH index from training data.

        Parameters
        ----------
        X: np.array
            Data to be indexed
        y: any
            Ignored

        Returns
        -------
        self: FalconnLSH
            An instance of LSH with a built index
        """
        X = check_array(X, dtype=[np.float32, np.float64])

        if self.metric in ['euclidean', 'l2', 'minkowski']:
            self.metric = 'euclidean'
            distance = falconn.DistanceFunction.EuclideanSquared
        elif self.metric in ['squared_euclidean', 'sqeuclidean']:
            self.metric = 'sqeuclidean'
            distance = falconn.DistanceFunction.EuclideanSquared
        elif self.metric in ['cosine', 'NegativeInnerProduct', 'neg_inner']:
            self.metric = 'cosine'
            distance = falconn.DistanceFunction.NegativeInnerProduct
        else:
            warnings.warn(
                f'Invalid metric "{self.metric}". Using "euclidean" instead')
            self.metric = 'euclidean'
            distance = falconn.DistanceFunction.EuclideanSquared

        # Set up the LSH index
        lsh_construction_params = falconn.get_default_parameters(
            *X.shape, distance=distance)
        lsh_index = falconn.LSHIndex(lsh_construction_params)
        lsh_index.setup(X)

        self.X_train_ = X
        self.y_train_ = y
        self.index_ = lsh_index

        return self
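A brief, hypothetical follow-up (not shown in the source) of how the fitted index might be queried with FALCONN's standard query-object API; model here stands for a fitted FalconnLSH instance:

# Hypothetical usage sketch: `model` is a fitted FalconnLSH instance (see fit above).
query_object = model.index_.construct_query_object()
# Ten nearest neighbors of the first indexed point; the query dtype must
# match the dtype of the data passed to fit().
neighbors = query_object.find_k_nearest_neighbors(model.X_train_[0], 10)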
Example #5
 def load_identifier(self,labelFile,featuresFile):
     self.label = np.load( labelFile)
     print "start load feature data"
     print(labelFile)
     t1 = time.time()
     self.feature = np.load(featuresFile)
     self.embs = self.feature
     print ("feature dtype:%d", self.feature.dtype)
     t2 = time.time()
     print ("load cost time:%f" % (t2 - t1))
     self.dp = fc.get_default_parameters(self.feature.shape[0], self.feature.shape[1],
                                         fc.DistanceFunction.EuclideanSquared)
     self.dp.l = 30
     self.ds = fc.LSHIndex(self.dp)
     train_st = time.time()
     self.ds.setup(self.feature)
     train_et = time.time()
     print ("train cost time:%f" % (train_et - train_st))
     self.qo = self.ds.construct_query_object()
Example #6
def setup_lsh(X, num_probes=100):
    assert X.ndim == 2
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = X.shape[1]
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 100
    params_cp.num_rotations = 1
    params_cp.seed = 1234
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(16, params_cp)

    table = falconn.LSHIndex(params_cp)
    table.setup(X)
    query_object = table.construct_query_object()
    query_object.set_num_probes(num_probes)

    return query_object
Example #7
def _create_bucket(segments):
    """ Creates a bucket of segments
    to use for LSH similarity lookup
    """
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(segments[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 25
    params_cp.num_rotations = 2
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, params_cp)

    table = falconn.LSHIndex(params_cp)
    table.setup(segments)

    return (segments, table)
Example #8
def test_lsh_index_positive():
    n = 1000
    d = 128
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    dataset = np.random.randn(n, d).astype(np.float32)
    t.fit(dataset)
    u = np.random.randn(d).astype(np.float32)
    t.find_k_nearest_neighbors(u, 10)
    t.find_near_neighbors(u, 10.0)
    t.find_nearest_neighbor(u)
    t.get_candidates_with_duplicates(u)
    t.get_max_num_candidates()
    t.get_num_probes()
    t.get_query_statistics()
    t.get_unique_candidates(u)
    t.get_unique_sorted_candidates(u)
    t.reset_query_statistics()
    t.set_max_num_candidates(100)
    t.set_num_probes(10)
Example #9
def lsh_sieve(full_deltas, d, n):

    deltas = np.reshape(full_deltas, (n, d))
    centred_deltas = (deltas - np.mean(deltas, axis=0))

    params = falconn.get_default_parameters(n, d)
    fln = falconn.LSHIndex(params)
    fln.setup(centred_deltas)
    qob = fln.construct_query_object()

    # Greedy merge within a distance
    # all_sets = list()

    full_grad = np.zeros(d)

    for i in range(n):
        neighbors = qob.find_near_neighbors(centred_deltas[i], 1.0 / d)
        # print str(i) + " has " + str(neighbors)
        full_grad = full_grad + (deltas[i] / len(neighbors))

    return full_grad
Example #10
 def fit(self, X):
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     if self._metric == 'angular':
         X /= numpy.linalg.norm(X, axis=1).reshape(-1,  1)
     self._center = numpy.mean(X, axis=0)
     X -= self._center
     import falconn
     self._params = falconn.LSHConstructionParameters()
     self._params.dimension = X.shape[1]
     self._params.distance_function = 'euclidean_squared'
     self._params.lsh_family = 'cross_polytope'
     falconn.compute_number_of_hash_functions(self._num_bits, self._params)
     self._params.l = self._num_tables
     self._params.num_rotations = 1
     self._params.num_setup_threads = 0
     self._params.storage_hash_table = 'flat_hash_table'
     self._params.seed = 95225714
     self._index = falconn.LSHIndex(self._params)
     self._index.setup(X)
     self._index.set_num_probes(self._num_probes)
     self._buf = numpy.zeros((X.shape[1],), dtype=numpy.float32)
Example #11
    def setup_second_layer(self, number_of_tables=50):
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = self.X.shape[1] + 1
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(15, params_cp)

        print('Constructing the LSH table')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        self.X_ = self.X_.astype('float')
        table.setup(self.X_)
        t2 = timeit.default_timer()
        print('Done')
        print('Construction time: {}'.format(t2 - t1))

        self.query_object = table.construct_query_object()
Example #12
 def init_falconn():
     dim = 2048
     # load the arrays
     my_feature = np.load(
         os.path.join(model_path, 'tensorflow-feature.npy'))
     print(my_feature.shape)
     my_class_name = np.load(
         os.path.join(model_path, 'tensorflow-class_name.npy'))
     print(my_class_name.shape)
     my_file_path = np.load(
         os.path.join(model_path, 'tensorflow-file_path.npy'))
     print(my_file_path.shape)
     # number of training points
     trainNum = len(my_feature)
     # get default LSH parameters
     p = falconn.get_default_parameters(trainNum, dim)
     t = falconn.LSHIndex(p)
     dataset = my_feature
     # build the hash tables
     t.setup(dataset)
     q = t.construct_query_pool()
     return my_feature, my_class_name, my_file_path, q
Example #13
def setup_lsh():

    # extract the signature matrix from database

    con = psycopg2.connect("dbname=yinhan user=yinhan")
    cur = con.cursor()
    cur.execute("SELECT SIGNATURE FROM AKAFINGER")
    lst = cur.fetchall()
    con.commit()
    con.close()

    data = np.array([val[0] for val in lst])
    center = np.mean(data, axis=0)
    data = data - center
    # use the center of the database to center the snippet;
    # this reportedly improves the model performance
    params_cp = falconn.get_default_parameters(num_points=data.shape[0],
                                               dimension=data.shape[1])
    table = falconn.LSHIndex(params_cp)
    table.setup(data)

    return center, table.construct_query_object()
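A short usage sketch (assumed, not from the source): the returned center has to be subtracted from a query signature before it is passed to the query object, matching the centering applied to the indexed data.

# Usage sketch; `snippet_signature` is an illustrative 1-D signature vector.
center, query_object = setup_lsh()
neighbors = query_object.find_k_nearest_neighbors(snippet_signature - center, 5)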
Example #14
def init_hash():
    global my_arr, my_id, big_class
    # load the arrays
    my_arr = np.load(os.path.join(path, 'array.npy'))
    my_id = np.load(os.path.join(path, 'id.npy'))
    f = open(os.path.join(path, 'big_class.txt'), 'r')
    a = f.read()
    big_class = eval(a)
    f.close()
    # number of training points
    trainNum = len(my_arr)
    # get default LSH parameters
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = my_arr
    # build the hash tables
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q = t.construct_query_pool()
    else:
        q = t.construct_query_object()
    return q
Example #15
    def __falconn_fit(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.
        """

        import falconn

        dimension = self.features.shape[1]
        nb_tables = self.kwargs['nb_tables']
        number_bits = self.kwargs['number_bits']

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense data set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # add features to falconn table
        self._falconn_table.setup(self.features)
Example #16
    def build_lsh(self, all_signatures):
        """
		take signatures of songs to build a LSH table, and the query object

		params:
			all_signatures: all signatures from the database
		
		returns:
			a falconn hash table;
			a pointer pointing to the falconn hash table
			None if not successful

		"""

        if all_signatures.shape[0] == 0:
            raise ValueError("All signatures must not be empty.")

        params = falconn.get_default_parameters(all_signatures.shape[0],
                                                all_signatures.shape[1])

        # center the dataset to improve performance:
        all_signatures -= np.mean(all_signatures, axis=0)

        # Create the LSH table
        print('Constructing the LSH table...')
        table = falconn.LSHIndex(params)
        table.setup(all_signatures)

        print('Constructing the queries...')
        query_object = table.construct_query_object()

        self.table = table
        self.query_object = query_object

        if not table or not query_object:
            return None
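A hedged querying sketch (not part of the class above): because build_lsh subtracts the mean from all_signatures in place without storing it, a caller would need to keep that mean around to center queries consistently; matcher and query_signature are illustrative names.

# Illustrative sketch: remember the dataset mean so queries can be centered
# the same way build_lsh centers the indexed signatures.
center = np.mean(all_signatures, axis=0)   # computed before calling build_lsh
matcher.build_lsh(all_signatures)
hits = matcher.query_object.find_k_nearest_neighbors(query_signature - center, 5)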
Example #17
def search(dataset, query, number):
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 50
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 18-bit hashes so that each table has
    # 2^18 bins; this is a good choice since 2^18 is of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(18, params_cp)
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    query_object = table.construct_query_object()
    number_of_probes = 30000
    query_object.set_num_probes(number_of_probes)
    result = query_object.find_k_nearest_neighbors(query, number)
    return result
Example #18
 def fit(self, X):
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     if self._metric == 'hamming':
         # replace all zeroes by -1
         X[X < 0.5] = -1
     if self._metric == 'angular' or self._metric == 'hamming':
         X /= numpy.linalg.norm(X, axis=1).reshape(-1, 1)
     self._center = numpy.mean(X, axis=0)
     X -= self._center
     self._params = falconn.LSHConstructionParameters()
     self._params.dimension = X.shape[1]
     self._params.distance_function = falconn.DistanceFunction.EuclideanSquared
     self._params.lsh_family = falconn.LSHFamily.CrossPolytope
     falconn.compute_number_of_hash_functions(self._num_bits, self._params)
     self._params.l = self._num_tables
     self._params.num_rotations = 1
     self._params.num_setup_threads = 0
     self._params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
     self._params.seed = 95225714
     self._index = falconn.LSHIndex(self._params)
     self._index.setup(X)
     self._query_object = self._index.construct_query_object()
     self._query_object.set_num_probes(self._num_probes)
Example #19
#
# Author : fcbruce <*****@*****.**>
#
# Time : Sat 06 May 2017 17:10:14
#
#

import numpy as np
import falconn as fa

a = np.random.randn(50000, 500)
a /= np.linalg.norm(a, axis=1).reshape(-1, 1)

print "pending..."
params_cp = fa.LSHConstructionParameters()
params_cp.dimension = 500
params_cp.lsh_family = 'cross_polytope'
params_cp.distance_function = 'euclidean_squared'
params_cp.l = 7
params_cp.num_rotations = 1
params_cp.seed = 11111
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
fa.compute_number_of_hash_functions(18, params_cp)

table = fa.LSHIndex(params_cp)
table.setup(a)

print "find"
print table.find_nearest_neighbor(a[1])
Example #20
def getLshIndex(para, dataset):
    nnModel = falconn.LSHIndex(para)
    nnModel.setup(dataset)
    print "## sim falconn data setup done. data", dataset.shape, time.asctime()
    return nnModel
Example #21
print('Average query time: {} seconds'.format(average_scan_time))
print(sepline)

# Hyperplane hashing
params_hp = falconn.LSHConstructionParameters()
params_hp.dimension = d
params_hp.lsh_family = 'hyperplane'
params_hp.distance_function = 'negative_inner_product'
params_hp.k = 19
params_hp.l = 10
params_hp.seed = seed ^ 833840234

print('Hyperplane hash\n')

start = timeit.default_timer()
hp_table = falconn.LSHIndex(params_hp)
hp_table.fit(data)
hp_table.set_num_probes(2464)
stop = timeit.default_timer()
hp_construction_time = stop - start

print('k = {}'.format(params_hp.k))
print('l = {}'.format(params_hp.l))
print('Number of probes = {}'.format(hp_table.get_num_probes()))
print('Construction time: {} seconds\n'.format(hp_construction_time))

hp_avg_time, hp_success_prob = run_experiment(hp_table, queries, true_nns)
del hp_table
print(sepline)

# Cross polytope hashing
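The snippet is cut off at the cross-polytope section; a hedged sketch of how it typically continues, mirroring the hyperplane block above (the parameter values below are illustrative, not taken from the source):

params_cp = falconn.LSHConstructionParameters()
params_cp.dimension = d
params_cp.lsh_family = 'cross_polytope'
params_cp.distance_function = 'negative_inner_product'
params_cp.l = 10                      # illustrative number of tables
params_cp.num_rotations = 1
params_cp.seed = seed ^ 341234567     # illustrative seed
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
falconn.compute_number_of_hash_functions(16, params_cp)  # illustrative bit width

print('Cross polytope hash\n')
cp_table = falconn.LSHIndex(params_cp)
cp_table.fit(data)                    # same (older) API as the hyperplane table above
cp_table.set_num_probes(10)           # illustrative probe count
cp_avg_time, cp_success_prob = run_experiment(cp_table, queries, true_nns)
del cp_table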
Example #22
def test_lsh_index_negative():
    n = 1000
    d = 128
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    try:
        t.find_nearest_neighbor(np.random.randn(d))
        assert False
    except RuntimeError:
        pass
    try:
        dataset = [[1.0, 2.0], [3.0, 4.0]]
        t.fit(dataset)
        assert False
    except TypeError:
        pass
    try:
        dataset = np.random.randn(n, d).astype(np.int32)
        t.fit(dataset)
        assert False
    except ValueError:
        pass
    try:
        dataset = np.random.randn(10, 10, 10)
        t.fit(dataset)
        assert False
    except ValueError:
        pass
    dataset = np.random.randn(n, d).astype(np.float32)
    t.fit(dataset)
    dataset = np.random.randn(n, d).astype(np.float64)
    t.fit(dataset)
    u = np.random.randn(d).astype(np.float64)

    try:
        t.find_k_nearest_neighbors(u, 0.5)
        assert False
    except TypeError:
        pass

    try:
        t.find_k_nearest_neighbors(u, -1)
        assert False
    except ValueError:
        pass

    try:
        t.find_near_neighbors(u, -1)
        assert False
    except ValueError:
        pass

    try:
        t.set_max_num_candidates(0.5)
        assert False
    except TypeError:
        pass
    try:
        t.set_max_num_candidates(-10)
        assert False
    except ValueError:
        pass
    t.set_num_probes(t._params.l)
    try:
        t.set_num_probes(t._params.l - 1)
        assert False
    except ValueError:
        pass
    try:
        t.set_num_probes(1000.1)
        assert False
    except TypeError:
        pass

    def check_check_query(f):
        try:
            f(u.astype(np.float32))
            assert False
        except ValueError:
            pass
        try:
            f([0.0] * d)
            assert False
        except TypeError:
            pass
        try:
            f(u[:d - 1])
            assert False
        except ValueError:
            pass
        try:
            f(np.random.randn(d, d))
            assert False
        except ValueError:
            pass

    check_check_query(lambda u: t.find_k_nearest_neighbors(u, 10))
    check_check_query(lambda u: t.find_near_neighbors(u, 0.5))
    check_check_query(lambda u: t.find_nearest_neighbor(u))
    check_check_query(lambda u: t.get_candidates_with_duplicates(u))
    check_check_query(lambda u: t.get_unique_candidates(u))
    check_check_query(lambda u: t.get_unique_sorted_candidates(u))
    t.find_near_neighbors(u, 0.0)
Example #23
def generate_candidate_threshold(entity_embedding=None, data_ids="OpenEA", path="", threshold=0.2, output_path=False,
                                 entity_file="ent_embeds,npy", normalize=True, metric="euclidean", lsh_family="hyperplane", number_of_tables=500):

    """
    :param entity_embedding:
    :param data_ids:
    :param path:
    :param threshold:
    :param output_path:
    :param entity_file:
    :param normalize:
    :param metric:  1. inner: vector inner product; 2. euclidean: Euclidean distance (proportional to cosine distance after l2 normalization).
    :param lsh_family:
    :return:
    """

    if entity_embedding is None:
        entity_file_path = path + entity_file
        entity_embedding = np.load(entity_file_path)
        print("Load [%s] successfully!" % (entity_file_path))

    if data_ids is "OpenEA":
        ent2id1, id2ent1, max_id = read_ent_id(path + "kg1_ent_ids")
        ent2id2, id2ent2, max_id = read_ent_id(path + "kg2_ent_ids")
        paths = path.split('/')
        test_path = "/".join([paths[1], paths[2], paths[3], "datasets", paths[7], paths[8], paths[9]])
        test_ids = []
        with open('/' + test_path + r"/test_links", 'r', encoding='utf-8') as f:
            for line in f.readlines():
                items = line.strip().split("\t")
                id1, id2 = int(ent2id1[items[0]]), int(ent2id2[items[1]])
                # maxx_id = max(maxx_id, id1, id2)
                test_ids.append([id1, id2])
        data_ids = test_ids

    if data_ids is "dbp15k":
        # train_ids = read_ids(path+"sup_ent_ids")
        test_ids = read_ids(path + "ref_ent_ids")  # only consider matches on the test set
        # test_ids.extend(train_ids)
        data_ids = test_ids
    data_ids = np.array(data_ids).astype(int)
    entity_embedding = entity_embedding.astype(np.float32)
    if metric == "euclidean":
        entity_embedding -= np.mean(entity_embedding, axis=0)

    Lvec = np.array([entity_embedding[e] for e in data_ids[:, 0]])
    Rvec = np.array([entity_embedding[e] for e in data_ids[:, 1]])
    if os.path.exists(path + "mapping_mat.npy"):   # OpenEA模型转换后的最终向量
        mapping = np.load(path + "mapping_mat.npy")
        #print("mapping shape:", mapping.shape)
        Lvec = np.matmul(Lvec, mapping)
        #print("load mapping succussuflly!")

    if normalize:
        Lvec = preprocessing.normalize(Lvec, norm="l2", axis=1)
        Rvec = preprocessing.normalize(Rvec, norm="l2", axis=1)

    seed = 119417657
    L_True = data_ids[:, 0].tolist()
    print("shape:", entity_embedding.shape)
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = entity_embedding.shape[1]
    if lsh_family == "crosspolytope":
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    elif lsh_family == "hyperplane":
        params_cp.lsh_family = falconn.LSHFamily.Hyperplane
    if metric == "euclidean":
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    elif metric == "inner":
        params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct

    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = seed
    # number of setup threads (0 would use all available threads)
    params_cp.num_setup_threads = 2
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 20-bit hashes so that each table has
    # 2^20 bins; this is a good choice when 2^20 is of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(20, params_cp)
    # print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(Lvec)
    t2 = timeit.default_timer()

    print('Construction time: {}'.format(t2 - t1))
    query_object = table.construct_query_object()
    number_of_probes = number_of_tables
    print('Choosing number of probes: ', number_of_probes)
    query_object.set_num_probes(number_of_probes)
    t1 = timeit.default_timer()
    true_cnt = 0
    total = 0
    true_all = data_ids.shape[0]
    node_pairs = []
    print("Metric:", metric, "Threshold:", threshold)
    for ids_index, pair in enumerate(data_ids):
        ans = query_object.find_near_neighbors(Rvec[ids_index], threshold=threshold)
        #print(len(ans))
        for index in range(len(ans)):
            if pair[0] == L_True[ans[index]]:
                true_cnt += 1
                node_pairs.append((pair[0], pair[1], 1))
            else:
                node_pairs.append((L_True[ans[index]], pair[1], 0))
        total += len(ans)
    print('Threshold:[%f] True cnt:[%d] Generate All cnt:[%d] Total:[%d] Recall:[%f] P/E ratio:[%f] Metric:[%s]'
          % (threshold, true_cnt, total, true_all, true_cnt/true_all, total/true_all, metric))

    t2 = timeit.default_timer()
    print('Generate Candidate time: {}'.format(t2 - t1))
    if output_path == True:
        output_path = "/".join(path.split('/')[:-1]) + '/topk_' + str(threshold) + '_name_ngram'
        print('output path:', output_path)
        with open(output_path, 'w', encoding='utf8') as f:
            for pair in node_pairs:
                f.write(str(pair[0]) + '\t' + str(pair[1]) + '\t' + str(pair[2]) + '\n')
Example #24

import numpy as np
import falconn

if __name__ == '__main__':
    a1 = np.load('outputs_1.npy')
    a2 = np.load('outputs_2.npy')
    y = np.load('labels.npy')
    print(y.shape)

    a = np.r_[a1, a2]
    n, d = a.shape

    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    dataset = a
    t.setup(dataset)

    Q = t.construct_query_object()

    # input
    i, k = 4545, 100
    print(i, k)
    while (True):
        i, k = map(int, input().split())
        q = a[i:i + 1, :]
        u = q.sum(axis=0)

        ans = Q.find_k_nearest_neighbors(u, k)
        print(ans)
Example #25
import falconn

# In[ ]:

parameters = falconn.LSHConstructionParameters()
num_tables = 1
parameters.l = num_tables
parameters.dimension = num_dimensions
parameters.distance_function = falconn.DistanceFunction.EuclideanSquared
parameters.lsh_family = falconn.LSHFamily.CrossPolytope
parameters.num_rotations = 1
parameters.num_setup_threads = 1
parameters.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
falconn.compute_number_of_hash_functions(16, parameters)

index = falconn.LSHIndex(parameters)
get_ipython().run_line_magic('time', 'index.setup(dataset)')

query_object = index.construct_query_object()
num_probes = 1
query_object.set_num_probes(num_probes)

get_ipython().run_line_magic(
    'timeit', 'query_object.find_k_nearest_neighbors(query, 5)')

# In[ ]:

query = dataset[5000]
print(query_object.find_k_nearest_neighbors(query, 5))
Example #26
    def set_clustering_LSH_Index(self, number_of_queries, query_accuracy, number_of_tables, hash_bit):
        #Function definition: Returns the LSH Index -- Read LSH for more information or
        # README.2

        #parameters
        #number_of_queries:The number of queries used to determine the number_of_probes
        #query_accuracy: Specifies the level of accuracy of the Index
        #Setting query_accuracy = 1 degenerates LSH index into linear search.
        #number_of_tables:the number of hash_tables used for a given nearest
        #neighbor search
        #hash_bit: Used to determine the number of hash functions. READ_ME for detail.
        print("Setting Clustering Index")

        queries = self.w2v_vectors[(len(self.w2v_vectors)-number_of_queries):]
        w2v_vectors = self.w2v_vectors[:(len(self.w2v_vectors)-number_of_queries)]

        #Normalize vectors
        center = np.mean(w2v_vectors, axis=0)
        w2v_vectors -= center
        queries -= center


        #perform linear scan to return correct answers
        answers = self.linearScan_answerGenerator(w2v_vectors, queries)

        #Set number of probes----
        print('Choosing number of probes')
        init_number_of_probes = 600
        # END -------

        #Parameters -----
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(w2v_vectors[0]) # = 50 for Glove6B.50d
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(hash_bit, params_cp)
        # END ------

        #Constructing LSH Index -----
        print('Constructing the LSH Index')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        table.setup(w2v_vectors)
        t2 = timeit.default_timer()
        print('Done')
        print('Construction time: {}'.format(t2 - t1))
        query_object = table.construct_query_object()
        number_of_probes = self.probeGenerator(query_accuracy, init_number_of_probes, query_object, answers, queries, number_of_tables)
        query_object.set_num_probes(number_of_probes)
        #--------

        # Performance Statistics
        t1 = timeit.default_timer()
        score = 0
        for (i, query) in enumerate(queries):
            if query_object.find_nearest_neighbor(query) == answers[i]:
                score += 1
        t2 = timeit.default_timer()
        print('Query time: {}'.format((t2 - t1) / len(queries)))
        print('Precision: {}'.format(float(score) / len(queries)))
        self.query_object = query_object
        print("Vectors Successfully Hashed. Clustering LSH Index Created")
Example #27
    params_cp.l = number_of_tables
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
    # we build 18-bit hashes so that each table has
    # 2^18 bins; this is a good choice since 2^18 is of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    # find the smallest number of probes to achieve accuracy 0.9
    # using the binary search
    print('Choosing number of probes')
    number_of_probes = number_of_tables
    def evaluate_number_of_probes(number_of_probes):
        table.set_num_probes(number_of_probes)
        score = 0
        for (i, query) in enumerate(queries):
            if answers[i] in table.get_candidates_with_duplicates(query):
                score += 1
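The example is cut off inside evaluate_number_of_probes; a hedged sketch of how this probe search typically finishes (returning the accuracy, then doubling the number of probes until the 0.9 target mentioned above is reached, and bisecting to the smallest sufficient value):

        return float(score) / len(queries)

    while True:
        accuracy = evaluate_number_of_probes(number_of_probes)
        print('{} probes -> accuracy {}'.format(number_of_probes, accuracy))
        if accuracy >= 0.9:
            break
        number_of_probes = number_of_probes * 2
    if number_of_probes > number_of_tables:
        # binary search between the last failing and the first passing value
        left = number_of_probes // 2
        right = number_of_probes
        while right - left > 1:
            number_of_probes = (left + right) // 2
            if evaluate_number_of_probes(number_of_probes) >= 0.9:
                right = number_of_probes
            else:
                left = number_of_probes
        number_of_probes = right
    table.set_num_probes(number_of_probes)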
Example #28
def test_lsh_index_negative():
    p = falconn.get_default_parameters(n, d)
    try:
        t = falconn.LSHIndex(p)
        t.construct_query_object()
        assert False
    except RuntimeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup([[1.0, 2.0], [3.0, 4.0]])
        assert False
    except TypeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(n, d).astype(np.int32))
        assert False
    except TypeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(10, 10, 10))
        assert False
    except ValueError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(n, d))
        t.setup(np.random.randn(n, d))
        assert False
    except RuntimeError:
        pass
    for (t1, t2) in [(np.float32, np.float64), (np.float64, np.float32)]:
        for g in [
                lambda t: t.construct_query_object(),
                lambda t: t.construct_query_pool()
        ]:
            t = falconn.LSHIndex(p)
            t.setup(np.random.randn(n, d).astype(t1))
            q = g(t)
            u = np.random.randn(d).astype(t1)

            try:
                q.find_k_nearest_neighbors(u, 0.5)
                assert False
            except TypeError:
                pass

            try:
                q.find_k_nearest_neighbors(u, -1)
                assert False
            except ValueError:
                pass

            try:
                q.find_near_neighbors(u, -1)
                assert False
            except ValueError:
                pass

            try:
                q.set_max_num_candidates(0.5)
                assert False
            except TypeError:
                pass
            try:
                q.set_max_num_candidates(-10)
                assert False
            except ValueError:
                pass
            q.set_num_probes(t._params.l)
            try:
                q.set_num_probes(t._params.l - 1)
                assert False
            except ValueError:
                pass
            try:
                q.set_num_probes(1000.1)
                assert False
            except TypeError:
                pass

            def check_check_query(f):
                try:
                    f(u.astype(t2))
                    assert False
                except TypeError:
                    pass
                try:
                    f([0.0] * d)
                    assert False
                except TypeError:
                    pass
                try:
                    f(u[:d - 1])
                    assert False
                except ValueError:
                    pass
                try:
                    f(np.random.randn(d, d))
                    assert False
                except ValueError:
                    pass

            check_check_query(lambda u: q.find_k_nearest_neighbors(u, 10))
            check_check_query(lambda u: q.find_near_neighbors(u, 0.5))
            check_check_query(lambda u: q.find_nearest_neighbor(u))
            check_check_query(lambda u: q.get_candidates_with_duplicates(u))
            check_check_query(lambda u: q.get_unique_candidates(u))
Example #29
import falconn 
par = falconn.LSHConstructionParameters()
param = falconn.get_default_parameters(num_points = len(train), dimension = len(train[0]), distance = falconn.DistanceFunction.EuclideanSquared )
print(param.lsh_family, param.l, param.k)
tables = param.l
hashes = param.k
param.l = int(1.1*tables)
para = []

for k in [hashes,int(hashes*1.5)]:
    param.k = k
    lsh = falconn.LSHIndex(param)
    lsh.setup(train)
      
    startClock = time.clock()
    startTime = process_time()
    indexlsh = lsh.construct_query_object()
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.clock()
    constructionClock= endClock - startClock
  
    for t in [param.l, int(param.l*2), int(param.l*3)]:
        indexlsh.set_num_probes(t)
        
        print('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(t))
        
        rez = []
        for q in qry:
            startClock = time.clock()
            startTime = process_time()
Example #30
        #

        params_cp_blue = falconn.LSHConstructionParameters()
        params_cp_blue.dimension = len(dataset_blue[0])
        params_cp_blue.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp_blue.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp_blue.l = number_of_tables
        params_cp_blue.num_rotations = 1
        params_cp_blue.seed = 666666
        params_cp_blue.num_setup_threads = 1
        params_cp_blue.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(20, params_cp_blue)

        print('Constructing the LSH table')
        t1 = timeit.default_timer()
        table_blue = falconn.LSHIndex(params_cp_blue)
        table_blue.setup(dataset_blue)
        t2 = timeit.default_timer()
        query_object_blue = table_blue.construct_query_object()
        print('Done')
        print('Construction time: {}'.format((t2 - t1)))

        params_cp_green = falconn.LSHConstructionParameters()
        params_cp_green.dimension = len(dataset_green[0])
        params_cp_green.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp_green.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp_green.l = number_of_tables
        params_cp_green.num_rotations = 1
        params_cp_green.seed = 666666
        params_cp_green.num_setup_threads = 1
        params_cp_green.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable