def test_lsh_index_positive():
    """Check the return types of every query-object and query-pool method.

    Uses the module-level ``n`` (number of points) and ``d`` (dimension) to
    build a random float32 dataset, indexes it, and then runs the same set of
    type assertions against a query object and a query pool.
    """
    params = falconn.get_default_parameters(n, d)
    index = falconn.LSHIndex(params)
    index.setup(np.random.randn(n, d).astype(np.float32))

    def is_int(value):
        # ``long`` only exists on Python 2; elsewhere the name lookup raises
        # NameError and the check falls back to plain ``int``.
        try:
            return isinstance(value, (int, long))
        except NameError:
            return isinstance(value, int)

    def check_query_api(querier):
        point = np.random.randn(d).astype(np.float32)
        assert isinstance(querier.find_k_nearest_neighbors(point, 10), list)
        assert isinstance(querier.find_near_neighbors(point, 10.0), list)
        assert is_int(querier.find_nearest_neighbor(point))
        assert isinstance(querier.get_candidates_with_duplicates(point), list)
        assert is_int(querier.get_max_num_candidates())
        assert is_int(querier.get_num_probes())
        assert isinstance(querier.get_query_statistics(),
                          falconn.QueryStatistics)
        assert isinstance(querier.get_unique_candidates(point), list)
        # The mutators are expected to return nothing.
        assert querier.reset_query_statistics() is None
        assert querier.set_max_num_candidates(100) is None
        assert querier.set_num_probes(10) is None

    # Same checks for the single query object and for the query pool.
    for make_querier in (index.construct_query_object,
                         index.construct_query_pool):
        check_query_api(make_querier())
def setup_hash_tables(self, data, threads=0, probes=50):
    """Build a FALCONN LSH index over ``data`` and return its query object.

    Args:
        data (numpy.ndarray): matrix whose first axis enumerates the vectors
            to index; ``data.shape[0]`` is passed as the number of points.
        threads (int): value assigned to ``num_setup_threads`` of the
            construction parameters. 0 by default (per the original author's
            note, 0 lets FALCONN pick the thread count -- TODO confirm).
        probes (int): value passed to ``set_num_probes`` on the query object.

    Returns:
        The FALCONN query object for the freshly built index.

    NOTE(review): the dimension handed to FALCONN is ``len(self.seed)``, not
    ``data.shape[1]`` -- presumably the seed vector has the same length as
    each row of ``data``; verify against the caller.
    """
    import falconn

    num_points = data.shape[0]
    dimension = len(self.seed)
    lsh_params = falconn.get_default_parameters(num_points, dimension)
    lsh_params.num_setup_threads = threads

    index = falconn.LSHIndex(lsh_params)
    index.setup(data)

    searcher = index.construct_query_object()
    searcher.set_num_probes(probes)
    return searcher
def __init__(self, feature_file, label_file, port, worker_num=10):
    """Bind the ZeroMQ sockets, load labels/features and build the LSH index.

    Args:
        feature_file: path passed to ``np.load`` for the feature matrix.
        label_file: path passed to ``np.load`` for the labels.
        port: TCP port the ROUTER socket binds to (on all interfaces).
        worker_num: number of slots in the shared ``worker_counts`` array.
    """
    self.url_worker = 'inproc://ping-workers'
    router_endpoint = "tcp://*:%s" % port
    self.worker_num = worker_num
    self.worker_counts = Array('i', [0] * worker_num)

    # ROUTER socket on the TCP endpoint, DEALER socket on the in-process one.
    self.context = zmq.Context()
    self.router = self.context.socket(zmq.ROUTER)
    self.router.bind(router_endpoint)
    self.workers = self.context.socket(zmq.DEALER)
    self.workers.bind(self.url_worker)

    self.label = np.load(label_file)

    logger.info("start load feature data")
    load_started = time.time()
    self.feature = np.load(feature_file)
    load_finished = time.time()
    logger.info("load cost time:%f" % (load_finished - load_started))

    feature_shape = self.feature.shape
    lsh_params = fc.get_default_parameters(
        feature_shape[0], feature_shape[1],
        fc.DistanceFunction.EuclideanSquared)
    index = fc.LSHIndex(lsh_params)

    build_started = time.time()
    index.setup(self.feature)
    build_finished = time.time()
    logger.info("train cost time:%f" % (build_finished - build_started))

    # A query pool is kept instead of a single query object.
    self.qp = index.construct_query_pool()
def set_cp(data):
    """Index ``data`` with FALCONN default parameters and return a query object.

    The construction parameters come from ``falconn.get_default_parameters``
    for the shape of ``data``; ``compute_number_of_hash_functions`` is then
    called with 7, and the query object is configured with 896 probes.

    An earlier, hand-written cross-polytope configuration (d=128, k=3, l=10,
    last_cp_dimension=16, num_rotations=3, NegativeInnerProduct distance,
    FlatHashTable storage, seed = 119417657 ^ 833840234) is no longer used.

    Args:
        data: two-dimensional array; ``data.shape`` supplies the number of
            points and the dimension.

    Returns:
        The FALCONN query object for the built index.
    """
    point_count, dimension = data.shape
    lsh_params = falconn.get_default_parameters(point_count, dimension)
    falconn.compute_number_of_hash_functions(7, lsh_params)

    index = falconn.LSHIndex(lsh_params)
    index.setup(data)

    query_object = index.construct_query_object()
    query_object.set_num_probes(896)
    return query_object
def get_clusters_falconn(self):
    """Group the stored vectors into similarity clusters using FALCONN.

    Every vector in ``self.vector_matrix`` is normalised to unit length and
    the whole set is centred on its per-dimension mean before indexing.
    Each (normalised, centred) vector is then used as a near-neighbour query
    with ``self.similarity_threshold``; the query's own index is appended to
    the result and a ``SimilarityCluster`` is built whenever at least two
    indices are present.

    Returns:
        list: the ``to_serializable_object()`` form of every cluster found.
        Empty when ``self.vector_matrix`` is empty.
    """
    serializable_list = []
    if len(self.vector_matrix) == 0:
        return serializable_list

    vectors = np.array(self.vector_matrix)

    # Row-wise L2 norm. Without ``axis=1`` the norm is a single scalar for
    # the whole matrix and the ``reshape(-1, 1)`` below would be pointless.
    row_norms = np.linalg.norm(vectors, axis=1).reshape(-1, 1)
    row_norms[row_norms == 0] = 1  # leave all-zero rows untouched
    vectors /= row_norms

    # Centre on the per-dimension mean (axis=0), not on one global scalar.
    vectors -= np.mean(vectors, axis=0)

    falconn_params = falconn.get_default_parameters(
        len(self.vector_matrix), len(self.vector_matrix[0]))
    falconn_params.distance_function = "euclidean_squared"
    lsh_index = falconn.LSHIndex(falconn_params)
    lsh_index.setup(vectors)

    # Query with the preprocessed rows so that queries live in the same
    # space as the indexed data.
    for i, query_vector in enumerate(vectors):
        neighbours = lsh_index.find_near_neighbors(
            query_vector, self.similarity_threshold)
        # tuple() keeps this working whether the binding returns a list or
        # a tuple of indices.
        cluster = tuple(neighbours) + (i,)
        if len(cluster) < 2:
            continue

        head = cluster[0]
        similarity_cluster = SimilarityCluster(
            self.similarity_threshold,
            self.vector_id_list[head],
            self.vector_matrix[head],
            self.start_time_ms,
            self.end_time_ms)
        for index in cluster:
            if index == head:
                continue
            similarity_cluster.similar_image_ids.append(
                self.vector_id_list[index])
            similarity_cluster.apply_vector_to_average(
                self.vector_matrix[index])
        serializable_list.append(similarity_cluster.to_serializable_object())

    return serializable_list
def retrival(self, query, dataset=None, *, k=None, threshold=None):
    """Look up neighbours of ``query`` in a (cached) FALCONN table.

    Args:
        query: query vector. It is NOT modified; a centred copy is used.
        dataset: optional two-dimensional array. When given, a table is
            looked up in ``self.tables`` under an xxhash64 digest of the
            dataset's first column, and built and cached if absent. When
            omitted, ``self.last_table`` is reused.
        k: if given, return the ``k`` nearest neighbours.
        threshold: if given, return all neighbours within this threshold.

    Returns:
        The result of ``find_k_nearest_neighbors`` (``k`` given),
        ``find_near_neighbors`` (``threshold`` given) or
        ``find_nearest_neighbor`` (neither given) on the table's query
        object.

    Raises:
        Exception: no dataset was passed and no table was used before.
        ValueError: both ``k`` and ``threshold`` were passed.
    """
    if dataset is None:
        table = self.last_table
    else:
        # NOTE(review): only the first column is hashed, so two datasets that
        # share it are treated as the same dataset.
        hashint = xxhash.xxh64(dataset[:, 0].copy(), self.seed).intdigest()
        if hashint in self.tables:
            table = self.tables[hashint]
        else:
            print('find a new dataset')
            # astype() copies, so centring below does not touch the caller's
            # dataset.
            dataset = dataset.astype(np.float32)
            mean = np.mean(dataset, axis=0)
            dataset -= mean
            params = falconn.get_default_parameters(
                dataset.shape[0], dataset.shape[1])
            falconn.compute_number_of_hash_functions(7, params)
            lsh_index = falconn.LSHIndex(params)
            lsh_index.setup(dataset)
            qtable = lsh_index.construct_query_object()
            qtable.set_num_probes(10000)
            table = (mean, qtable)
            self.tables[hashint] = table

    if table is None:
        raise Exception("Dataset not specific")

    # Centre a copy of the query; the previous in-place ``-=`` silently
    # shifted the caller's array on every call (even when the call failed).
    centred_query = query - table[0]

    if k is not None and threshold is not None:
        raise ValueError("k and threshold should not pass simultaneously")

    self.last_table = table
    if k is not None:
        return table[1].find_k_nearest_neighbors(centred_query, k)
    if threshold is not None:
        return table[1].find_near_neighbors(centred_query, threshold)
    return table[1].find_nearest_neighbor(centred_query)
def runTest():
    """Measure k-nearest-neighbour classification accuracy of a FALCONN index.

    For each of ``times`` rounds the train and test arrays are loaded from
    ``path``, an index is built over the flattened training features, and
    every test row is queried for its ``k`` nearest neighbours. A test row
    counts as right when any returned training row carries the same label
    (compared through ``ks`` when ``is_big_key`` is set). Progress and the
    final accuracy/loss are written through ``logging``.

    All configuration (``times``, ``path``, ``dim``, ``k``, ``is_pool``,
    ``is_big_key``, ``is_log``, ``reportTime`` ...) is read from module
    globals.
    """
    m_bad = 0
    m_right = 0
    m_num = 0
    for main_times in range(0, times):
        if resetTest:
            resetRandom()
        test = np.load(os.path.join(path, test_file_name))
        train = np.load(os.path.join(path, train_file_name))
        testNum = len(test)
        trainNum = len(train)

        p = falconn.get_default_parameters(trainNum, dim)
        t = falconn.LSHIndex(p)
        dataset = [np.ravel(x[0]).astype(np.float32) for x in train]
        # Parenthesised form prints the same text on Python 2 and 3.
        print(len(dataset))
        dataset = np.array(dataset)
        t.setup(dataset)
        if is_pool:
            q = t.construct_query_pool()
        else:
            q = t.construct_query_object()

        t2 = time.time()
        for i in test:
            t1 = time.time()
            i[0] = np.ravel(i[0])
            tList = train[q.find_k_nearest_neighbors(i[0], k)]

            if is_big_key:
                is_true = any(ks[l[1]] == ks[i[1]] for l in tList)
            else:
                is_true = any(l[1] == i[1] for l in tList)

            if is_true:
                m_right += 1
            else:
                m_bad += 1
                if is_log:
                    # The '%' operator was missing here, which made the
                    # string literal be *called* and raised TypeError on the
                    # first wrong prediction.
                    if is_big_key:
                        logging.error('###### Bad %s(%s: %s) with %s'
                                      % (ks[i[1]], i[1], i[2], tList))
                    else:
                        logging.error('###### Bad %s: %s with %s'
                                      % (i[1], i[2], tList))

            m_num += 1
            if m_num % reportTime == 1:
                logging.info('Last accuracy: %.2f %%'
                             % (m_right / float(m_num) * 100.0))
                logging.info('Last loss: %.2f %%'
                             % (m_bad / float(m_num) * 100.0))
                logging.info(
                    'right: %d bad: %d now: %d/%d Time: %.5fs/1iter'
                    % (m_right, m_bad, m_num, testNum * times,
                       (time.time() - t1)))

        # Skip the per-query average when the test set is empty.
        if testNum:
            logging.info('Speed Time: %.8f' % ((time.time() - t2) / testNum))

    # Nothing was evaluated -> no ratio to report (avoids division by zero).
    if m_num:
        logging.info('Last accuracy: %.2f %% (%d/%d)'
                     % ((m_right / float(m_num) * 100.0), m_right, m_num))
        logging.info('Last loss: %.2f %% (%d/%d)'
                     % ((m_bad / float(m_num) * 100.0), m_bad, m_num))
    logging.info('End Run Test')
def hash_construct(self, features):
    """Build a 20-table FALCONN index over ``features`` and return a query object.

    Default parameters are requested for the shape of ``features`` with the
    squared Euclidean distance; the number of tables ``l`` is then overridden
    to 20. The time spent in ``setup`` is printed.

    Args:
        features: two-dimensional array; ``features.shape`` supplies the
            number of points and the dimension.

    Returns:
        The FALCONN query object for the built index.
    """
    num_points = features.shape[0]
    dimension = features.shape[1]
    lsh_params = fc.get_default_parameters(
        num_points, dimension, fc.DistanceFunction.EuclideanSquared)
    lsh_params.l = 20

    index = fc.LSHIndex(lsh_params)
    build_started = time.time()
    index.setup(features)
    build_finished = time.time()
    print("### hash train time:%f" % (build_finished - build_started))

    return index.construct_query_object()
def test_get_default_parameters():
    """Pin the defaults FALCONN picks for 100000 dense 128-d points."""
    num_points, dimension = 100000, 128
    distance = 'negative_inner_product'
    params = falconn.get_default_parameters(
        num_points, dimension, distance, True)

    # (attribute, expected value), checked in this order.
    expected = (
        ('l', 10),
        ('lsh_family', 'cross_polytope'),
        ('k', 2),
        ('dimension', dimension),
        ('distance_function', distance),
        ('num_rotations', 1),
        ('last_cp_dimension', 64),
    )
    for attribute, value in expected:
        assert getattr(params, attribute) == value
def hashing(hash_input):
    """
    Usage: build a FALCONN index over a static dataset.

    :param hash_input: two-dimensional container; ``len(hash_input)`` is
        used as the number of points and ``len(hash_input[0])`` as the
        dimension.
    :return: the FALCONN query object for the built index.
    """
    point_count = len(hash_input)
    dimension = len(hash_input[0])

    index = falconn.LSHIndex(
        falconn.get_default_parameters(point_count, dimension))
    index.setup(hash_input)
    return index.construct_query_object()
def main():
    """Command-line entry point: load a database and run the test queries.

    The database comes either from a text file (``--database``, optionally
    truncated with ``--limit``) or from a previously saved NumPy file
    (``--numpy-database``). A text database is converted to float32, centred
    on its mean and saved again through ``save_numpy_database``. The time
    spent loading is reported on stderr, the index parameters are chosen
    according to ``--params``, and ``test_queries`` does the rest.
    """
    parser = argparse.ArgumentParser()
    source = parser.add_mutually_exclusive_group()
    source.add_argument("--database", metavar="FILENAME",
                        default="database.txt")
    source.add_argument("--numpy-database", metavar="FILENAME")
    parser.add_argument("--test-vector", metavar="FILENAME",
                        default="test-vector.txt")
    parser.add_argument("--limit", metavar="MAX", type=int, default=-1)
    parser.add_argument(
        "--params",
        choices=("hyperplane", "crosspolytope", "default"),
        default="default",
    )
    parser.add_argument("--probes", type=int, default=2464)
    args = parser.parse_args()

    read_started_ns = time.monotonic_ns()
    if args.numpy_database:
        db, mean = read_numpy_database(args.numpy_database)
    else:
        db = read_database(args.database, args.limit)
        db = db.astype(np.float32)
        mean = np.mean(db, axis=0)
        db -= mean
        save_numpy_database(db, mean, args.database)
    read_finished_ns = time.monotonic_ns()
    elapsed_ms = (read_finished_ns - read_started_ns) / 1000000.0
    print(f"Reading database {elapsed_ms:.3f}ms", file=sys.stderr)

    num_points = len(db)
    dimensions = len(db[0])

    choice = args.params
    if choice == "hyperplane":
        index_params = hyperplane_hashing_params(dimensions=dimensions)
    elif choice == "crosspolytope":
        index_params = cross_polytope_hashing_params(dimensions=dimensions)
    elif choice == "default":
        index_params = falconn.get_default_parameters(
            num_points=num_points, dimension=dimensions)
    else:
        raise ValueError(f"Unknown params: {args.params}")

    test_queries(db, args.probes, mean, index_params, args.test_vector)
def test_get_default_parameters():
    """Pin the defaults FALCONN picks for 100000 dense 128-d points.

    Besides the hashing parameters this also checks the storage type and the
    number of setup threads.
    """
    point_count = 100000
    dimension = 128
    distance_name = 'negative_inner_product'

    params = falconn.get_default_parameters(
        point_count, dimension, distance_name, True)

    # Attribute name -> expected value; checked in insertion order.
    checks = [
        ('l', 10),
        ('lsh_family', 'cross_polytope'),
        ('storage_hash_table', 'bit_packed_flat_hash_table'),
        ('num_setup_threads', 0),
        ('k', 2),
        ('dimension', dimension),
        ('distance_function', distance_name),
        ('num_rotations', 1),
        ('last_cp_dimension', 64),
    ]
    for field, wanted in checks:
        assert getattr(params, field) == wanted
def add(self, vecs):
    """Index ``vecs`` (centred on their mean) and create a query object.

    Stores the per-dimension mean in ``self.center``, the construction
    parameters in ``self.params_cp``, the index in ``self.table`` and the
    query object in ``self.query_object``.

    Args:
        vecs: two-dimensional array of vectors to index.
    """
    # The mean is kept so that it can be subtracted again later on.
    self.center = np.mean(vecs, axis=0)

    point_count = vecs.shape[0]
    self.params_cp = falconn.get_default_parameters(
        num_points=point_count,
        dimension=vecs.shape[1],
        distance=falconn.DistanceFunction.EuclideanSquared,
        is_sufficiently_dense=True)

    # Number of hash bits: log2 of the number of points, rounded.
    hash_bits = int(np.round(np.log2(point_count)))
    falconn.compute_number_of_hash_functions(hash_bits, self.params_cp)

    self.table = falconn.LSHIndex(self.params_cp)
    centred_vecs = vecs - self.center
    self.table.setup(centred_vecs)
    self.query_object = self.table.construct_query_object()
def test_get_default_parameters():
    """Verify every default FALCONN derives for 100000 dense 128-d points."""
    n, dim = 100000, 128
    dist_func = "negative_inner_product"
    params = falconn.get_default_parameters(n, dim, dist_func, True)

    observed = [
        params.l,
        params.lsh_family,
        params.storage_hash_table,
        params.num_setup_threads,
        params.k,
        params.dimension,
        params.distance_function,
        params.num_rotations,
        params.last_cp_dimension,
    ]
    wanted = [
        10,
        "cross_polytope",
        "bit_packed_flat_hash_table",
        0,
        2,
        dim,
        dist_func,
        1,
        64,
    ]
    # Compare pairwise so a failure still points at a single field.
    for actual, target in zip(observed, wanted):
        assert actual == target
def init_hash():
    """Load the training rows, index their features and return a querier.

    Reads the module globals ``path``, ``dim`` and ``is_pool``.

    Returns:
        tuple: ``(q, train)`` where ``q`` is a query pool when ``is_pool`` is
        set and a query object otherwise, and ``train`` is the loaded array.
    """
    # Load the training array.
    train = np.array(load_all_beOne(path))
    # Number of training rows.
    row_count = len(train)

    # Default FALCONN parameters for that many points.
    params = falconn.get_default_parameters(row_count, dim)
    index = falconn.LSHIndex(params)

    # First element of every row, flattened to a float32 vector.
    features = np.array([np.ravel(row[0]).astype(np.float32) for row in train])

    # Build the hash tables.
    logging.info('Start Hash setup')
    index.setup(features)

    make_querier = (index.construct_query_pool if is_pool
                    else index.construct_query_object)
    return (make_querier(), train)
def main():
    """Demo: index an iris training split and query one test sample.

    Prints the true label of the first test sample, the indices of its three
    nearest training neighbours, their labels, and finally ``DONE``.
    """
    dataset = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target)

    rows, columns = X_train.shape[0], X_train.shape[1]
    index = LSHIndex(get_default_parameters(rows, columns))
    index.setup(X_train)
    querier = index.construct_query_object()

    sample = X_test[0]
    neighbour_ids = querier.find_k_nearest_neighbors(sample, 3)

    print(y_test[0])
    print(neighbour_ids)
    print(y_train[neighbour_ids])
    print('DONE')
def fit(self, X: np.ndarray, y: np.ndarray = None):
    """ Build the FALCONN index from training data.

    ``self.metric`` is normalised to one of ``'euclidean'``,
    ``'sqeuclidean'`` or ``'cosine'``; an unrecognised metric triggers a
    warning and falls back to ``'euclidean'``. Both Euclidean variants use
    FALCONN's squared Euclidean distance, ``'cosine'`` uses the negative
    inner product.

    Parameters
    ----------
    X: np.array
        Data to be indexed
    y: any
        Not used for indexing; stored as ``y_train_``

    Returns
    -------
    self: FalconnLSH
        The estimator, with ``X_train_``, ``y_train_`` and ``index_`` set
    """
    X = check_array(X, dtype=[np.float32, np.float64])

    # (accepted aliases, canonical metric name, DistanceFunction attribute)
    metric_table = (
        (('euclidean', 'l2', 'minkowski'),
         'euclidean', 'EuclideanSquared'),
        (('squared_euclidean', 'sqeuclidean'),
         'sqeuclidean', 'EuclideanSquared'),
        (('cosine', 'NegativeInnerProduct', 'neg_inner'),
         'cosine', 'NegativeInnerProduct'),
    )
    for aliases, canonical, distance_name in metric_table:
        if self.metric in aliases:
            self.metric = canonical
            break
    else:
        warnings.warn(
            f'Invalid metric "{self.metric}". Using "euclidean" instead')
        self.metric = 'euclidean'
        distance_name = 'EuclideanSquared'
    distance = getattr(falconn.DistanceFunction, distance_name)

    # Set up the LSH index
    construction_params = falconn.get_default_parameters(
        *X.shape, distance=distance)
    index = falconn.LSHIndex(construction_params)
    index.setup(X)

    self.X_train_ = X
    self.y_train_ = y
    self.index_ = index

    return self
def __init__(self, feature_file, label_file, id_feature_file, id_label_file):
    """Load the label/feature arrays and build a FALCONN query object.

    Args:
        feature_file: path passed to ``np.load`` for the matrix to index.
        label_file: path passed to ``np.load``; stored as ``self.label``.
        id_feature_file: path passed to ``np.load``; stored as
            ``self.idfeature``.
        id_label_file: path passed to ``np.load``; stored as ``self.idlabel``.
    """
    self.idfeature = np.load(id_feature_file)
    self.idlabel = np.load(id_label_file)
    self.label = np.load(label_file)

    print("start load feature data")
    load_started = time.time()
    feature = np.load(feature_file)
    load_finished = time.time()
    print("load cost time:%f" % (load_finished - load_started))

    lsh_params = fc.get_default_parameters(
        feature.shape[0], feature.shape[1],
        fc.DistanceFunction.EuclideanSquared)
    index = fc.LSHIndex(lsh_params)

    build_started = time.time()
    index.setup(feature)
    build_finished = time.time()
    print("train cost time:%f" % (build_finished - build_started))

    self.qo = index.construct_query_object()
def load_identifier(self,labelFile,featuresFile):
    """Load labels and features from disk and build a 30-table FALCONN index.

    Sets ``self.label``, ``self.feature`` (also aliased as ``self.embs``),
    the construction parameters ``self.dp``, the index ``self.ds`` and the
    query object ``self.qo``. Progress and timings are printed.

    Args:
        labelFile: path passed to ``np.load`` for the labels.
        featuresFile: path passed to ``np.load`` for the feature matrix.
    """
    self.label = np.load(labelFile)
    print("start load feature data")
    print(labelFile)

    t1 = time.time()
    self.feature = np.load(featuresFile)
    self.embs = self.feature
    # The format string used to be passed as a separate print argument and
    # was never interpolated; '%s' is used because a dtype is not an int.
    print("feature dtype:%s" % self.feature.dtype)
    t2 = time.time()
    print("load cost time:%f" % (t2 - t1))

    self.dp = fc.get_default_parameters(
        self.feature.shape[0], self.feature.shape[1],
        fc.DistanceFunction.EuclideanSquared)
    self.dp.l = 30  # override the default number of hash tables
    self.ds = fc.LSHIndex(self.dp)

    train_st = time.time()
    self.ds.setup(self.feature)
    train_et = time.time()
    print("train cost time:%f" % (train_et - train_st))

    self.qo = self.ds.construct_query_object()
def test_lsh_index_positive():
    """Smoke test: every query method of a fitted index runs without raising."""
    num_points, dimension = 1000, 128
    params = falconn.get_default_parameters(num_points, dimension)
    index = falconn.LSHIndex(params)
    index.fit(np.random.randn(num_points, dimension).astype(np.float32))

    point = np.random.randn(dimension).astype(np.float32)

    # (method name, positional arguments), invoked in this order.
    calls = (
        ('find_k_nearest_neighbors', (point, 10)),
        ('find_near_neighbors', (point, 10.0)),
        ('find_nearest_neighbor', (point,)),
        ('get_candidates_with_duplicates', (point,)),
        ('get_max_num_candidates', ()),
        ('get_num_probes', ()),
        ('get_query_statistics', ()),
        ('get_unique_candidates', (point,)),
        ('get_unique_sorted_candidates', (point,)),
        ('reset_query_statistics', ()),
        ('set_max_num_candidates', (100,)),
        ('set_num_probes', (10,)),
    )
    for method_name, arguments in calls:
        getattr(index, method_name)(*arguments)
def test_lsh_index_positive():
    """Smoke test: the query methods of a set-up index run without raising.

    ``get_unique_sorted_candidates`` is deliberately not exercised.
    """
    point_count = 1000
    dim = 128

    index = falconn.LSHIndex(falconn.get_default_parameters(point_count, dim))
    points = np.random.randn(point_count, dim).astype(np.float32)
    index.setup(points)

    probe = np.random.randn(dim).astype(np.float32)

    # Neighbour searches.
    index.find_k_nearest_neighbors(probe, 10)
    index.find_near_neighbors(probe, 10.0)
    index.find_nearest_neighbor(probe)

    # Candidate retrieval and getters.
    index.get_candidates_with_duplicates(probe)
    index.get_max_num_candidates()
    index.get_num_probes()
    index.get_query_statistics()
    index.get_unique_candidates(probe)

    # Mutators.
    index.reset_query_statistics()
    index.set_max_num_candidates(100)
    index.set_num_probes(10)
def lsh_sieve(full_deltas, d, n):
    """Sum ``n`` deltas, each scaled down by its number of near neighbours.

    ``full_deltas`` is reshaped to ``(n, d)``. The rows are centred on their
    mean and indexed with FALCONN; every centred row is then queried for its
    neighbours within a threshold of ``1.0 / d``, and the corresponding
    *uncentred* row divided by the neighbour count is added to the result.

    Args:
        full_deltas: array-like holding ``n * d`` values.
        d: dimension of each delta.
        n: number of deltas.

    Returns:
        numpy.ndarray: the accumulated vector of length ``d``.
    """
    deltas = np.reshape(full_deltas, (n, d))
    centred = deltas - np.mean(deltas, axis=0)

    index = falconn.LSHIndex(falconn.get_default_parameters(n, d))
    index.setup(centred)
    querier = index.construct_query_object()

    radius = 1.0 / d
    aggregate = np.zeros(d)
    for raw_row, centred_row in zip(deltas, centred):
        near = querier.find_near_neighbors(centred_row, radius)
        aggregate = aggregate + (raw_row / len(near))
    return aggregate
def setup_lsh():
    """Load all signatures from the database and build a FALCONN query object.

    Every ``SIGNATURE`` of the ``AKAFINGER`` table is fetched, the resulting
    matrix is centred on its per-dimension mean and indexed with FALCONN's
    default parameters.

    Returns:
        tuple: ``(center, query_object)``. ``center`` is the mean that was
        subtracted from the data, so callers can centre their queries the
        same way.

    Raises:
        ValueError: the table contains no signatures.
    """
    # extract the signature matrix from database
    con = psycopg2.connect("dbname=yinhan user=yinhan")
    try:
        cur = con.cursor()
        cur.execute("SELECT SIGNATURE FROM AKAFINGER")
        rows = cur.fetchall()
        con.commit()
    finally:
        # Always release the connection, even when the query fails.
        con.close()

    if not rows:
        # An empty matrix would only surface later as NaNs / an IndexError.
        raise ValueError("AKAFINGER contains no signatures to index.")

    data = np.array([row[0] for row in rows])

    # Center the data; the same center is returned so that query snippets
    # can be shifted identically (intended to improve search quality).
    center = np.mean(data, axis=0)
    data = data - center

    params_cp = falconn.get_default_parameters(
        num_points=data.shape[0], dimension=data.shape[1])
    table = falconn.LSHIndex(params_cp)
    table.setup(data)
    return center, table.construct_query_object()
def init_falconn():
    """Load the saved feature/class/path arrays and index the features.

    The three ``tensorflow-*.npy`` files are read from ``model_path`` and the
    shape of each is printed. The index is configured for 2048-dimensional
    vectors.

    Returns:
        tuple: ``(my_feature, my_class_name, my_file_path, q)`` where ``q``
        is a FALCONN query pool over ``my_feature``.
    """
    dim = 2048

    # Load the arrays.
    my_feature = np.load(os.path.join(model_path, 'tensorflow-feature.npy'))
    print(my_feature.shape)
    my_class_name = np.load(
        os.path.join(model_path, 'tensorflow-class_name.npy'))
    print(my_class_name.shape)
    my_file_path = np.load(
        os.path.join(model_path, 'tensorflow-file_path.npy'))
    print(my_file_path.shape)

    # Number of feature rows.
    row_count = len(my_feature)

    # Default parameters for that many points.
    index = falconn.LSHIndex(falconn.get_default_parameters(row_count, dim))

    # Build the hash tables.
    index.setup(my_feature)
    query_pool = index.construct_query_pool()

    return my_feature, my_class_name, my_file_path, query_pool
def init_hash():
    """Load the saved arrays and class table, then index ``my_arr``.

    Populates the module globals ``my_arr``, ``my_id`` and ``big_class`` from
    the files ``array.npy``, ``id.npy`` and ``big_class.txt`` under ``path``.

    Returns:
        A FALCONN query pool when ``is_pool`` is set, otherwise a query
        object, over ``my_arr``.
    """
    global my_arr, my_id, big_class
    # Load the arrays.
    my_arr = np.load(os.path.join(path, 'array.npy'))
    my_id = np.load(os.path.join(path, 'id.npy'))

    # ``with`` closes the file even if reading fails.
    with open(os.path.join(path, 'big_class.txt'), 'r') as f:
        a = f.read()
    # SECURITY NOTE(review): eval() executes whatever big_class.txt contains.
    # Only safe while that file is fully trusted; consider
    # ast.literal_eval() if it only ever holds a literal.
    big_class = eval(a)

    # Number of rows to index.
    trainNum = len(my_arr)
    # Default parameters for that many points.
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = my_arr

    # Build the hash tables.
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q = t.construct_query_pool()
    else:
        q = t.construct_query_object()
    return q
def init_hash():
    """Load the saved arrays and class table, then index ``my_arr``.

    Populates the module globals ``my_arr``, ``my_id`` and ``big_class`` from
    the files ``array.npy``, ``id.npy`` and ``big_class.txt`` under ``path``.

    Returns:
        A FALCONN query pool when ``is_pool`` is set, otherwise a query
        object, over ``my_arr``.
    """
    global my_arr, my_id, big_class
    # Load the arrays.
    my_arr = np.load(os.path.join(path, 'array.npy'))
    my_id = np.load(os.path.join(path, 'id.npy'))

    # The context manager guarantees the handle is released on any error.
    with open(os.path.join(path, 'big_class.txt'), 'r') as f:
        a = f.read()
    # SECURITY NOTE(review): eval() runs arbitrary code from big_class.txt.
    # Acceptable only for a trusted file; ast.literal_eval() would be the
    # safer choice if the content is a plain literal.
    big_class = eval(a)

    # Number of rows to index.
    trainNum = len(my_arr)
    # Default parameters for that many points.
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = my_arr

    # Build the hash tables.
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q = t.construct_query_pool()
    else:
        q = t.construct_query_object()
    return q
def build_lsh(self, all_signatures):
    """
    Build a FALCONN LSH table and its query object from song signatures.

    The signatures are centred IN PLACE on their per-dimension mean before
    indexing, so the caller's array is modified. The table is stored in
    ``self.table`` and the query object in ``self.query_object``.

    params:
        all_signatures: all signatures from the database (two-dimensional
            array)
    returns:
        None in every case; use ``self.table`` / ``self.query_object`` to
        reach the built objects.
    raises:
        ValueError: ``all_signatures`` has no rows.
    """
    signature_count = all_signatures.shape[0]
    if signature_count == 0:
        raise ValueError("All signatures must not be empty.")

    lsh_params = falconn.get_default_parameters(
        signature_count, all_signatures.shape[1])

    # center the dataset to improve performance:
    column_means = np.mean(all_signatures, axis=0)
    all_signatures -= column_means

    # Create the LSH table
    print('Constructing the LSH table...')
    index = falconn.LSHIndex(lsh_params)
    index.setup(all_signatures)

    print('Constructing the queries...')
    querier = index.construct_query_object()

    self.table = index
    self.query_object = querier

    if not (index and querier):
        return None
def getPara_forLsh(datasetShape):
    """Return FALCONN default parameters for a dataset of the given shape.

    Args:
        datasetShape: pair ``(num_points, dimension)``.

    Returns:
        The construction parameters, with ``distance_function`` forced to
        ``"euclidean_squared"``.
    """
    point_count, dimension = datasetShape
    lsh_params = falconn.get_default_parameters(point_count, dimension)
    # Plain (squared) Euclidean distance.
    lsh_params.distance_function = "euclidean_squared"
    return lsh_params
def test_lsh_index_negative():
    """Check that misuse of the index raises the expected exception types.

    Covers querying before fitting, fitting with unsupported inputs, invalid
    arguments to the search/configuration methods, and malformed query
    points for every query method.
    """
    n = 1000
    d = 128

    def expect_raises(exc_type, func, *args):
        # Passes only when ``func(*args)`` raises ``exc_type``.
        try:
            func(*args)
        except exc_type:
            return
        assert False

    index = falconn.LSHIndex(falconn.get_default_parameters(n, d))

    # Querying before any data was fitted.
    expect_raises(RuntimeError, index.find_nearest_neighbor,
                  np.random.randn(d))

    # Unsupported datasets: plain list, integer dtype, wrong rank.
    expect_raises(TypeError, index.fit, [[1.0, 2.0], [3.0, 4.0]])
    expect_raises(ValueError, index.fit,
                  np.random.randn(n, d).astype(np.int32))
    expect_raises(ValueError, index.fit, np.random.randn(10, 10, 10))

    # Both float widths are accepted; the float64 fit is the one in effect.
    index.fit(np.random.randn(n, d).astype(np.float32))
    index.fit(np.random.randn(n, d).astype(np.float64))

    u = np.random.randn(d).astype(np.float64)

    expect_raises(TypeError, index.find_k_nearest_neighbors, u, 0.5)
    expect_raises(ValueError, index.find_k_nearest_neighbors, u, -1)
    expect_raises(ValueError, index.find_near_neighbors, u, -1)
    expect_raises(TypeError, index.set_max_num_candidates, 0.5)
    expect_raises(ValueError, index.set_max_num_candidates, -10)

    # Exactly ``l`` probes is fine, one fewer is not.
    index.set_num_probes(index._params.l)
    expect_raises(ValueError, index.set_num_probes, index._params.l - 1)
    expect_raises(TypeError, index.set_num_probes, 1000.1)

    def check_check_query(f):
        # Wrong dtype, wrong container, wrong length, wrong rank.
        expect_raises(ValueError, f, u.astype(np.float32))
        expect_raises(TypeError, f, [0.0] * d)
        expect_raises(ValueError, f, u[:d - 1])
        expect_raises(ValueError, f, np.random.randn(d, d))

    query_calls = (
        lambda point: index.find_k_nearest_neighbors(point, 10),
        lambda point: index.find_near_neighbors(point, 0.5),
        lambda point: index.find_nearest_neighbor(point),
        lambda point: index.get_candidates_with_duplicates(point),
        lambda point: index.get_unique_candidates(point),
        lambda point: index.get_unique_sorted_candidates(point),
    )
    for query_call in query_calls:
        check_check_query(query_call)

    # A zero threshold is valid.
    index.find_near_neighbors(u, 0.0)
# Benchmark fragment: builds FALCONN indexes for several (l, k) settings and
# sweeps the number of probes. ``train``, ``qry``, ``time`` and
# ``process_time`` are expected to be defined earlier in the file; the query
# loop at the end continues beyond this excerpt.
import falconn
par = falconn.LSHConstructionParameters()  # NOTE(review): not used in the visible part
param = falconn.get_default_parameters(num_points = len(train), dimension = len(train[0]), distance = falconn.DistanceFunction.EuclideanSquared )
print(param.lsh_family, param.l, param.k)
tables = param.l
hashes = param.k
# Use 10% more tables than the default.
param.l = int(1.1*tables)
para = []
# Try the default number of hash functions and 1.5x that number.
for k in [hashes,int(hashes*1.5)]:
    param.k = k
    lsh = falconn.LSHIndex(param)
    lsh.setup(train)
    # The timers start after setup(), so they cover only the creation of the
    # query object.
    # NOTE(review): time.clock() was removed in Python 3.8 -- confirm the
    # interpreter version this script targets.
    startClock = time.clock()
    startTime = process_time()
    indexlsh = lsh.construct_query_object()
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.clock()
    constructionClock= endClock - startClock
    # Sweep the number of probes: l, 2*l and 3*l.
    for t in [param.l, int(param.l*2), int(param.l*3)]:
        indexlsh.set_num_probes(t)
        print('lsh-l'+str(param.l)+'k'+str(param.k)+'t'+str(t))
        rez = []
        for q in qry:
            startClock = time.clock()
            startTime = process_time()
# NOTE(review): ``y2`` and ``k`` are not defined at this point in the visible
# excerpt -- this line presumably belongs to code outside this view.
print(y2[:k])
import numpy as np
import falconn

# Interactive nearest-neighbour lookup over two concatenated output files.
if __name__ == '__main__':
    a1 = np.load('outputs_1.npy')
    a2 = np.load('outputs_2.npy')
    y = np.load('labels.npy')
    print(y.shape)
    # Stack both arrays row-wise into one dataset.
    a = np.r_[a1, a2]
    n, d = a.shape
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    dataset = a
    t.setup(dataset)
    Q = t.construct_query_object()
    # input
    i, k = 4545, 100
    print(i, k)
    # Read "<row index> <k>" pairs from stdin and query with that row.
    while (True):
        i, k = map(int, input().split())
        q = a[i:i + 1, :]
        # Collapse the one-row slice into a 1-D query vector.
        u = q.sum(axis=0)
        ans = Q.find_k_nearest_neighbors(u, k)
# print(dataBaseInitial) dataBase = np.array(dataBaseInitial.iloc[:, 1:dimension], dtype="float32") queryBase = queryBaseInitial.iloc[:, 1:dimension] # print(dataBase.shape) # params_cp = falconn.LSHConstructionParameters() # params_cp.dimension = len(dataBase[0]) # params_cp.lsh_family = falconn.LSHFamily.CrossPolytope # params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared # params_cp.l = 100 # params_cp.k = 100 # params_cp.num_setup_threads = 1 # params_cp.storage_hash_table = falconn.StorageHashTable.LinearProbingHashTable # params_cp.num_rotations = 2 params_cp = falconn.get_default_parameters(len(dataBase), len(dataBase[0])) falconn.compute_number_of_hash_functions(18, params_cp) table = falconn.LSHIndex(params_cp) table.setup(dataBase) query_object = table.construct_query_object() number_of_probes = params_cp.l query_object.set_num_probes(number_of_probes) print('FALCONN方案:') res = [] for k in range(10, 1010, 10): print('k={}'.format(k)) begin_time = time() for m in range(queryBase.shape[0]):
def test_lsh_index_negative():
    """Check that misuse of the index and its queriers raises as expected.

    Uses the module-level ``n`` and ``d``. First the index itself is abused
    (querier before setup, bad datasets, double setup); then, for each float
    width and for both the query object and the query pool, invalid arguments
    and malformed query points are rejected.
    """
    p = falconn.get_default_parameters(n, d)

    def expect_raises(exc_type, func, *args):
        # Passes only when ``func(*args)`` raises ``exc_type``.
        try:
            func(*args)
        except exc_type:
            return
        assert False

    def querier_before_setup():
        falconn.LSHIndex(p).construct_query_object()

    def setup_with(make_dataset):
        # A fresh index is created for every attempt.
        def attempt():
            index = falconn.LSHIndex(p)
            index.setup(make_dataset())
        return attempt

    def setup_twice():
        index = falconn.LSHIndex(p)
        index.setup(np.random.randn(n, d))
        index.setup(np.random.randn(n, d))

    expect_raises(RuntimeError, querier_before_setup)
    expect_raises(TypeError, setup_with(lambda: [[1.0, 2.0], [3.0, 4.0]]))
    expect_raises(
        TypeError,
        setup_with(lambda: np.random.randn(n, d).astype(np.int32)))
    expect_raises(ValueError, setup_with(lambda: np.random.randn(10, 10, 10)))
    expect_raises(RuntimeError, setup_twice)

    querier_factories = [
        lambda index: index.construct_query_object(),
        lambda index: index.construct_query_pool(),
    ]
    dtype_pairs = [(np.float32, np.float64), (np.float64, np.float32)]

    # ``data_dtype`` is used for the dataset and the valid query point,
    # ``other_dtype`` for the mismatching query point.
    for data_dtype, other_dtype in dtype_pairs:
        for make_querier in querier_factories:
            t = falconn.LSHIndex(p)
            t.setup(np.random.randn(n, d).astype(data_dtype))
            q = make_querier(t)
            u = np.random.randn(d).astype(data_dtype)

            expect_raises(TypeError, q.find_k_nearest_neighbors, u, 0.5)
            expect_raises(ValueError, q.find_k_nearest_neighbors, u, -1)
            expect_raises(ValueError, q.find_near_neighbors, u, -1)
            expect_raises(TypeError, q.set_max_num_candidates, 0.5)
            expect_raises(ValueError, q.set_max_num_candidates, -10)

            # Exactly ``l`` probes is fine, one fewer is not.
            q.set_num_probes(t._params.l)
            expect_raises(ValueError, q.set_num_probes, t._params.l - 1)
            expect_raises(TypeError, q.set_num_probes, 1000.1)

            def check_check_query(f):
                # Wrong dtype, wrong container, wrong length, wrong rank.
                expect_raises(TypeError, f, u.astype(other_dtype))
                expect_raises(TypeError, f, [0.0] * d)
                expect_raises(ValueError, f, u[:d - 1])
                expect_raises(ValueError, f, np.random.randn(d, d))

            query_calls = (
                lambda point: q.find_k_nearest_neighbors(point, 10),
                lambda point: q.find_near_neighbors(point, 0.5),
                lambda point: q.find_nearest_neighbor(point),
                lambda point: q.get_candidates_with_duplicates(point),
                lambda point: q.get_unique_candidates(point),
            )
            for query_call in query_calls:
                check_check_query(query_call)
def test_lsh_index_negative():
    """Check that misuse of the index raises the expected exception types.

    Covers querying before setup, setup with unsupported inputs, invalid
    arguments to the search/configuration methods, and malformed query
    points. A negative threshold for ``find_near_neighbors`` is called
    without expecting an error, and ``get_unique_sorted_candidates`` is
    deliberately not exercised.
    """
    n = 1000
    d = 128

    def expect_raises(exc_type, func, *args):
        # Passes only when ``func(*args)`` raises ``exc_type``.
        try:
            func(*args)
        except exc_type:
            return
        assert False

    index = falconn.LSHIndex(falconn.get_default_parameters(n, d))

    # Querying before any data was set up.
    expect_raises(RuntimeError, index.find_nearest_neighbor,
                  np.random.randn(d))

    # Unsupported datasets: plain list, integer dtype, wrong rank.
    expect_raises(TypeError, index.setup, [[1.0, 2.0], [3.0, 4.0]])
    expect_raises(ValueError, index.setup,
                  np.random.randn(n, d).astype(np.int32))
    expect_raises(ValueError, index.setup, np.random.randn(10, 10, 10))

    # Both float widths are accepted; the float64 setup is the one in effect.
    index.setup(np.random.randn(n, d).astype(np.float32))
    index.setup(np.random.randn(n, d).astype(np.float64))

    u = np.random.randn(d).astype(np.float64)

    expect_raises(TypeError, index.find_k_nearest_neighbors, u, 0.5)
    expect_raises(ValueError, index.find_k_nearest_neighbors, u, -1)
    # No exception is expected here.
    index.find_near_neighbors(u, -1)
    expect_raises(TypeError, index.set_max_num_candidates, 0.5)
    expect_raises(ValueError, index.set_max_num_candidates, -10)

    # Exactly ``l`` probes is fine, one fewer is not.
    index.set_num_probes(index._params.l)
    expect_raises(ValueError, index.set_num_probes, index._params.l - 1)
    expect_raises(TypeError, index.set_num_probes, 1000.1)

    def check_check_query(f):
        # Wrong dtype, wrong container, wrong length, wrong rank.
        expect_raises(ValueError, f, u.astype(np.float32))
        expect_raises(TypeError, f, [0.0] * d)
        expect_raises(ValueError, f, u[:d-1])
        expect_raises(ValueError, f, np.random.randn(d, d))

    query_calls = (
        lambda point: index.find_k_nearest_neighbors(point, 10),
        lambda point: index.find_near_neighbors(point, 0.5),
        lambda point: index.find_nearest_neighbor(point),
        lambda point: index.get_candidates_with_duplicates(point),
        lambda point: index.get_unique_candidates(point),
    )
    for query_call in query_calls:
        check_check_query(query_call)

    # A zero threshold is valid.
    index.find_near_neighbors(u, 0.0)