Пример #1
    def LSHtable(self, file, euclidean=True, number_of_tables=50, hash_fx=18):
        """Create a FALCONN cross-polytope LSH index configured for ``file``.

        input:                  2-D numpy array
        output:                 LSH table

        :file:                  2-D numpy array of document vectors; only the
                                length of its first row is read here
        :euclidean:             truthy selects EuclideanSquared, falsy selects
                                NegativeInnerProduct (default=True)
        :number_of_tables:      number of hash tables (default=50)
        :hash_fx:               number of hash bits per table
                                (default=18, i.e. 2^18 bins per table)

        Fixed settings: num_rotations=1, seed=5721840, num_setup_threads=0,
        storage_hash_table=BitPackedFlatHashTable.

        NOTE(review): nothing in this method calls ``setup`` on the returned
        index, so it holds no data yet. Presumably the caller populates it --
        confirm against the call sites.
        """
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(file[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        # The two distance functions are alternatives; previously the second
        # assignment always overwrote the first, so `euclidean` had no effect.
        if euclidean:
            params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        else:
            params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        # Derive the number of hash functions from the requested bit count.
        falconn.compute_number_of_hash_functions(hash_fx, params_cp)
        # Construct the LSH table
        return falconn.LSHIndex(params_cp)
Пример #2
def search(query, number):
    """Look up the ``number`` approximate nearest neighbours of ``query``.

    The dataset is loaded from a hard-coded ``.npy`` file, indexed with a
    50-table cross-polytope LSH index, and queried with 3816 probes.

    :param query: query vector; its length has to match the dataset rows.
    :param number: how many neighbours to request.
    :return: whatever ``find_k_nearest_neighbors`` returns for the query.
    """
    dataset = np.load("/Users/liupengcheng/Downloads/final_data.npy")
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 50
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 18-bit hashes so that each table has
    # 2^18 bins; this is a good choice since 2^18 is of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(18, params_cp)
    table = falconn.LSHIndex(params_cp)
    # The index has to be populated before a query object can be created.
    table.setup(dataset)
    query_object = table.construct_query_object()
    # Previously this value was assigned but never handed to the query object.
    number_of_probes = 3816
    query_object.set_num_probes(number_of_probes)
    return query_object.find_k_nearest_neighbors(query, number)
Пример #3
    def __init__(self, dataset):
        """Index ``dataset`` with a FALCONN cross-polytope LSH table.

        Stores the construction parameters on ``self.params_cp``, the index on
        ``self.table``, the dataset on ``self.data`` and a ready-to-use query
        object on ``self.query_object``.

        :param dataset: 2-D array of vectors; ``len(dataset[0])`` is used as
            the vector dimension.
        """
        # we build only 50 tables, increasing this quantity will improve the query time
        # at a cost of slower preprocessing and larger memory footprint, feel free to
        # play with this number
        number_of_tables = 50

        params_cp = falconn.LSHConstructionParameters()

        params_cp.dimension = len(dataset[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        # we set one rotation, since the data is dense enough,
        # for sparse data set it to 2
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        self.params_cp = params_cp

        # we build 18-bit hashes so that each table has
        # 2^18 bins; this is a good choice since 2^18 is of the same
        # order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(18, params_cp)

        print('Constructing the LSH table')
        self.table = falconn.LSHIndex(params_cp)
        # Populate the index; a query object cannot be built from an empty one.
        self.table.setup(dataset)
        self.data = dataset
        self.query_object = self.table.construct_query_object()
Пример #4
    def __set_hierarchical_LSH_Index(self, cluster, number_of_tables, hash_bit):
        """Build the LSH query object used for hierarchical clustering.

        -- Read LSH for more information or README.2

        :param cluster: the set of vectors wished to be clustered.
        :param number_of_tables: the number of tables used in each nearest
            neighbor search.
        :param hash_bit: accepted for interface compatibility only; the value
            is replaced below by floor(log2(len(cluster))).

        The resulting query object is stored on ``self.hierarchical_LSH_Index``.
        """
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(cluster[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1  # Parameter associated with cross-polytope, see FALCONN for more
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # Size the hashes from the data: with b = floor(log2(n)) bits each
        # table has 2^b bins, the same order of magnitude as the n points.
        # NOTE(review): this discards the caller's `hash_bit` -- confirm that
        # is intended.
        hash_bit = math.floor(math.log(len(cluster), 2))
        falconn.compute_number_of_hash_functions(hash_bit, params_cp)

        print('Constructing the LSH Index For Cluster Combine Method.')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        # Populate the index; construct_query_object() needs a set-up index.
        table.setup(cluster)
        t2 = timeit.default_timer()
        print('Construction time: {}'.format(t2 - t1))

        self.hierarchical_LSH_Index = table.construct_query_object()
Пример #5
def make_tables(dataset,
    # NOTE(review): the remainder of the parameter list is missing from this
    # excerpt. The body reads `verbose`, `copy`, `num_tables`, `seed` and
    # `num_threads`, which presumably were parameters -- restore them from the
    # original source before using this function.
    # `p` prints progress to stderr when verbose, otherwise it is a no-op.
    p = partial(print, file=sys.stderr) if verbose else lambda *a, **kw: None
    # Per-row Euclidean norms, used to scale every row.
    norms = np.linalg.norm(dataset, axis=1)
    # NOTE(review): the two statements below look like the two arms of an
    # if/else (copying vs. in-place division); as written the rows are divided
    # by their norm twice when `copy` is truthy.
    if copy:
        dataset = dataset / norms[:, np.newaxis]
        dataset /= norms[:, np.newaxis]

    # Center the data on the mean row; the mean is returned to the caller.
    normed_mean = dataset.mean(axis=0)
    dataset -= normed_mean

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = dataset.shape[1]
    params_cp.lsh_family = 'cross_polytope'
    params_cp.distance_function = 'euclidean_squared'
    params_cp.l = num_tables
    params_cp.num_rotations = 1  # try 2, maybe
    # Draw a random seed when none was supplied.
    params_cp.seed = seed if seed is not None else np.random.randint(2**31)
    params_cp.num_setup_threads = num_threads
    params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
    # Number of hash bits = log2(number of rows), rounded to the nearest int.
    n_bits = int(np.round(np.log2(dataset.shape[0])))
    falconn.compute_number_of_hash_functions(n_bits, params_cp)

    p('Starting building table...', end='')
    # NOTE(review): no `table.setup(dataset)` call is visible here.
    table = falconn.LSHIndex(params_cp)

    return table, normed_mean
Пример #6
def test_number_of_hash_functions():
    """Check the values compute_number_of_hash_functions writes into the params.

    One parameter object is reused throughout; each step changes the family
    and/or dimension, recomputes, and checks ``k`` (plus
    ``last_cp_dimension`` for the cross-polytope family).
    """
    lsh_params = falconn.LSHConstructionParameters()

    def expect(bits, k, last_cp_dimension=None):
        # Recompute for `bits` hash bits and verify the derived fields.
        falconn.compute_number_of_hash_functions(bits, lsh_params)
        assert lsh_params.k == k
        if last_cp_dimension is not None:
            assert lsh_params.last_cp_dimension == last_cp_dimension

    # Dimension 10.
    lsh_params.lsh_family = 'hyperplane'
    lsh_params.dimension = 10
    expect(5, k=5)

    lsh_params.lsh_family = 'cross_polytope'
    expect(5, k=1, last_cp_dimension=16)

    # Dimension 100.
    lsh_params.dimension = 100
    lsh_params.lsh_family = 'hyperplane'
    expect(8, k=8)

    lsh_params.lsh_family = 'cross_polytope'
    expect(8, k=1, last_cp_dimension=128)

    # Still cross-polytope, dimension 100, with more bits.
    expect(10, k=2, last_cp_dimension=2)
Пример #7
    def _init_falconn(
        # NOTE(review): the parameter list is missing from this excerpt. The
        # body reads `self`, `dimension`, `nb_tables` and `number_bits`, which
        # presumably were parameters -- restore them from the original source.
        import falconn

        # The number of tables must be at least self._NEIGHBORS.
        assert nb_tables >= self._NEIGHBORS

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        # The index is created empty here; the query object is left unset.
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables
Пример #8
def falconn_table(sig_mat):
    '''Construct a falconn table configured for the given signature matrix.

    Keyword Argument:

    sig_mat -- A numpy ndarray, where each row is signature at a time
    window center

    Returns the falconn.LSHIndex. The seed is fixed (5721840) and is not
    returned.

    Side effect: when sig_mat is already float32 it is scaled and centered
    in place; otherwise a float32 copy is modified and the caller's array is
    left untouched.

    NOTE(review): nothing here calls setup() on the returned index --
    presumably the caller populates it; confirm.
    '''
    # pre-processing the signature matrix
    # coerce the ndarray into 32-bit floating number
    if sig_mat.dtype != np.float32:
        sig_mat = sig_mat.astype(np.float32)

    # Scale by the largest row norm (floored at 1e-6 to avoid dividing by
    # zero), so every row ends up with norm <= 1, then center the rows.
    # NOTE(review): this is a single global scale, not a per-row
    # normalisation onto the unit sphere -- confirm which one is wanted.
    largest_norm = np.linalg.norm(sig_mat, axis=1).max()
    sig_mat /= max(1e-6, float(largest_norm))
    center = np.mean(sig_mat, axis=0)
    sig_mat -= center

    # Instantiate the parameters for the falconn table
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(sig_mat[0])
    # Set the LSH family to be Cross Polytope
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    # Set the distance function to squared Euclidean distance
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    # Set the number of random rotation, since the signature is very likely
    # a large sparse matrix
    params_cp.num_rotations = 2
    # select the number of hash tables
    params_cp.l = 50
    params_cp.seed = 5721840
    # Set the thread usage (0 for using all) and storage formats of the
    # falconn table
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)

    # select the number of hash functions according the size
    # of the signature matrix
    num_obs = sig_mat.shape[0]
    bit_num = int(np.log2(num_obs))
    falconn.compute_number_of_hash_functions(bit_num, params_cp)

    # Construct falconn table with configured parameters
    falconn_tab = falconn.LSHIndex(params_cp)

    return falconn_tab
Пример #9
def hyperplane_hashing_params(dimensions):
    """Return LSH construction parameters for hyperplane hashing.

    :param dimensions: dimension of the vectors to be indexed.
    :return: a ``falconn.LSHConstructionParameters`` with k=19, l=10, squared
        Euclidean distance, a flat hash table and one setup thread.
    """
    hp = falconn.LSHConstructionParameters()
    settings = (
        ("dimension", dimensions),
        ("lsh_family", falconn.LSHFamily.Hyperplane),
        ("distance_function", falconn.DistanceFunction.EuclideanSquared),
        ("storage_hash_table", falconn.StorageHashTable.FlatHashTable),
        ("k", 19),
        ("l", 10),
        ("num_setup_threads", 1),
    )
    for field, value in settings:
        setattr(hp, field, value)
    return hp
Пример #10
def cross_polytope_hashing_params(dimensions):
    """Return LSH construction parameters for cross-polytope hashing.

    :param dimensions: dimension of the vectors to be indexed.
    :return: a ``falconn.LSHConstructionParameters`` with k=3, l=10,
        last_cp_dimension=16, three rotations, squared Euclidean distance,
        a flat hash table and one setup thread.
    """
    cp = falconn.LSHConstructionParameters()
    settings = (
        ("dimension", dimensions),
        ("lsh_family", falconn.LSHFamily.CrossPolytope),
        ("distance_function", falconn.DistanceFunction.EuclideanSquared),
        ("storage_hash_table", falconn.StorageHashTable.FlatHashTable),
        ("k", 3),
        ("l", 10),
        ("num_setup_threads", 1),
        ("last_cp_dimension", 16),
        ("num_rotations", 3),
    )
    for field, value in settings:
        setattr(cp, field, value)
    return cp
Пример #11
    def build_LSH_index(self):
        """Index ``self.vectorized_articles`` and keep a query object.

        Builds a 200-table cross-polytope index using the negative inner
        product as distance and 21 hash bits per table. The index is stored
        on ``self.table`` and its query object on ``self.query``.
        """
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = self.vectorized_articles.shape[1]
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
        params_cp.l = 200
        params_cp.num_rotations = 1
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        falconn.compute_number_of_hash_functions(21, params_cp)
        self.table = falconn.LSHIndex(params_cp)
        # Populate the index; a query object cannot be built from an empty one.
        self.table.setup(self.vectorized_articles)

        self.query = self.table.construct_query_object()
Пример #12
    def init_lsh(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.

        For every layer in ``self.layers`` the stored training activations are
        normalised and centered in place, indexed, and the resulting query
        object is stored in ``self.query_objects[layer]``. The per-layer mean
        that was subtracted is kept in ``self.centers[layer]``.
        """
        # contains the object that can be queried to find nearest neighbors at each layer.
        self.query_objects = {}
        # mean of training data representation per layer (that needs to be subtracted before LSH).
        self.centers = {}
        # NOTE(review): the source comment speaks of 18-bit hashes but the
        # call that applies them was missing. Use `self.number_bits` if the
        # class defines it, otherwise the documented 18 -- confirm.
        number_bits = getattr(self, 'number_bits', 18)
        for layer in self.layers:
            assert self.nb_tables >= self.neighbors

            # Normalize all the lengths, since we care about the cosine similarity.
            self.train_activations_lsh[layer] /= np.linalg.norm(
                self.train_activations_lsh[layer], axis=1).reshape(-1, 1)

            # Center the dataset and the queries: this improves the performance of LSH quite a bit.
            center = np.mean(self.train_activations_lsh[layer], axis=0)
            self.train_activations_lsh[layer] -= center
            self.centers[layer] = center

            # LSH parameters
            params_cp = falconn.LSHConstructionParameters()
            # Read the width from the array shape so a single-row dataset
            # does not fail on a row index of 1.
            params_cp.dimension = self.train_activations_lsh[layer].shape[1]
            params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
            params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
            params_cp.l = self.nb_tables
            params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
            params_cp.seed = 5721840
            # we want to use all the available threads to set up
            params_cp.num_setup_threads = 0
            params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

            # we build number_bits-bit hashes so that each table has
            # 2^number_bits bins; a good choice is the same order of
            # magnitude as the number of data points
            falconn.compute_number_of_hash_functions(number_bits, params_cp)

            print('Constructing the LSH table')
            table = falconn.LSHIndex(params_cp)
            # Populate the index with this layer's training activations.
            table.setup(self.train_activations_lsh[layer])

            # Keep the query object used later to find k nearest neighbors.
            query_object = table.construct_query_object()
            self.query_objects[layer] = query_object
Пример #13
def setup_lsh(X, num_probes=100):
    """Index the rows of ``X`` and return a FALCONN query object.

    :param X: 2-D array of vectors to index (asserted to be two-dimensional).
    :param num_probes: number of probes the returned query object will use
        (default 100).
    :return: query object of a 100-table cross-polytope index over ``X``.
    """
    assert X.ndim == 2
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = X.shape[1]
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 100
    params_cp.num_rotations = 1
    params_cp.seed = 1234
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(16, params_cp)

    table = falconn.LSHIndex(params_cp)
    # Populate the index; a query object cannot be built from an empty one.
    table.setup(X)
    query_object = table.construct_query_object()
    # Honour the `num_probes` argument, which was previously ignored.
    query_object.set_num_probes(num_probes)

    return query_object
Пример #14
def _create_bucket(segments):
    """ Creates a bucket of segments
    to use for LSH similarity lookup

    Returns the tuple ``(segments, table)`` where ``table`` is a 25-table
    cross-polytope ``falconn.LSHIndex`` whose dimension is ``len(segments[0])``.

    NOTE(review): nothing here calls ``table.setup(segments)``, so the
    returned index holds no data yet -- presumably the caller populates it;
    confirm.
    """
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(segments[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 25
    params_cp.num_rotations = 2
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, params_cp)

    table = falconn.LSHIndex(params_cp)

    return (segments, table)
Пример #15
 def fit(self, X):
     """Preprocess ``X`` and index it with a cross-polytope LSH table.

     ``X`` is converted to float32 if needed, row-normalised when the metric
     is 'angular', and centered on its mean (kept in ``self._center``). The
     index is stored on ``self._index`` and a float32 scratch vector of
     length ``X.shape[1]`` on ``self._buf``.

     Note: when ``X`` is already float32 the normalisation and centering
     modify the caller's array in place.
     """
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     if self._metric == 'angular':
         X /= numpy.linalg.norm(X, axis=1).reshape(-1,  1)
     self._center = numpy.mean(X, axis=0)
     X -= self._center
     import falconn
     self._params = falconn.LSHConstructionParameters()
     self._params.dimension = X.shape[1]
     self._params.distance_function = 'euclidean_squared'
     self._params.lsh_family = 'cross_polytope'
     falconn.compute_number_of_hash_functions(self._num_bits, self._params)
     self._params.l = self._num_tables
     self._params.num_rotations = 1
     self._params.num_setup_threads = 0
     self._params.storage_hash_table = 'flat_hash_table'
     self._params.seed = 95225714
     self._index = falconn.LSHIndex(self._params)
     # Hand the preprocessed data to the index. X is local to this method,
     # so this is the only place the index can be populated with it.
     self._index.setup(X)
     self._buf = numpy.zeros((X.shape[1],), dtype=numpy.float32)
Пример #16
    def setup_second_layer(self, number_of_tables=50):
        """Index ``self.X_`` and store a query object on ``self.query_object``.

        The index dimension is ``self.X.shape[1] + 1``; ``self.X_`` is
        converted to float before it is indexed.
        NOTE(review): this assumes ``self.X_`` has one more column than
        ``self.X`` -- confirm where ``X_`` is built.

        :param number_of_tables: number of hash tables (default 50).
        """
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = self.X.shape[1] + 1
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(15, params_cp)

        print('Constructing the LSH table')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        self.X_ = self.X_.astype('float')
        # Populate the index with the converted data before querying it.
        table.setup(self.X_)
        t2 = timeit.default_timer()
        print('Construction time: {}'.format(t2 - t1))

        self.query_object = table.construct_query_object()
Пример #17
    def __falconn_fit(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data.

        Reads ``nb_tables`` and ``number_bits`` from ``self.kwargs``, centers
        ``self.features`` in place (the mean is kept in ``self.center``) and
        adds the centered features to the index stored on
        ``self._falconn_table``. The query object is left as ``None``.
        """
        import falconn

        dimension = self.features.shape[1]
        nb_tables = self.kwargs['nb_tables']
        number_bits = self.kwargs['number_bits']

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = dimension
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = nb_tables
        params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # we build number_bits-bit hashes so that each table has
        # 2^number_bits bins; a rule of thumb is to have the number
        # of bins be the same order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(number_bits, params_cp)
        self._falconn_table = falconn.LSHIndex(params_cp)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # add features to falconn table
        self._falconn_table.setup(self.features)
Пример #18
 def fit(self, X):
     """Preprocess ``X``, index it, and keep a query object.

     ``X`` is converted to float32 if needed. For the 'hamming' metric every
     entry below 0.5 is replaced by -1; for 'angular' and 'hamming' the rows
     are normalised to unit length. The data is then centered on its mean
     (kept in ``self._center``) and indexed. The index is stored on
     ``self._index`` and its query object on ``self._query_object``.

     Note: when ``X`` is already float32 these steps modify the caller's
     array in place.
     """
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     if self._metric == 'hamming':
         # replace all zeroes by -1
         X[X < 0.5] = -1
     if self._metric == 'angular' or self._metric == 'hamming':
         X /= numpy.linalg.norm(X, axis=1).reshape(-1, 1)
     self._center = numpy.mean(X, axis=0)
     X -= self._center
     self._params = falconn.LSHConstructionParameters()
     self._params.dimension = X.shape[1]
     self._params.distance_function = falconn.DistanceFunction.EuclideanSquared
     self._params.lsh_family = falconn.LSHFamily.CrossPolytope
     falconn.compute_number_of_hash_functions(self._num_bits, self._params)
     self._params.l = self._num_tables
     self._params.num_rotations = 1
     self._params.num_setup_threads = 0
     self._params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
     self._params.seed = 95225714
     self._index = falconn.LSHIndex(self._params)
     # Populate the index; a query object cannot be built from an empty one.
     self._index.setup(X)
     self._query_object = self._index.construct_query_object()
Пример #19
def generate_candidate_threshold(entity_embedding=None, data_ids="OpenEA", path="", threshold=0.2, output_path=False,
                                 entity_file="ent_embeds,npy", normalize=True, metric="euclidean", lsh_family="hyperplane", number_of_tables=500):
    """Generate candidate entity pairs with an LSH near-neighbour search.

    The left-hand vectors of all test pairs are indexed; each right-hand
    vector is then used as a query, and every left entity found within
    ``threshold`` becomes a candidate (labelled 1 if it is the true partner,
    0 otherwise). Recall and candidate statistics are printed. Nothing is
    returned.

    :param entity_embedding: embedding matrix indexed by entity id; loaded
        from ``path + entity_file`` when None.
    :param data_ids: "OpenEA" or "dbp15k" to read the test pairs from files
        under ``path``, or an explicit sequence of (left_id, right_id) pairs.
    :param path: directory prefix for all files that are read.
    :param threshold: distance threshold passed to ``find_near_neighbors``.
    :param output_path: when equal to ``True``, the candidates are written to
        a ``topk_<threshold>_name_ngram`` file next to ``path``.
    :param entity_file: file name of the embedding matrix.
        NOTE(review): the default contains a comma ("ent_embeds,npy"), which
        looks like a typo for ".npy" -- left unchanged; confirm.
    :param normalize: L2-normalise both vector sets before indexing.
    :param metric: 1. "inner": inner product of the vectors;
        2. "euclidean": Euclidean distance (after L2 normalisation it is
        proportional to the cosine distance).
    :param lsh_family: "crosspolytope" or "hyperplane".
    :param number_of_tables: number of hash tables; also used as the number
        of probes.
    """
    if entity_embedding is None:
        entity_file_path = path + entity_file
        entity_embedding = np.load(entity_file_path)
        print("Load [%s] successfully!" % (entity_file_path))

    # Compare by value: `is` on string literals tests identity and only works
    # by accident of interning. The isinstance guard keeps array/list inputs
    # away from an element-wise comparison.
    if isinstance(data_ids, str):
        if data_ids == "OpenEA":
            ent2id1, id2ent1, max_id = read_ent_id(path + "kg1_ent_ids")
            ent2id2, id2ent2, max_id = read_ent_id(path + "kg2_ent_ids")
            # The test links live in a sibling "datasets" tree whose location
            # is derived from fixed components of `path`.
            paths = path.split('/')
            test_path = "/".join([paths[1], paths[2], paths[3], "datasets", paths[7], paths[8], paths[9]])
            test_ids = []
            with open('/' + test_path + r"/test_links", 'r', encoding='utf-8') as f:
                for line in f:
                    items = line.strip().split("\t")
                    id1, id2 = int(ent2id1[items[0]]), int(ent2id2[items[1]])
                    test_ids.append([id1, id2])
            data_ids = test_ids
        elif data_ids == "dbp15k":
            # Only matches on the test set are considered.
            data_ids = read_ids(path + "ref_ent_ids")

    data_ids = np.array(data_ids).astype(int)
    entity_embedding = entity_embedding.astype(np.float32)
    if metric == "euclidean":
        entity_embedding -= np.mean(entity_embedding, axis=0)

    Lvec = entity_embedding[data_ids[:, 0]]
    Rvec = entity_embedding[data_ids[:, 1]]
    if os.path.exists(path + "mapping_mat.npy"):
        # Final vectors after the OpenEA model's mapping transformation.
        mapping = np.load(path + "mapping_mat.npy")
        Lvec = np.matmul(Lvec, mapping)

    if normalize:
        Lvec = preprocessing.normalize(Lvec, norm="l2", axis=1)
        Rvec = preprocessing.normalize(Rvec, norm="l2", axis=1)

    # The indexed data and the queries must share one dtype; the optional
    # mapping or the normalisation may have changed it on one side only.
    Lvec = np.ascontiguousarray(Lvec, dtype=np.float32)
    Rvec = np.ascontiguousarray(Rvec, dtype=np.float32)

    seed = 119417657
    L_True = data_ids[:, 0].tolist()
    print("shape:", entity_embedding.shape)
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = entity_embedding.shape[1]
    if lsh_family == "crosspolytope":
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    elif lsh_family == "hyperplane":
        params_cp.lsh_family = falconn.LSHFamily.Hyperplane
    if metric == "euclidean":
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    elif metric == "inner":
        params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct

    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = seed
    # two threads are used to set up the index
    params_cp.num_setup_threads = 2
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 20-bit hashes so that each table has 2^20 bins
    falconn.compute_number_of_hash_functions(20, params_cp)
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    # Index the left-hand vectors: result indices are looked up in L_True.
    table.setup(Lvec)
    t2 = timeit.default_timer()

    print('Construction time: {}'.format(t2 - t1))
    query_object = table.construct_query_object()
    number_of_probes = number_of_tables
    print('Choosing number of probes: ', number_of_probes)
    # Apply the announced probe count (it was previously only printed).
    query_object.set_num_probes(number_of_probes)
    t1 = timeit.default_timer()
    true_cnt = 0
    total = 0
    true_all = data_ids.shape[0]
    node_pairs = []
    print("Metric:", metric, "Threshold:", threshold)
    for ids_index, pair in enumerate(data_ids):
        ans = query_object.find_near_neighbors(Rvec[ids_index], threshold=threshold)
        for neighbor in ans:
            candidate = L_True[neighbor]
            # Each candidate gets exactly one label: 1 for the true partner,
            # 0 for any other near neighbour.
            if pair[0] == candidate:
                true_cnt += 1
                node_pairs.append((pair[0], pair[1], 1))
            else:
                node_pairs.append((candidate, pair[1], 0))
        total += len(ans)
    print('Threshold:[%f] True cnt:[%d] Generate All cnt:[%d] Total:[%d] Recall:[%f] P/E ratio:[%f] Metric:[%s]'
          % (threshold, true_cnt, total, true_all, true_cnt/true_all, total/true_all, metric))

    t2 = timeit.default_timer()
    print('Generate Candidate time: {}'.format(t2 - t1))
    if output_path == True:
        out_file = "/".join(path.split('/')[:-1]) + '/topk_' + str(threshold) + '_name_ngram'
        print('output path:', out_file)
        with open(out_file, 'w', encoding='utf8') as f:
            # The ids are integers, so format them instead of concatenating
            # them with strings.
            f.writelines('%s\t%s\t%s\n' % triple for triple in node_pairs)
Пример #20
def lsh_for_ccd(dataset: np.array, queries: list, methoddict: dict,
                lastIndexBefore: int):
    # Runs an LSH near-neighbour search for every vector in `queries` and
    # reports clone candidates through getCloneResult / getCloneTuple.
    # `methoddict` maps the md5 of a vector's string form to a metadata string
    # (format shown in the comment inside the loop).
    # NOTE(review): endTheta, optTheta, minbeta, cloneTheta, lineFilter,
    # betaMain, getOptDist, getCloneTuple and getCloneResult are not defined
    # in this excerpt -- presumably module-level names.
    # NOTE(review): several statements below are truncated in this excerpt
    # (marked individually); restore them from the original source.
    number_of_tables = 10
    # queries = dataset

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    # NOTE(review): no `table.setup(dataset)` call is visible before the
    # query object is created.
    table = falconn.LSHIndex(params_cp)
    t2 = timeit.default_timer()
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    methodfilterset = set()

    # currentIter starts at lastIndexBefore and advances once per query; only
    # neighbours with an index above it are examined further down.
    currentIter = lastIndexBefore
    totalIter = len(dataset)

    for query in queries:
        neighbors = query_object.find_near_neighbors(query, threshold=endTheta)
        for neighbor in neighbors:
            # Keys into methoddict are md5 digests of the vector's list form.
            queryMdKey = hashlib.md5(str(query.tolist()).encode()).hexdigest()
            # NOTE(review): the argument of this md5 call is truncated.
            neighborMdKey = hashlib.md5(
            # 13ccdCodeLineSeparate112443321234ccdTokenSeparate/home/xxx/xx.java,3,15ccdFileKeySeparate/home/xxx/xx.java,31,45
            left = str(methoddict[queryMdKey])
            right = str(methoddict[neighborMdKey])
            # Split "<lines>ccdCodeLineSeparate<tokens>ccdTokenSeparate<methods>".
            ccdLeft = left.split("ccdCodeLineSeparate")
            ccdRight = right.split("ccdCodeLineSeparate")
            methodsLeftLine = ccdLeft[0]
            methodsRightLine = ccdRight[0]
            ccdTokenLeft = ccdLeft[1].split('ccdTokenSeparate')
            ccdTokenRight = ccdRight[1].split('ccdTokenSeparate')
            methodsLeft = ccdTokenLeft[1]
            methodsRight = ccdTokenRight[1]
            methodsLeftToken = ccdTokenLeft[0]
            methodsRightToken = ccdTokenRight[0]

            # Same key on both sides: the entry may list several methods
            # separated by "ccdFileKeySeparate".
            if queryMdKey == neighborMdKey:
                tmpStr = methodsLeft
                if "ccdFileKeySeparate" in tmpStr:
                    tmpArr = tmpStr.split("ccdFileKeySeparate")
                    # NOTE(review): an `else:` between the two-element case and
                    # the pairwise loop appears to be missing, and the last
                    # getCloneTuple call is truncated.
                    if len(tmpArr) == 2:
                        result = getCloneTuple(tmpArr[0] + "," + tmpArr[1])
                        for i in range(0, len(tmpArr)):
                            for j in range(i + 1, len(tmpArr)):
                                result = getCloneTuple(tmpArr[i] + "," +

            if neighbor > currentIter:
                if not lineFilter(int(methodsLeftLine), int(methodsRightLine)):
                    # Squared Euclidean distance between query and neighbour.
                    dist = np.linalg.norm(query - dataset[neighbor])
                    dist *= dist
                    # NOTE(review): the branch structure below is incomplete
                    # (the body of `if beta <= minbeta:` is missing).
                    if dist <= optTheta:
                        getCloneResult(methodsLeft, methodsRight)
                        beta = betaMain(methodsLeftToken, methodsRightToken)
                        if beta <= minbeta:
                        dist = getOptDist(beta, dist)
                        if dist < cloneTheta:
                            getCloneResult(methodsLeft, methodsRight)
        currentIter = currentIter + 1
        # print("%d / %d \r" % (currentIter, totalIter))
Пример #21
        # NOTE(review): this is the interior of a function whose header is not
        # part of this excerpt; Sset, allDenseVector_HSV_red, dataset_blue,
        # number_of_tables and the blue/green QueryIndexWithVector lists come
        # from code that is not shown.
        # Pair every index in Sset with its dense red-channel vector.
        QueryIndexWithVector_red = [[int(i), allDenseVector_HSV_red[int(i)]]
                                    for i in Sset]
        # Keep only the vectors (second element of each pair) as queries.
        queries_blue = [x[1] for x in QueryIndexWithVector_blue]
        queries_green = [x[1] for x in QueryIndexWithVector_green]
        queries_red = [x[1] for x in QueryIndexWithVector_red]

        # print('Centering the dataset and queries')
        # center = np.mean(dataset, axis=0)
        # dataset -= center
        # queries -= center
        # print('Done')
        #assert dataset.dtype == np.float32
        number_of_probes = [900]

        # LSH parameters for the blue channel.
        params_cp_blue = falconn.LSHConstructionParameters()
        params_cp_blue.dimension = len(dataset_blue[0])
        params_cp_blue.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp_blue.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp_blue.l = number_of_tables
        params_cp_blue.num_rotations = 1
        params_cp_blue.seed = 666666
        params_cp_blue.num_setup_threads = 1
        params_cp_blue.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        # 20 hash bits per table.
        falconn.compute_number_of_hash_functions(20, params_cp_blue)

        print('Constructing the LSH table')
        t1 = timeit.default_timer()
        # NOTE(review): no `table_blue.setup(dataset_blue)` call is visible
        # inside the timed section.
        table_blue = falconn.LSHIndex(params_cp_blue)
        t2 = timeit.default_timer()
Пример #22
 def get_params():
     """Return a newly constructed ``fa.LSHConstructionParameters`` object."""
     construction_params = fa.LSHConstructionParameters()
     return construction_params
Пример #23
    def set_clustering_LSH_Index(self, number_of_queries, query_accuracy, number_of_tables, hash_bit):
        """Build the clustering LSH index and store its query object.

        -- Read LSH for more information or README.2

        The last ``number_of_queries`` rows of ``self.w2v_vectors`` are held
        out as queries and the remaining rows are indexed. Both parts are
        centered in place on the mean of the indexed rows. The probe count is
        chosen by ``self.probeGenerator``; query time and precision against a
        linear scan are printed. The query object ends up on
        ``self.query_object``.

        :param number_of_queries: the number of queries used to determine the
            number of probes.
        :param query_accuracy: specifies the level of accuracy of the index;
            setting it to 1 degenerates the LSH index into linear search.
        :param number_of_tables: the number of hash tables used for a given
            nearest neighbor search.
        :param hash_bit: used to determine the number of hash functions.
        """
        print("Setting Clustering Index")

        split = len(self.w2v_vectors) - number_of_queries
        queries = self.w2v_vectors[split:]
        w2v_vectors = self.w2v_vectors[:split]

        # Center both parts on the mean of the indexed vectors.
        center = np.mean(w2v_vectors, axis=0)
        w2v_vectors -= center
        queries -= center

        # perform linear scan to return correct answers
        answers = self.linearScan_answerGenerator(w2v_vectors, queries)

        # Starting point handed to the probe search below.
        print('Choosing number of probes')
        init_number_of_probes = 600

        # Parameters -----
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(w2v_vectors[0])  # = 50 for Glove6B.50d
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(hash_bit, params_cp)
        # END ------

        # Constructing LSH Index -----
        print('Constructing the LSH Index')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        # Populate the index; a query object cannot be built from an empty one.
        table.setup(w2v_vectors)
        t2 = timeit.default_timer()
        print('Construction time: {}'.format(t2 - t1))
        query_object = table.construct_query_object()
        number_of_probes = self.probeGenerator(query_accuracy, init_number_of_probes, query_object, answers, queries, number_of_tables)

        # Performance Statistics
        t1 = timeit.default_timer()
        score = 0
        for (i, query) in enumerate(queries):
            if query_object.find_nearest_neighbor(query) == answers[i]:
                score += 1
        t2 = timeit.default_timer()
        print('Query time: {}'.format((t2 - t1) / len(queries)))
        print('Precision: {}'.format(float(score) / len(queries)))
        self.query_object = query_object
        print("Vectors Successfully Hashed. Clustering LSH Index Created")
Пример #24
# Author : fcbruce <*****@*****.**>
# Time : Sat 06 May 2017 17:10:14

import numpy as np
import falconn as fa

# Smoke test: index 50000 random unit vectors of dimension 500 and look up
# the nearest neighbour of one of them.
a = np.random.randn(50000, 500)
a /= np.linalg.norm(a, axis=1).reshape(-1, 1)

# print() with a single argument behaves the same on Python 2 and Python 3.
print("pending...")
params_cp = fa.LSHConstructionParameters()
params_cp.dimension = 500
params_cp.lsh_family = 'cross_polytope'
params_cp.distance_function = 'euclidean_squared'
params_cp.l = 7
params_cp.num_rotations = 1
params_cp.seed = 11111
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
fa.compute_number_of_hash_functions(18, params_cp)

table = fa.LSHIndex(params_cp)
# The index has to be populated before it can answer queries.
table.setup(a)

print("find")
# NOTE(review): querying the index object directly presumably targets an
# older FALCONN API; newer releases query through
# table.construct_query_object() -- confirm the installed version.
print(table.find_nearest_neighbor(a[1]))
Пример #25
# Ground truth: for each query, the data point with the largest inner product.
# `queries`, `data`, `num_queries`, `d` and `seed` come from earlier parts of
# the script that are not shown here.
print('Computing true nearest neighbors via a linear scan ...')
true_nns = []
average_scan_time = 0.0
for query in queries:
  start = timeit.default_timer()
  best_index = np.argmax(np.dot(data, query))
  stop = timeit.default_timer()
  # Record the answer; previously true_nns was never filled. The append sits
  # outside the timed section so it does not affect the measurement.
  true_nns.append(best_index)
  average_scan_time += (stop - start)
average_scan_time /= num_queries
print('Average query time: {} seconds'.format(average_scan_time))

# Hyperplane hashing
params_hp = falconn.LSHConstructionParameters()
params_hp.dimension = d
params_hp.lsh_family = 'hyperplane'
params_hp.distance_function = 'negative_inner_product'
params_hp.k = 19
params_hp.l = 10
# Derive a family-specific seed from the script's base seed.
params_hp.seed = seed ^ 833840234

print('Hyperplane hash\n')

# NOTE(review): only the constructor is timed and no `hp_table.setup(data)`
# is visible in this excerpt -- confirm where the index is populated.
start = timeit.default_timer()
hp_table = falconn.LSHIndex(params_hp)
stop = timeit.default_timer()
hp_construction_time = stop - start
# In[ ]:

# Get all nearest neighbors for all the datapoint
    'timeit', 'index.knnQueryBatch(dataset, k=5, num_threads=16)')
neighbors = index.knnQueryBatch(dataset, k=5, num_threads=16)

# ### Falconn

# In[ ]:

import falconn

# In[ ]:

# Configure a single-table cross-polytope index with squared Euclidean
# distance. `num_dimensions` and `dataset` are defined in earlier cells.
parameters = falconn.LSHConstructionParameters()
num_tables = 1
parameters.l = num_tables
parameters.dimension = num_dimensions
parameters.distance_function = falconn.DistanceFunction.EuclideanSquared
parameters.lsh_family = falconn.LSHFamily.CrossPolytope
parameters.num_rotations = 1
parameters.num_setup_threads = 1
parameters.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# 16 hash bits per table.
falconn.compute_number_of_hash_functions(16, parameters)

# Rebinds `index` (used for another index in an earlier cell) to the
# FALCONN index.
index = falconn.LSHIndex(parameters)
# Populate the index under IPython's %time magic to measure setup time.
get_ipython().run_line_magic('time', 'index.setup(dataset)')

query_object = index.construct_query_object()
num_probes = 1