Example No. 1
	def loadHashmap(self, feature_size, result_n):
		# Create redis storage adapter
		redis_object = Redis(host='localhost', port=6379, db=0)
		redis_storage = RedisStorage(redis_object)
		try:
			# Get hash config from redis
			config = redis_storage.load_hash_configuration('test')
			# Config exists, create hash with None parameters
			lshash = RandomBinaryProjections(None, None)
			# Apply configuration loaded from redis
			lshash.apply_config(config)
			
		except Exception:
			# Config does not exist, create hash from scratch with 10 projections
			lshash = RandomBinaryProjections('test', 10)
			

		# Create engine for the feature space and use our hash. This sets the
		# dimension of the lshash only the first time, not when the configuration
		# was loaded from redis. Use redis storage to store buckets.
		nearest = NearestFilter(result_n)
		self.engine = Engine(feature_size, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

		# Do some stuff like indexing or querying with the engine...

		# Finally store hash configuration in redis for later use
		redis_storage.store_hash_configuration(lshash)
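A minimal standalone sketch of the "indexing or querying" step mentioned in the comment above; it mirrors the engine configuration from loadHashmap with illustrative values (192 dimensions, 1000 results) and uses in-memory storage instead of redis:

import numpy as np

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.filters import NearestFilter
from nearpy.distances import EuclideanDistance

lshash = RandomBinaryProjections('test', 10)
engine = Engine(192, lshashes=[lshash],
                vector_filters=[NearestFilter(1000)],
                distance=EuclideanDistance())

# Index some random 192-dimensional vectors
for i in range(1000):
    engine.store_vector(np.random.randn(192), 'data_%d' % i)

# Query: neighbours() returns a list of (vector, data, distance) tuples
results = engine.neighbours(np.random.randn(192))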
Example No. 2
def knn(data, k):
    assert k <= len(data) - 1, \
        'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if (dimension < 10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):

        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    # Note: N holds only the neighbours of the last query; dist and ind cover every point
    return N, dist, ind
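A hedged usage sketch on random data (shapes are arbitrary; only the second and third return values are used here):

import numpy as np

data = np.random.randn(200, 8)
_, dist, ind = knn(data, 5)
# ind[i] and dist[i] hold the indices and distances of the 5 nearest neighbours of data[i]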
Example No. 3
        def RunAnnNearpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            train, label = SplitTrainData(self.dataset)

            with totalTimer:
                # Get all the parameters.
                try:
                    # Perform Approximate Nearest-Neighbors
                    dimension = train.shape[1]
                    rbp = RandomBinaryProjections('rbp', 10)
                    engine = Engine(dimension, lshashes=[rbp])
                    for i in range(len(train)):
                        engine.store_vector(train[i], 'data_%d' % i)
                    for i in range(len(queryData)):
                        v = engine.neighbours(queryData[i])
                except Exception as e:
                    Log.Info(e)
                    q.put(e)
                    return -1
            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example No. 4
    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)
Example No. 5
def index_in_text_engine(nid_gen,
                         tfidf,
                         lsh_projections,
                         tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features,
                         lshashes=[lsh_projections],
                         distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            dense_row = tfidf[row_idx]
            array = dense_row
        else:
            sparse_row = tfidf.getrow(row_idx)
            dense_row = sparse_row.todense()
            array = dense_row.A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str((et - st)))
    return text_engine
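A hedged usage sketch with a toy sparse matrix standing in for a real TF-IDF matrix (document keys, sizes, and the hash name are hypothetical):

import numpy as np
import scipy.sparse as sp
from nearpy.hashes import RandomBinaryProjections

# Toy TF-IDF matrix: 3 documents x 5 terms, plus one key per document
tfidf = sp.csr_matrix(np.abs(np.random.randn(3, 5)))
doc_ids = ['doc_a', 'doc_b', 'doc_c']

engine = index_in_text_engine(doc_ids, tfidf, RandomBinaryProjections('text_rbp', 4))

# neighbours() returns (vector, key, cosine distance) tuples
hits = engine.neighbours(tfidf.getrow(0).todense().A[0])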
Example No. 6
def index_user_vectors():

    #print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    #print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen,
                         lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
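A hedged sketch of how the permuted index could be queried once index_user_vectors() has run (the function name and arguments are hypothetical; engine_perm is the global built above):

def similar_users(user_id, user_vector):
    # neighbours() returns (vector, data, cosine distance) tuples; data is the stored user id
    matches = engine_perm.neighbours(user_vector[user_id])
    return [data for _, data, _ in matches if data != user_id]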
Example No. 7
def load_engine(sdf_files, feature_matrix, dimension):
    """
    Function that converts the given sdf_files into instances of the sdf_class, then loads them into nearpy Engine.

    Parameters
        sdf_files: a list of sdf_files with their pathname from the current directory. Intended to be fed in from `find_sdf(root_dir)`
        feature_matrix: matrix of training data features to be loaded into engine
        dimension: dimensionality of the feature vectors used for LSH (here: number of cluster centers)

    Returns
        engine: instance of a nearpy engine with all of sdf_files loaded
    
    Sample Usage
        >>> engine = load_engine(sdf_files, feature_matrix, dimension)
    """
    #dimension here can be altered as well
    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dimension, lshashes=[rbp])

    count = 0
    for index, file_ in enumerate(sdf_files):
        #print file_
        if count % 100 == 0:
            print('Converted %d files' % count)
        converted = SDF(file_)
        converted.set_feature_vector(feature_matrix[index])
        converted.add_to_nearpy_engine(engine)
        count += 1
    return engine
Example No. 8
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'r') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = list(map(float, tmp_feature[item].split(',')))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print('PCA matrix size:', len(matrix))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension,
                             lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])
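A hedged sketch of a lookup method that could sit next to this constructor (the method name is hypothetical; it assumes face feature vectors were stored in self.engine elsewhere):

    def query(self, feature_vector):
        # Returns the labels of the nearest stored faces under cosine distance
        matches = self.engine.neighbours(np.array(feature_vector))
        return [label for _, label, _ in matches]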
Example No. 9
def LSH(Layers, K):

    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index all rows of the lsh_vectors matrix (set their data to a unique metadata string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]

        meta_data = str(index)+',' + str(int(video_data[index, 0])) + ', ' + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2])) \
                    + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])

        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)

    print('stop')
Example No. 10
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    #perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
Example No. 11
 def __init__(self, num_features, projection_count=30):
     self.num_features = num_features
     #self.rbp = RandomDiscretizedProjections('default', projection_count, bin_width=100)
     self.rbp = RandomBinaryProjections('default', projection_count)
     #self.rbp = RandomBinaryProjectionTree('default', projection_count, 1)
     self.text_engine = Engine(num_features,
                               lshashes=[self.rbp],
                               distance=CosineDistance())
Example No. 12
    def __init__(self, x):
        self.n, self.f = x.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', 10)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = x[i, :]
            self.engine.store_vector(v, i)
Example No. 13
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in range(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in range(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            reti = [y for x, y, z in
                    np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example No. 14
    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', b)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)
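A hedged sketch of a query method to pair with fit (the method name and the k parameter are hypothetical; self.engine comes from fit above):

    def query(self, v, k=10):
        # neighbours() returns (vector, data, distance) tuples; data is the row index stored in fit
        candidates = self.engine.neighbours(np.squeeze(np.copy(v)))
        return [data for _, data, _ in candidates][:k]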
Example No. 15
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    print("the number of rows:" + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_rows, lshashes=[rbp])
    for i in range(num_rows):
        print(i)

        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
Example No. 16
    def __init__(self, dimension, n_bit, alpha):

        self.n_bit = n_bit
        self.dim = dimension
        self.alpha = alpha

        self.sample_space = 2**n_bit

        self.rbp = RandomBinaryProjections('rbp', self.n_bit)
        self.engine = Engine(dimension, lshashes=[self.rbp])
Example No. 17
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        rbp = RandomBinaryProjections('rbp', 32)
        engine = Engine(f, lshashes=[rbp])

        for i in range(n):
            engine.store_vector(X[i], 'data_%d' % i)

        return engine
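A hedged usage sketch; the surrounding class is not shown, so `indexer` stands in for an instance of it:

import numpy as np

# `indexer` is a hypothetical instance of the class this method belongs to
X = np.random.randn(500, 64)
engine = indexer.build_index(X)

# neighbours() returns (vector, 'data_<i>', distance) tuples for the closest stored rows
hits = engine.neighbours(X[0])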
Example No. 18
 def __init__(self, hasher, number_of_tables=8, length_of_tables=32, bin_width=1.0, match_thresh=0.2):
     """
     :param hasher:
     @type hasher: Hasher
     """
     LSHIndex.__init__(self, hasher, match_thresh=match_thresh)
     self.setName(number_of_tables=number_of_tables, length_of_tables=length_of_tables, match_thresh=match_thresh, bin_width=bin_width)
     self.tables = [None]*number_of_tables
     for i in range(number_of_tables):
         self.tables[i] = RandomDiscretizedProjections(str(i), length_of_tables,  bin_width)
     self.engine = Engine(self.hasher.dims(), lshashes=self.tables, fetch_vector_filters=[NoVectorFilter()])
Example No. 19
    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)
Example No. 20
 def __init__(self, emb_path, feature='title'):
     self.emb_path = emb_path
     self.feature = feature
     self.data_df = None
     self.tfidf = Vectorizer(**get_tfidf_params())
     self.fasttext_embedder = None
     self.fasttext_tfidf = None
     self.dimension = 300
     rbp = RandomBinaryProjections('rbp', 2)
     self.engine = Engine(self.dimension, lshashes=[rbp])
     pass
Example No. 21
 def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
     self.data_points = data_points
     self.point_num = self.data_points.shape[0]
     self.dimension = self.data_points.shape[1] - 1
     # Create a random binary hash with num_vectors bits
     self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
     self.engine = Engine(
         self.dimension,
         lshashes=[self.rbp],
         vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
     for i in range(self.point_num):
         self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
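A hedged sketch of a retrieval method that could follow this constructor (the method name is hypothetical):

 def similar_points(self, idx):
     # Returns the ids of stored points whose cosine distance to point idx is below the threshold
     matches = self.engine.neighbours(self.data_points[idx, 1:])
     return [data for _, data, _ in matches if data != '%d' % idx]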
Example No. 22
    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_,
                              lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)
Example No. 23
 def fit(self, X, y=None, hash="randbinary"):
     X = np.array(X)
     assert len(X.shape) == 2, "X not 2-rank"
     dimension = X.shape[-1]
     if hash == "randbinary":
         rbp = RandomBinaryProjections('rbp', 10)
     elif hash == "pcabinary":
         rbp = PCABinaryProjections('rbp', 10, training_set=X)
     self.engine = Engine(dimension, lshashes=[rbp])
     index = 0
     for x in X:
         self.engine.store_vector(x, str(index))
         index += 1
Example No. 24
    def test_experiment_with_unibucket_1(self):
        dim = 50
        vector_count = 100
        vectors = numpy.random.randn(dim, vector_count)
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # Both recall and precision must be one in this case
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 1.0)
Example No. 25
File: util.py Project: DaMSL/ddc
def startEngine():
    archive = redis.StrictRedis(host='login-node03', port=6380)
    redis_storage = RedisStorage(archive)
    config = redis_storage.load_hash_configuration('pcahash')
    if not config:
        logging.error("LSHash not configured")
        sys.exit(0)
    #TODO: Gracefully exit
    # lshash = RandomBinaryProjections(None, None)
    lshash = PCABinaryProjections(None, None, None)
    lshash.apply_config(config)
    eng = Engine(num_pc * 454, lshashes=[lshash], storage=redis_storage)
    return eng
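A hedged usage sketch of querying the returned engine (num_pc is a module-level constant in the source project; the value below is only illustrative):

import numpy as np

num_pc = 3  # illustrative; must match the value used when the PCA hash was configured
eng = startEngine()
# neighbours() returns (vector, data, distance) tuples from the redis-backed buckets
nearest = eng.neighbours(np.random.randn(num_pc * 454))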
Example No. 26
def data_for_layer(basic_path, layer_name, num_folds, experiment,
                   projection_count, start_pc_component, end_pc_component):
    # Read datasets
    basic_path_layer = os.path.join(basic_path, layer_name)

    dataset_files = "ALOI_train_20400.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset_aloi = hd['dataset_1']
    dataset_train_aloi, dataset_test_aloi = split_data_to_test_train(
        dataset_aloi, num_folds, experiment)
    del dataset_aloi
    transformer = TransformImagesPCA(n_components=500)
    transformer.learn_pcs(dataset_train_aloi)
    del dataset_train_aloi

    dataset_files = "Google_train_6675.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset_google = hd['dataset_1']
    dataset_train_google, dataset_test_google = split_data_to_test_train(
        dataset_google, num_folds, experiment)
    del dataset_google
    transformer.learn_pcs(dataset_train_google)
    del dataset_train_google

    dataset_files = "Nexus_train_1180.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset = hd['dataset_1']
    dataset_train, dataset_test = split_data_to_test_train(
        dataset, num_folds, experiment)
    del dataset
    transformer.learn_pcs(dataset_train)
    del dataset_train

    pc_test_nexus = transformer.transform(
        dataset_test)[:, start_pc_component:end_pc_component]
    pc_test_aloi = transformer.transform(
        dataset_test_aloi)[:, start_pc_component:end_pc_component]
    pc_test_google = transformer.transform(
        dataset_test_google)[:, start_pc_component:end_pc_component]

    # Find the LSH vectors
    rbp = RandomBinaryProjections('rbp', projection_count, rand_seed=723657345)
    engine = Engine(end_pc_component - start_pc_component, lshashes=[rbp])

    pc_test_nexus = project_LSH(pc_test_nexus, rbp)
    pc_test_aloi = project_LSH(pc_test_aloi, rbp)
    pc_test_google = project_LSH(pc_test_google, rbp)
    return pc_test_nexus, pc_test_aloi, pc_test_google
Example No. 27
    def start(dataset, test_vector, num_nearest=5):

        # Create a random binary hash with 10 bits
        rbp = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        engine = Engine(dataset.shape[1], lshashes=[rbp], vector_filters=[NearestFilter(num_nearest)])

        # Index the dataset vectors (set their data to a unique string)
        for i, v in enumerate(dataset):
            engine.store_vector(v, 'data_%d' % i)

        # Get nearest neighbours
        N = engine.neighbours(test_vector)
Example No. 28
 def __init__(self, distanceMeasure="EuclideanDistance"):
     self.res_similar = ResnetSimilarity()
     dimension = 2048
     rbp = RandomBinaryProjections('rbp', 10)
     self.engine = Engine(dimension, lshashes=[rbp])
     if distanceMeasure == "EuclideanDistance":
         self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb')
     elif distanceMeasure == "Test":
         self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb')
     else:
         self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb')
     self.engine = pickle.load(self.filehandler)
     self.filehandler.close()
     print("Hash Table Loaded")
Example No. 29
 def get_engine(self, vocab, vecs):
     logging.info('{} hash functions'.format(self.args.projections))
     hashes = [
         PCABinaryProjections('ne1v', self.args.projections,
                              vecs[:1000, :].T)
     ]
     engine = Engine(vecs.shape[1],
                     lshashes=hashes,
                     distance=[],
                     vector_filters=[])
     for ind, vec in enumerate(vecs):
         if not ind % 100000:
             logging.info('{} words added to nearpy engine'.format(ind))
         engine.store_vector(vec, ind)
     return engine
Example No. 30
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
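A hedged sketch of a test method that could follow this setUp, assuming numpy is imported in the test module (random data; assertions kept deliberately weak):

    def test_query_both_engines(self):
        # Index 200-dimensional random vectors in both engines
        for i in range(100):
            v = numpy.random.randn(200)
            self.engine_perm.store_vector(v, 'data_%d' % i)
            self.engine.store_vector(v, 'data_%d' % i)

        # The permuted index must be built after the vectors are stored
        self.permutations.build_permuted_index()

        results_perm = self.engine_perm.neighbours(numpy.random.randn(200))
        results_plain = self.engine.neighbours(numpy.random.randn(200))
        self.assertIsInstance(results_perm, list)
        self.assertIsInstance(results_plain, list)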