def loadHashmap(self, feature_size=129, result_n=1000):
    """Create a Redis-backed NearPy engine and store it on ``self.engine``.

    feature_size: dimensionality of the hashed feature space.
    result_n: maximum number of nearest neighbours returned per query.
    """
    # Create redis storage adapter (assumes a local Redis on the default port).
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)

    # Get hash config from redis; load_hash_configuration returns None when
    # nothing was stored yet. (Previously a bare `except:` was used to detect
    # this, which also silently swallowed connection errors and real bugs.)
    config = redis_storage.load_hash_configuration('test')
    if config is None:
        # Config does not exist: create the hash from scratch. The projection
        # count here is 0; the Engine sets the hash dimension on first use.
        lshash = RandomBinaryProjections('test', 0)
    else:
        # Config exists: create an empty hash and apply the saved parameters.
        lshash = RandomBinaryProjections(None, None)
        lshash.apply_config(config)

    # Create engine for the feature space and use our hash. This will set the
    # dimension of the lshash only the first time, not when using the
    # configuration loaded from redis. Use redis storage to store buckets.
    nearest = NearestFilter(result_n)
    self.engine = Engine(feature_size, lshashes=[lshash],
                         vector_filters=[nearest], storage=redis_storage,
                         distance=EuclideanDistance())

    # Finally store hash configuration in redis for later use.
    redis_storage.store_hash_configuration(lshash)
class RedisStorageTest(StorageTest):
    """Runs the shared StorageTest checks against a Redis-backed storage."""

    def setUp(self):
        self.storage = RedisStorage(Redis())
        super(RedisStorageTest, self).setUp()

    def test_store_vector(self):
        # Dense-vector round trip through the shared check.
        vec = numpy.random.randn(100, 1).ravel()
        self.check_store_vector(vec)

    def test_store_sparse_vector(self):
        # Sparse-vector round trip through the shared check.
        sparse_vec = scipy.sparse.rand(100, 1, density=0.1)
        self.check_store_vector(sparse_vec)

    def test_get_all_bucket_keys(self):
        self.check_get_all_bucket_keys()

    def test_delete_vector(self):
        self.check_delete_vector(numpy.ones(100))

    def test_store_zero(self):
        # Payload data of 0 (falsy) must survive storage unchanged.
        vec = numpy.ones(100)
        hash_name, bucket_name = "tastHash", "testBucket"
        self.storage.store_vector(hash_name, bucket_name, vec, 0)
        bucket = self.storage.get_bucket(hash_name, bucket_name)
        _, stored_data = bucket[0]
        self.assertEqual(stored_data, 0)
class TestStorage(unittest.TestCase):
    """Round-trip tests for the memory and Redis storage back-ends."""

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def _check_roundtrip(self, storage):
        # Store one vector with payload data, read it back and compare
        # field by field, then verify cleaning empties the bucket.
        vec = numpy.random.randn(100, 1)
        bucket_key = '23749283743928748'
        vec_data = ['one', 'two', 'three']
        storage.store_vector('testHash', bucket_key, vec, vec_data)
        bucket = storage.get_bucket('testHash', bucket_key)
        self.assertEqual(len(bucket), 1)
        stored_vec, stored_data = bucket[0]
        self.assertEqual(len(stored_vec), len(vec))
        self.assertEqual(type(vec), type(stored_vec))
        for k in range(100):
            self.assertEqual(stored_vec[k], vec[k])
        self.assertEqual(type(stored_data), type(vec_data))
        self.assertEqual(len(stored_data), len(vec_data))
        for k in range(3):
            self.assertEqual(stored_data[k], vec_data[k])
        storage.clean_all_buckets()
        self.assertEqual(storage.get_bucket('testHash', bucket_key), [])

    def test_memory_storage(self):
        self._check_roundtrip(self.memory)

    def test_redis_storage(self):
        # Start from a clean database so leftovers cannot affect the test.
        self.redis_storage.clean_all_buckets()
        self._check_roundtrip(self.redis_storage)
def __init__(self):
    """Build a NearPy engine whose hash configuration persists in Redis."""
    redis_conn = redis.Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_conn)

    # Try to restore a previously saved hash configuration.
    config = redis_storage.load_hash_configuration('MyHash')
    if config is not None:
        # A config exists: build an empty hash and apply the saved parameters.
        self.lshash = RandomBinaryProjections(None, None)
        self.lshash.apply_config(config)
    else:
        # No saved config: create a fresh hash with 5 projections.
        self.lshash = RandomBinaryProjections('MyHash', 5)

    # Engine for a 4-dimensional feature space; this sets the hash dimension
    # only the first time, not when the config was loaded from redis. Redis
    # storage keeps the buckets.
    self.engine = Engine(4, lshashes=[self.lshash], storage=redis_storage)

    # Persist the (possibly new) hash configuration for later runs.
    redis_storage.store_hash_configuration(self.lshash)
class NearestNeighbourFinder(object):
    """
    Using an already-projected corpus in the form of a VectorCorpus, allow
    easy queries to find nearest neighbour event chains in the corpus, given
    a new event chain.
    """

    def __init__(self, model_type, model_name, hash, corpus_path, with_events=False):
        # NOTE: `hash` shadows the builtin, but the parameter name is part of
        # the public interface, so it is kept.
        self.hash = hash
        self.corpus_path = corpus_path
        self.model_type = model_type
        self.model_name = model_name
        self.with_events = with_events
        self.model = None
        self.search_engine = None

    def init_engine(self, redis_port=6379):
        """Load the model and prepare the Redis-backed search engine.

        redis_port: port of the running Redis server holding the vectors.
        Raises RuntimeError if the Redis server cannot be reached.
        """
        # Need to load the model to get information about it
        self.model = NarrativeChainModel.load_by_type(self.model_type, self.model_name)
        vector_size = self.model.vector_size
        # Point the Redis server to the model's database
        db_filename = "vectors.rdb"
        model_dir = self.model.get_model_directory(self.model_name)
        # Prepare an engine for reading vectors from
        try:
            redis = Redis(host='localhost', port=redis_port, db=0)
        except ConnectionError as e:
            # BUGFIX: was Python 2-only `except ConnectionError, e:` syntax,
            # a SyntaxError on Python 3; `as` is valid on 2.6+ and 3.
            raise RuntimeError("could not connect to redis server on port %s. "
                               "Is it running? (%s)" % (redis_port, e))
        # Set the storage location to be in the model's directory/file
        # (uses the db_filename local, which was previously assigned but unused).
        redis.config_set("dbfilename", db_filename)
        redis.config_set("dir", model_dir)
        redis_storage = RedisStorage(redis)
        self.search_engine = Engine(
            vector_size, lshashes=[self.hash], storage=redis_storage,
            fetch_vector_filters=[UniqueVectorFilter()], vector_filters=[])
transforms.ToPILImage(), transforms.Resize(64), transforms.CenterCrop(64), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) dimension = 512 r = redis.Redis( host='redis', port=6379, # charset='utf-8', # decode_responses=True, ) redis_storage = RedisStorage(r) # Get hash config from redis config = redis_storage.load_hash_configuration('MyHash') if config is None: # Config is not existing, create hash from scratch, with 10 projections lshash = RandomBinaryProjections('MyHash', 50) else: # Config is existing, create hash with None parameters lshash = RandomBinaryProjections(None, None) # Apply configuration loaded from redis lshash.apply_config(config) # Create engine for feature space of 100 dimensions and use our hash. # This will set the dimension of the lshash only the first time, not when # using the configuration loaded from redis. Use redis storage to store # buckets.
import web
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from redis import Redis
from nearpy.storage import RedisStorage

# Dimension of our vector space.
dimension = 68

# Random binary hash with 10 bits.
rbp = RandomBinaryProjections('rbp', 10)

# Engine with Redis-backed bucket storage.
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))
engine = Engine(dimension, lshashes=[rbp], storage=redis_storage)

# Metadata lives in a local sqlite database.
db = web.database(dbn='sqlite', db='sorch.db')


def insertVector(id, vector):
    """Index `vector` in the LSH engine under key `id`."""
    engine.store_vector(vector, id)


def getNN(vector):
    """Return the approximate nearest neighbours of `vector`."""
    return engine.neighbours(vector)


def insertMetadata(id, url, artist, title, length):
    """Store one track's metadata row in the sqlite database."""
    db.insert(id=id, url=url, artist=artist, title=title, length=length)
def setUp(self):
    """Create a fresh Redis-backed storage before each test."""
    client = Redis()
    self.storage = RedisStorage(client)
    super(RedisStorageTest, self).setUp()
class TestHashStorage(unittest.TestCase):
    """Checks that hash configurations survive a store/load round trip in
    both the in-memory and the Redis storage back-ends.

    NOTE(review): the Redis tests assume a local server on the default
    port — confirm the CI environment provides one.
    """

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis()
        self.redis_storage = RedisStorage(self.redis_object)

    def test_hash_memory_storage_none_config(self):
        # Loading a configuration that was never stored must yield None.
        conf = self.memory.load_hash_configuration('nonexistentHash')
        self.assertIsNone(conf)

    def test_hash_memory_storage_rbp(self):
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)  # fixes dim and generates the projection normals
        self.memory.store_hash_configuration(hash1)
        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        # The projection matrix must be restored element-for-element.
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_memory_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)
        self.memory.store_hash_configuration(hash1)
        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRDPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_memory_storage_pcabp(self):
        # PCA hashes are trained from sample vectors instead of reset().
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)
        self.memory.store_hash_configuration(hash1)
        hash2 = PCABinaryProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testPCABPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_memory_storage_pcadp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCADiscretizedProjections('testPCADPHash', 4, train_vectors, 0.1)
        self.memory.store_hash_configuration(hash1)
        hash2 = PCADiscretizedProjections(None, None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testPCADPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_redis_storage_none_config(self):
        conf = self.redis_storage.load_hash_configuration('nonexistentHash')
        self.assertIsNone(conf)

    def test_hash_redis_storage_rbp(self):
        # Same round trip as the memory variant, but persisted in Redis.
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testRBPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_redis_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testRDPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_redis_storage_pcabp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = PCABinaryProjections(None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testPCABPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_redis_storage_pcadp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCADiscretizedProjections('testPCADPHash', 4, train_vectors, 0.1)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = PCADiscretizedProjections(None, None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testPCADPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])
def setUp(self):
    """Prepare one in-memory and one Redis-backed storage per test."""
    client = Redis()
    self.memory = MemoryStorage()
    self.redis_object = client
    self.redis_storage = RedisStorage(client)
class TestStorage(unittest.TestCase):
    """Round-trip tests for MemoryStorage and RedisStorage, including a
    sparse-vector round trip for the Redis back-end.

    NOTE(review): the Redis tests assume a local server on the default
    port — confirm one is available where these tests run.
    """

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis()
        self.redis_storage = RedisStorage(self.redis_object)

    def test_memory_storage(self):
        x = numpy.random.randn(100, 1)
        bucket_key = '23749283743928748'
        x_data = ['one', 'two', 'three']
        self.memory.store_vector('testHash', bucket_key, x, x_data)
        X = self.memory.get_bucket('testHash', bucket_key)
        self.assertEqual(len(X), 1)
        y = X[0][0]
        y_data = X[0][1]
        # The stored vector must come back with identical type and contents.
        self.assertEqual(len(y), len(x))
        self.assertEqual(type(x), type(y))
        for k in range(100):
            self.assertEqual(y[k], x[k])
        # The attached payload data must round-trip unchanged too.
        self.assertEqual(type(y_data), type(x_data))
        self.assertEqual(len(y_data), len(x_data))
        for k in range(3):
            self.assertEqual(y_data[k], x_data[k])
        # Cleaning must leave the bucket empty.
        self.memory.clean_all_buckets()
        self.assertEqual(self.memory.get_bucket('testHash', bucket_key), [])

    def test_redis_storage(self):
        # Start from a clean database so leftovers cannot affect the test.
        self.redis_storage.clean_all_buckets()
        x = numpy.random.randn(100, 1)
        bucket_key = '23749283743928748'
        x_data = ['one', 'two', 'three']
        self.redis_storage.store_vector('testHash', bucket_key, x, x_data)
        X = self.redis_storage.get_bucket('testHash', bucket_key)
        self.assertEqual(len(X), 1)
        y = X[0][0]
        y_data = X[0][1]
        self.assertEqual(len(y), len(x))
        self.assertEqual(type(x), type(y))
        for k in range(100):
            self.assertEqual(y[k], x[k])
        self.assertEqual(type(y_data), type(x_data))
        self.assertEqual(len(y_data), len(x_data))
        for k in range(3):
            self.assertEqual(y_data[k], x_data[k])
        self.redis_storage.clean_all_buckets()
        self.assertEqual(self.redis_storage.get_bucket('testHash', bucket_key), [])

    def test_redis_storage_sparse(self):
        self.redis_storage.clean_all_buckets()
        x = scipy.sparse.rand(100, 1, density=0.1)
        bucket_key = '23749283743928748'
        x_data = ['one', 'two', 'three']
        self.redis_storage.store_vector('testHash', bucket_key, x, x_data)
        X = self.redis_storage.get_bucket('testHash', bucket_key)
        self.assertEqual(len(X), 1)
        y = X[0][0]
        y_data = X[0][1]
        # Sparse vectors must keep their type and shape...
        self.assertEqual(type(x), type(y))
        self.assertEqual(x.shape[0], y.shape[0])
        self.assertEqual(x.shape[1], y.shape[1])
        # ...and the element-wise difference must be exactly zero.
        self.assertTrue((y - x).sum() == 0.0)
        self.assertEqual(type(y_data), type(x_data))
        self.assertEqual(len(y_data), len(x_data))
        for k in range(3):
            self.assertEqual(y_data[k], x_data[k])
        self.redis_storage.clean_all_buckets()
        self.assertEqual(self.redis_storage.get_bucket('testHash', bucket_key), [])
def redisStorage(host='localhost', port=6379, db=0):
    """Return a NearPy RedisStorage connected to the given Redis server.

    host: Redis server hostname.
    port: Redis server port (previously hard-coded to 6379).
    db: Redis database index (previously hard-coded to 0).

    The added parameters default to the old hard-coded values, so existing
    callers are unaffected.
    """
    return RedisStorage(Redis(host=host, port=port, db=db))
from nearpy import Engine from nearpy.hashes import RandomBinaryProjections from nearpy.storage import RedisStorage import deepranking.files as files from keras.preprocessing.image import load_img, img_to_array import pandas as pd import numpy as np from keras.applications.vgg16 import preprocess_input from keras.models import load_model from deepranking.fashion_utils import triplet_loss_adapted_from_tf from keras_applications.resnext import preprocess_input import keras from redis import Redis redis_object = Redis(host='localhost', port=6379, db=0) redis_storage = RedisStorage(redis_object) config = redis_storage.load_hash_configuration('MyHash') lshash = RandomBinaryProjections(None, None) lshash.apply_config(config) engine = Engine(4096, lshashes=[lshash], storage=redis_storage) redis_object.close() class FashionSimilarity: def __init__(self): self.model = load_model((files.output_directory / 'onlinemining_loss.h5').absolute().as_posix(), custom_objects={
class TestHashStorage(unittest.TestCase):
    """Verifies store/load round trips of hash configurations for four hash
    types, against both the in-memory and the Redis storage back-ends.

    NOTE(review): the Redis tests assume a local server on port 6379.
    """

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_hash_memory_storage_rbp(self):
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)  # fixes dim and generates the projection normals
        self.memory.store_hash_configuration(hash1)
        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        # The projection matrix must be restored element-for-element.
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_memory_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)
        self.memory.store_hash_configuration(hash1)
        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRDPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_memory_storage_pcabp(self):
        # PCA hashes are trained from sample vectors instead of reset().
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)
        self.memory.store_hash_configuration(hash1)
        hash2 = PCABinaryProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testPCABPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_memory_storage_pcadp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCADiscretizedProjections('testPCADPHash', 4, train_vectors, 0.1)
        self.memory.store_hash_configuration(hash1)
        hash2 = PCADiscretizedProjections(None, None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testPCADPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_redis_storage_rbp(self):
        # Same round trip as the memory variant, but persisted in Redis.
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testRBPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_redis_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testRDPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])

    def test_hash_redis_storage_pcabp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = PCABinaryProjections(None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testPCABPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])

    def test_hash_redis_storage_pcadp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCADiscretizedProjections('testPCADPHash', 4, train_vectors, 0.1)
        self.redis_storage.store_hash_configuration(hash1)
        hash2 = PCADiscretizedProjections(None, None, None, None)
        hash2.apply_config(self.redis_storage.load_hash_configuration('testPCADPHash'))
        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)
        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])
def setUp(self):
    """Create both storage back-ends and seed numpy for reproducibility."""
    client = Redis()
    self.memory = MemoryStorage()
    self.redis_object = client
    self.redis_storage = RedisStorage(client)
    numpy.random.seed(16)
class TestRandomBinaryProjectionTree(unittest.TestCase):
    """Exercises RandomBinaryProjectionTree retrieval and the persistence of
    its configuration in both storage back-ends."""

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)
        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 200000 random vectors
        # NOTE(review): indexing 200000 vectors makes this test very slow.
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Round-trip the tree's configuration through memory storage.
        self.memory.store_hash_configuration(rbpt)
        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))
        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)
        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])
        # Now do random queries and check result set size
        # NOTE(review): the inner loop reuses `k`, shadowing the outer index.
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Round-trip the tree's configuration through redis storage.
        self.redis_storage.store_hash_configuration(rbpt)
        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))
        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)
        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])
        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
def setUp(self):
    """Prepare one in-memory and one Redis-backed storage per test."""
    client = Redis(host='localhost', port=6379, db=0)
    self.memory = MemoryStorage()
    self.redis_object = client
    self.redis_storage = RedisStorage(client)
class TestRandomBinaryProjectionTree(unittest.TestCase):
    """Exercises RandomBinaryProjectionTree retrieval and the persistence of
    its configuration, with a seeded RNG for reproducibility."""

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis()
        self.redis_storage = RedisStorage(self.redis_object)
        # Fixed seed so the random vectors are the same on every run.
        numpy.random.seed(16)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)
        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 200000 random vectors
        # NOTE(review): indexing 200000 vectors makes this test very slow.
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data {}'.format(k)
            self.engine.store_vector(x, x_data)
        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Round-trip the tree's configuration through memory storage.
        self.memory.store_hash_configuration(rbpt)
        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))
        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)
        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])
        # Now do random queries and check result set size
        # NOTE(review): the inner loop reuses `k`, shadowing the outer index.
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Round-trip the tree's configuration through redis storage.
        self.redis_storage.store_hash_configuration(rbpt)
        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))
        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)
        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])
        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
import json
import numpy as np
import cPickle as pickle  # NOTE(review): Python 2-only module; unused in this snippet
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis
from ne import CosineSim

# Dimensionality of the word vectors being indexed.
dimension = 100

# Load the vocabulary (index -> word) and the word-vector matrix.
with open("hndbow.index2word", 'r') as f:
    index2words = json.load(f)
wordvecs = np.load("hndbow.syn0.npy")

# Redis-backed bucket storage (db=3) and a seeded random binary hash so the
# projection is reproducible across runs.
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=3))
lshash = RandomBinaryProjections('WordHash', 5, rand_seed=123)
engine = Engine(dimension, distance=CosineSim(), lshashes=[lshash], storage=redis_storage)

# Index every word vector under its word.
for idx, word in enumerate(index2words):
    vec = wordvecs[idx]  # 1x100 nparray
    engine.store_vector(vec, word)

# Persist the hash configuration so later runs can reuse it.
redis_storage.store_hash_configuration(lshash)
log.info("Preparing neighbour search hash")
# Create binary hash, named after the model so configs don't collide.
binary_hash = RandomBinaryProjections("%s:%s_binary_hash" % (model_type, model_name), hash_size)

log.info("Connecting to Redis server on port %d" % redis_port)
# Prepare an engine for storing the vectors in
try:
    redis = Redis(host='localhost', port=redis_port, db=0)
except ConnectionError as e:
    # BUGFIX: was Python 2-only `except ConnectionError, e:` syntax, a
    # SyntaxError on Python 3; `as` is valid on 2.6+ and 3.
    raise RuntimeError("could not connect to redis server on port %s. Is it running? (%s)" % (redis_port, e))
# Set the storage location to be in the model's directory
redis.config_set("dbfilename", "vectors.rdb")
redis.config_set("dir", model_dir)
# Use this as the storage engine for the nearest-neighbour index
redis_storage = RedisStorage(redis)
search_engine = Engine(vector_size, lshashes=[binary_hash], storage=redis_storage)

# Project every chain in the corpus and index its vector, attaching the
# source (and optionally the chain itself) as payload data.
for vector, source, chain in VectorCorpus.project_from_docs(corpus, model_type, model_name,
                                                            progress=progress, buffer_size=10000,
                                                            project_events=project_events,
                                                            filter_chains=filter_chains):
    data = (source, chain) if include_events else source
    search_engine.store_vector(vector, data)

finder = NearestNeighbourFinder(model_type, model_name, binary_hash, corpus.directory,
                                with_events=include_events)
log.info("Storing finder in %s" % os.path.join(model_dir, "neighbour_finder"))