class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
def load_search_engine():
    global engine

    # Read in the data files
    data = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t')
    data_objects = pandas.read_csv(os.path.join('data', 'object_features.tsv'), sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp], distance=EuclideanDistance())

    # Indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i].replace('images\\\\', '')
                               .replace('images\\', '')
                               .replace('images/', ''))
    for i in range(0, len(data_objects)):
        engine.store_vector(
            np.asarray(data_objects['features'][i].split(',')).astype('float64'),
            data_objects['filename'][i].replace('images\\\\', '')
                                       .replace('images\\', '')
                                       .replace('images/', ''))
    return engine
def knn(data, k):
    assert k <= len(data) - 1, \
        'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        # Drop the first hit (the query point itself) from indices and distances.
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    # Note: N holds only the raw results of the last query; ind and dist
    # cover all points.
    return N, dist, ind
def index_user_vectors():
    print 'Performing indexing with HashPermutations...'
    global engine_perm

    t0 = time.time()
    print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)
    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)
class LSH:

    def __init__(self, path, dataSize):
        self.path = path
        self.dataSize = dataSize

    def preprocess(self):
        ids = []
        meta = []
        data = []
        for i in range(self.dataSize):
            with open(self.path + str(i) + ".data", "rb") as file:
                f_song_id = pickle.load(file)
                f_songMeta = pickle.load(file)
                f_data = pickle.load(file)
                ids.append(f_song_id)
                meta.append(f_songMeta)
                data.append(f_data)
        self.id = np.array(ids)
        self.meta = np.array(meta)
        self.data = np.array(data)

    def generate_hashtable(self):
        self.engine = Engine(self.data.shape[1],
                             lshashes=[RandomBinaryProjections('rbp', 20)])
        for i in range(self.dataSize):
            self.engine.store_vector(self.data[i], data=self.id[i])

    def query(self, data):
        return self.engine.neighbours(data)
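# A minimal usage sketch for the LSH class above. The path prefix and corpus
# size here are hypothetical; preprocess() expects pickled "<path><i>.data"
# files laid out as in the loop above.
import numpy as np

lsh = LSH("songs/", dataSize=100)   # hypothetical corpus of 100 pickled songs
lsh.preprocess()                    # load ids, metadata and feature vectors
lsh.generate_hashtable()            # build the NearPy engine over the vectors
matches = lsh.query(lsh.data[0])    # each match is a (vector, data, distance) tuple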
def RunAnnNearpy(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
    train, label = SplitTrainData(self.dataset)

    with totalTimer:
        # Get all the parameters.
        try:
            # Perform Approximate Nearest-Neighbors
            dimension = train.shape[1]
            rbp = RandomBinaryProjections('rbp', 10)
            engine = Engine(dimension, lshashes=[rbp])
            for i in range(len(train)):
                engine.store_vector(train[i], 'data_%d' % i)
            for i in range(len(queryData)):
                v = engine.neighbours(queryData[i])
        except Exception as e:
            Log.Info(e)
            q.put(e)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
class StateDBEngine(object):

    def __init__(self):
        # initialize the "nearpy" library
        self.dim = 4
        self.rbp = RandomBinaryProjections('rbp', 100)
        self.engine = Engine(self.dim, lshashes=[self.rbp])
        # performance counter
        self.counter = 0

    def add(self, x, data):
        # print 'add data = ', data
        self.engine.store_vector(x, data)
        self.counter += 1

    def lookup(self, x, THRESHOLD=0.1):
        naver = self.engine.neighbours(x)
        if len(naver) == 0:
            return None
        pt, data, d = naver[0]
        # print 'lhs, rhs', x, pt
        # print 'd = ', d, (d < THRESHOLD), (data is None)
        if d < THRESHOLD:
            return data
        else:
            return None
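# A minimal sketch of how StateDBEngine might be used; the 4-dimensional
# state vectors below are made up for illustration.
import numpy as np

db = StateDBEngine()
db.add(np.array([0.0, 0.0, 1.0, 0.0]), data='state-a')
print(db.lookup(np.array([0.0, 0.0, 1.0, 0.0])))  # 'state-a' (distance 0 < THRESHOLD)
print(db.lookup(np.array([9.0, 9.0, 9.0, 9.0])))  # None when no close neighbour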
def index_in_text_engine(nid_gen, tfidf, lsh_projections, tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features, lshashes=[lsh_projections],
                         distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            dense_row = tfidf[row_idx]
            array = dense_row
        else:
            sparse_row = tfidf.getrow(row_idx)
            dense_row = sparse_row.todense()
            array = dense_row.A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str(et - st))
    return text_engine
class PointCalculator():

    def __init__(self, point_list, point):
        self.__configure_calculator(point_list, point)

    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2
        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)
        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)

    def __load_point_list_in_engine(self):
        for index in xrange(0, len(self.__point_list__)):
            v = numpy.array(self.__point_list__[index])
            self.__engine__.store_vector(v, 'data_%d' % index)

    def set_searching_point_list(self, point_list):
        self.__point_list__ = point_list
        self.__load_point_list_in_engine()

    def set_query_point(self, point):
        self.__point__ = point

    def __get_nearest_point(self):
        return self.__engine__.neighbours(numpy.array(self.__point__))

    def get_nearest_point_array_coords(self):
        nearest_point = self.__get_nearest_point()
        return [nearest_point[0][0][0], nearest_point[0][0][1]]
class RandomBinaryNN(NearestNeighbor):
    """
    Nearest neighbor implementation using random binary projections from the
    nearpy package
    """

    def __init__(self, dimension: int, number_projections: int, threshold: float):
        """
        :param dimension: Number of dimensions of input points
        :param number_projections: Number of random projections used for finding
            nearest neighbors. Trade-off: more projections result in a smaller
            number of false positives in the candidate set
        :param threshold: Distance threshold defining "nearest": all points
            within this specific distance
        """
        self.rbp = RandomBinaryProjections('rbp', number_projections)
        self.sqdist = SquaredEuclideanDistance()
        self.ann_engine = Engine(
            dimension,
            lshashes=[self.rbp],
            distance=self.sqdist,
            vector_filters=[DistanceThresholdFilter(threshold)])

    def insert_candidate(self, point: np.ndarray, metadata):
        self.ann_engine.store_vector(point, data=metadata)

    def get_candidates(self, point: np.ndarray):
        return [
            NearestNeighborResult(res[0], res[1], res[2])
            for res in self.ann_engine.neighbours(point)
        ]
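# Hedged usage sketch for RandomBinaryNN. NearestNeighbor and
# NearestNeighborResult come from the surrounding project, so only calls into
# this class are shown; dimensions and the threshold are made up.
import numpy as np

nn = RandomBinaryNN(dimension=64, number_projections=10, threshold=5.0)
nn.insert_candidate(np.random.randn(64), metadata='sample-1')
for result in nn.get_candidates(np.random.randn(64)):
    print(result)  # candidates within squared-Euclidean distance 5.0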
def LSH(Layers, K):
    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]
    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index the vectors (set their data to a unique string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]
        meta_data = str(index) + ',' + str(int(video_data[index, 0])) + ', ' \
            + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2])) \
            + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])
        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)
    print 'stop'
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)

    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # Perform hashing for train examples. NOTE: the vectors are stored
    # unnormalized here, but matched against the normalized copies below.
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
class NearPy(NearestNeighbor):

    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
        # logging.info('Took %f sec' % (train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into a new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):
        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1, 200))
        dists = CosineDistance().distance_matrix(matrix, query)
        dists = dists.reshape((-1,))
        dists = sorted(dists)
        print dists[:10]
def main(args):
    """ Main entry.
    """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the top-k closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, num))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    num_cols = decade_matrix.get_shape()[1]
    print("the number of rows: " + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    # The engine dimension is the vector length (columns), not the row count.
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        print(i)
        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
class GenerateHashTable():

    def __init__(self, measure="EuclideanDistance", data_path='data/classed_data/'):
        self.res = ResnetSimilarity()
        self.pbar = ProgressBar()
        # Dimension of our vector space
        self.dimension = 2048
        self.data_path = data_path
        # Create a random binary hash with 10 bits
        self.rbp = RandomBinaryProjections('rbp', 10)
        self.measure = measure
        self.msote = MemoryStorage()
        if measure == "EuclideanDistance":
            self.engine = Engine(self.dimension, lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=EuclideanDistance())
        else:
            self.engine = Engine(self.dimension, lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=CosineDistance())

    def generate_table(self):
        if self.measure == "CosineDistance":
            save_path = "hashed_objects/hashed_object_Cosine.pkl"
        elif self.measure == "EuclideanDistance":
            save_path = "hashed_objects/hashed_object_euclidean.pkl"
        else:
            save_path = "hashed_objects/" + str(self.measure) + ".pkl"

        count = 0
        for subdir, dirs, files in os.walk(self.data_path):
            for file in files:
                if '.jpg' in file:
                    img_path = os.path.join(subdir, file)
                    img = Image.open(img_path).convert('RGB')
                    if img.size[0] >= 100:
                        img_emb = self.res.getMapping(img)
                        img_emb = img_emb.view(-1, 2048)
                        img_emb = img_emb.numpy()
                        self.engine.store_vector(img_emb[0], img_path)
                        if count % 1000 == 0:
                            print("Saving Image Embedding ", count)
                        count += 1

        print("Saving File To", save_path)
        # TODO: pickling the whole engine is peculiar; storing the hash
        # configuration and buckets separately would be more robust.
        with open(save_path, 'wb') as filehandler:
            pickle.dump(self.engine, filehandler)
class CFiltering:

    def __init__(self, matrix, max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):
        if not isinstance(lshashes, list):
            raise TypeError("'lshashes' must be an instance of 'list'")
        if not isinstance(vector_filters, list):
            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(
            len(matrix[0]),
            lshashes=lshashes,
            vector_filters=vector_filters + [NearestFilter(max_neighbours)],
            distance=distance)
        for vector in matrix:
            self.underlying.store_vector(vector)

    def predict(self, vector, precision):
        neighbours = self.underlying.neighbours(vector)
        if not neighbours:
            raise ValueError("Failed to acquire any neighbours")

        average = [
            sum(neighbour) / len(neighbour)
            for neighbour, _, _ in neighbours
        ]
        avg = sum(vector) / len(vector)

        for i in range(len(vector)):
            if vector[i] < precision:
                weighted_sum = 0
                for j, neighbour in enumerate(neighbours):
                    neighbour, _, similarity = neighbour
                    weighted_sum += similarity * (neighbour[j] - average[j])
                vector[i] = avg + weighted_sum / len(vector)
        return vector
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]

    rbp = RandomBinaryProjections('rbp', 32)
    engine = Engine(f, lshashes=[rbp])

    for i in range(n):
        engine.store_vector(X[i], 'data_%d' % i)

    return engine
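# Quick sketch of calling build_index. It is defined as a method (hence
# `self`), so `indexer` below stands for whatever object provides it; X is
# assumed to be a dense numpy array of shape (n, f).
import numpy as np

X = np.random.randn(500, 64)
engine = indexer.build_index(X)       # hypothetical owning object
print(engine.neighbours(X[0])[:3])    # (vector, 'data_i', distance) tuples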
def get_engine(self, vocab, vecs):
    logging.info('{} hash functions'.format(self.args.projections))
    hashes = [PCABinaryProjections('ne1v', self.args.projections, vecs[:1000, :].T)]
    engine = Engine(vecs.shape[1], lshashes=hashes, distance=[], vector_filters=[])
    for ind, vec in enumerate(vecs):
        if not ind % 100000:
            logging.info('{} words added to nearpy engine'.format(ind))
        engine.store_vector(vec, ind)
    return engine
def test_storage_issue(self):
    engine1 = Engine(100)
    engine2 = Engine(100)

    for k in range(1000):
        x = numpy.random.randn(100)
        x_data = 'data'
        engine1.store_vector(x, x_data)

    # Each engine should have its own default storage
    self.assertTrue(len(engine2.storage.buckets) == 0)
class Neighbors:
    """ Nearest neighbors. """

    def __init__(self, config, verbose=True, log_file=None):
        # set up logger
        self._logger = Logger.get_logger(self.__class__.__name__,
                                         log_file=log_file,
                                         silence=(not verbose),
                                         global_log_file=verbose)
        # read config
        self._parse_config(config)
        self._engine = None

    def _parse_config(self, config):
        self._num_neighbors = config["num_neighbors"]

    def _build_engine(self, dimension):
        # build NearPy engine
        self._logger.info("Building engine...")
        self._engine = Engine(
            dimension,
            vector_filters=[NearestFilter(self._num_neighbors)])

    def store(self, vectors, data=None, log_freq=10, verbose=True):
        self._logger.info("Storing vectors...")
        if data is not None:
            assert vectors.shape[0] == len(data), \
                "Dim 0 of vectors and data must match!"
        if self._engine is None:
            self._build_engine(vectors.shape[-1])
        num_vectors = vectors.shape[0]
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Storing vector {} of {}...".format(
                    idx, num_vectors))
            if data is not None:
                self._engine.store_vector(vectors[idx], data[idx])
            else:
                self._engine.store_vector(vectors[idx])

    def predict(self, vectors, log_freq=10, verbose=True):
        self._logger.info("Predicting...")
        num_vectors = vectors.shape[0]
        neighbors = []
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Predicting vector {} of {}...".format(
                    idx, num_vectors))
            neighbors.append(self._engine.neighbours(vectors[idx]))
        return neighbors
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):
        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        permuted_dists = [x[2] for x in results]

        # Do random query on engine without permutations meta-hash
        # (distances should be larger)
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]

        self.assertLess(permuted_dists[0], dists[0])
class LSHIndex(Index):

    def __init__(self, hasher, number_of_tables=6, length_of_tables=12,
                 match_thresh=0.2, association_thresh=0.1, storage=memoryStorage):
        """
        :param hasher:
        @type hasher: Hasher
        """
        Index.__init__(self, hasher,
                       number_of_tables=number_of_tables,
                       length_of_tables=length_of_tables,
                       match_thresh=match_thresh,
                       association_thresh=association_thresh)
        self.hasher = hasher
        self.match_thresh = match_thresh
        self.association_thresh = association_thresh
        self.tables = [None] * number_of_tables
        for i in range(number_of_tables):
            self.tables[i] = RandomBinaryProjections(str(i), length_of_tables)
        self.engine = Engine(self.hasher.dims(), lshashes=self.tables,
                             storage=storage(),
                             fetch_vector_filters=[NoVectorFilter()])

    def index(self, id, img):
        item = self.hasher.hash(id, img)
        for i in range(len(item.descriptors)):
            self.engine.store_vector(item.descriptors[i],
                                     data=(id, item.keypoints[i], item.descriptors[i]))
        return item

    def find(self, id, img, index_if_not_found=False):
        item = self.hasher.hash(id, img)
        matches = {}
        #count_min = self.association_thresh * float(len(item.descriptors))
        for x in item.descriptors:
            for neighbour in self.engine.neighbours(x):
                if neighbour[1][0] in matches:
                    continue
                y = neighbour[1][2]
                dist = l2norm(x, y)
                key = neighbour[1][0]
                if dist < self.match_thresh:
                    #if dist > 0.0001:
                    #    print('{} {} {}'.format(id, neighbour[1][0], dist))
                    matches[key] = (matches[key] + 1) if key in matches else 1
        if id not in matches and index_if_not_found:
            for i in range(len(item.descriptors)):
                self.engine.store_vector(item.descriptors[i],
                                         data=(id, item.keypoints[i], item.descriptors[i]))
        #for id, count in matches.items():
        #    #if count >= count_min:
        #    yield id
        return list(matches.keys())
def start(dataset, test_vector, num_nearest=5):
    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration; the engine dimension is the
    # vector length, so use the column count rather than the full shape tuple.
    engine = Engine(dataset.shape[1], lshashes=[rbp])

    # Index the dataset vectors (set their data to a unique string)
    for i, v in enumerate(dataset):
        engine.store_vector(v, 'data_%d' % i)

    # Get nearest neighbours
    N = engine.neighbours(test_vector)
    return N
def _build_rdp_engine(self, matrix, rdp, normals):
    # Dimension of our vector space
    dimension = np.shape(matrix)[1]
    n = np.shape(matrix)[0]

    # Create engine with pipeline configuration, backed by in-memory storage
    engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
    rdp.vectors = normals

    for index in range(n):
        v = matrix[index]
        engine.store_vector(v, '%d' % index)
    return engine
class lshNN(NNs):
    """
    Locality-sensitive hashing by random projection
    (nearpy implementation)
    """

    def __init__(self, b=16):
        self.params = {"method": "locality-sensitive hashing, nearpy", 'b': b}

    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy LSH for fast ANN
        rbp = RandomBinaryProjections('rbp', b)
        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)

    def _get_one_knn(self, v, k=3):
        v = np.squeeze(np.copy(v))
        vl = v.shape
        if vl[0] != self.f:
            raise Exception("Data Not Match")
        N = self.engine.neighbours(v)
        nni = -np.ones(k, dtype='int')
        nnd = np.empty(k)
        nnd[:] = np.nan
        for i in np.arange(k):
            try:
                nni[i] = N[i][1]
                nnd[i] = N[i][2]
            except IndexError:
                break
        return (nni, nnd)

    def get_knn(self, x, k=3):
        self.n, self.f = x.shape
        nni = -np.ones((self.n, k), dtype='int')
        nnd = np.empty((self.n, k))
        nnd[:] = np.nan
        for i in np.arange(self.n):
            i_i, i_d = self._get_one_knn(x[i, :], k)
            nni[i, :] = i_i
            nnd[i, :] = i_d
        return (nni, nnd)
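# Usage sketch for lshNN; NNs is the project's base class, so only the fit /
# get_knn calls on made-up random data are shown here.
import numpy as np

ann = lshNN(b=16)
X = np.random.randn(1000, 32)
ann.fit(X)
indices, distances = ann.get_knn(X[:5], k=3)  # missing neighbours stay -1 / NaN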
class LSHRandomProjectionsIndex:

    def __init__(self, num_features, projection_count=30):
        self.num_features = num_features
        self.rbp = RandomBinaryProjections('default', projection_count)
        self.text_engine = Engine(num_features, lshashes=[self.rbp],
                                  distance=CosineDistance())

    def index(self, vector, key):
        if len(vector) != self.num_features:
            raise Exception("received vector.dim: " + str(len(vector)) +
                            " on engine.dim: " + str(self.num_features))
        self.text_engine.store_vector(vector, key)

    def query(self, vector):
        res = self.text_engine.neighbours(vector)
        return res
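# Brief usage sketch for LSHRandomProjectionsIndex (the 300-dim vectors are
# made up):
import numpy as np

idx = LSHRandomProjectionsIndex(num_features=300)
idx.index(np.random.randn(300), key='doc-1')
for vector, key, distance in idx.query(np.random.randn(300)):
    print(key, distance)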
def build_lsh(data, hashbits=10):
    """Build a locality-sensitive-hashed database from `data`, with a
    bit-depth of `hashbits`."""
    dimensions = data.shape[1]

    # Create a random binary hash with `hashbits` bits
    rbp = RandomBinaryProjections('rbp', hashbits)

    # Create engine with pipeline configuration
    engine = Engine(dimensions, lshashes=[rbp])

    # Index each data vector (set their data to a unique string)
    for index in range(len(data)):
        engine.store_vector(data[index], '%d' % index)

    return engine
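# Example of how build_lsh might be called (the data is made up):
import numpy as np

data = np.random.randn(10000, 128)
engine = build_lsh(data, hashbits=12)
nearest = engine.neighbours(data[42])  # (vector, index-string, distance) tuples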
def build(self, train, batch_size=64, converter=convert_seq, device=0):
    train_iter = chainer.iterators.SerialIterator(train, batch_size, repeat=False)
    train_iter.reset()

    act_list = [[] for _ in range(self.n_dknn_layers)]
    label_list = []
    print('caching hiddens')
    n_batches = len(train) // batch_size
    for i, train_batch in enumerate(tqdm(train_iter, total=n_batches)):
        data = converter(train_batch, device=device, with_label=True)
        text = data['xs']
        labels = data['ys']

        with chainer.using_config('train', False):
            _, dknn_layers = self.model.predict(text, dknn=True)
            assert len(dknn_layers) == self.model.n_dknn_layers

        # renamed from `i` to avoid shadowing the batch index above
        for layer_idx in range(self.n_dknn_layers):
            layer = dknn_layers[layer_idx]
            layer.to_cpu()
            act_list[layer_idx] += [x for x in layer.data]
        label_list.extend([int(x) for x in labels])

    self.act_list = act_list
    self.label_list = label_list

    if self.lsh:
        print('using Locality Sensitive Hashing for NN search')
    else:
        print('using KDTree for NN search')

    self.tree_list = []  # one lookup tree for each dknn layer
    for i in range(self.n_dknn_layers):
        print('building tree for layer {}'.format(i))
        if self.lsh:  # if lsh
            n_hidden = act_list[i][0].shape[0]
            rbpt = RandomBinaryProjectionTree('rbpt', 75, 75)
            tree = Engine(n_hidden, lshashes=[rbpt])
            for j, example in enumerate(tqdm(act_list[i])):
                assert example.ndim == 1
                assert example.shape[0] == n_hidden
                tree.store_vector(example, j)
        else:  # if kdtree
            tree = KDTree(act_list[i])
        self.tree_list.append(tree)
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertEqual(len(engine2.storage.buckets), 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs(normalized_x - y).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs(normalized_x - y).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
class DB:

    def __init__(self, feature_size=16, nearest_neighbours=1000):
        self.feature_size = feature_size
        self.nn = nearest_neighbours
        self.engine = None
        self.load_hashmap()

    def load_hashmap(self):
        # Create redis storage adapter (the redis service must be running)
        redis_object = Redis(host='localhost', port=6379, db=14)
        redis_storage = RedisStorage(redis_object)
        try:
            config = redis_storage.load_hash_configuration('test')
            lshash = RandomBinaryProjections(None, None)
            lshash.apply_config(config)
        except:
            # Config does not exist: create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        nearest = NearestFilter(self.nn)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(self.feature_size, lshashes=[lshash],
                             vector_filters=[nearest], storage=redis_storage,
                             distance=CosineDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)

    def query(self, fvector):
        query = np.asarray(fvector)
        # Get the nn nearest neighbours: a list of (vector, name, distance) tuples
        N = self.engine.neighbours(query)
        return N

    def append_to_DB(self, fvector, name=""):
        if fvector is None:
            return
        self.engine.store_vector(np.asarray(fvector), name)
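# Sketch of using DB; it assumes a local Redis server is reachable on port
# 6379 (the class persists its hash configuration there under the name 'test').
db = DB(feature_size=16, nearest_neighbours=10)
db.append_to_DB([0.1] * 16, name="item-1")
for vector, name, distance in db.query([0.1] * 16):
    print(name, distance)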
def create_hashing(sets):
    # Dimension of the vector space.
    dimension = len(sets[0][0])

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=[rbp])

    # Index all our vectors (set their data to the index of the set they belong to).
    for index, s in enumerate(sets):
        for v in s:
            engine.store_vector(v, 'data_%d' % index)

    return engine
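# Sketch of calling create_hashing with two made-up sets of 2-d points:
import numpy as np

sets = [np.random.randn(50, 2), np.random.randn(50, 2)]
engine = create_hashing(sets)
# Each result carries 'data_0' or 'data_1', naming the set the match came from.
print(engine.neighbours(np.array([0.0, 0.0]))[:3])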
def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(dim, num_train)  # pickle.load('/home/jmahler/Downloads/feature_objects.p')
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print N

    IPython.embed()
class lshsearcher:

    def __init__(self):
        self.__dimension = None
        self.__engine_perm = None
        self.__permutations = None

    def _set_confval(self, dimension=None):
        if dimension is None:
            return None
        else:
            self.__dimension = dimension

    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension,
                                    lshashes=[self.__permutations],
                                    distance=CosineDistance())

    def conf(self, dimension):
        self._set_confval(dimension)
        self._engine_on()

    def getData(self, v):
        if self.__engine_perm is not None:
            self.__engine_perm.store_vector(v)

    def commitData(self):
        if self.__permutations is not None:
            self.__permutations.build_permuted_index()

    def find(self, v):
        if self.__engine_perm is not None:
            return self.__engine_perm.neighbours(v)
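# Hedged usage sketch for lshsearcher (dimension and vectors are made up):
import numpy as np

searcher = lshsearcher()
searcher.conf(dimension=128)
for _ in range(1000):
    searcher.getData(np.random.randn(128))
searcher.commitData()                       # build the permuted index once
print(searcher.find(np.random.randn(128)))  # (vector, data, distance) tuples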
class testing_suite:
    """
    Class to test SDF files in a nearest neighbor lookup format, under
    different models of representation such as PCA, FactorAnalysis, KernelPCA
    with the rbf kernel, FastICA, and DictionaryLearning

    Sample Usage:
        test = testing_suite()
        test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
        num_train = 12
        num_test = 4
        test.make_train_test(num_train, num_test)
        accuracy, results = test.perform_PCA_tests()
    """

    def __init__(self):
        self.PCA_changed_ = True
        self.FA_changed_ = True
        self.KPCA_changed_ = True
        self.FICA_changed_ = True
        self.DL_changed_ = True
        self.all_files_ = []
        self.PCA_ = None
        self.FA_ = None
        self.KPCA_ = None
        self.FICA_ = None
        self.DL_ = []
        self.testing_ = []
        self.training_ = []
        self.engine_ = []
        self.training_vectors_ = None
        self.confusion_ = {}
        self.biggest = 0

    def adddir(self, dir_to_add):
        """add all sdf filepaths from a root directory tree (dir_to_add)
        to the all_files_ instance variable"""
        sdf_files = []
        for root, dirs, files in walk(dir_to_add):
            for file_ in files:
                if file_.endswith("25.sdf"):
                    sdf_files.append(path.join(root, file_))
        self.all_files_ += sdf_files

    def adddir_25(self, dir_to_add):
        """add files in a directory only with dimension 25x25x25"""
        sdf_files = []
        for root, dirs, files in walk(dir_to_add):
            for file_ in files:
                if file_.endswith(".sdf"):
                    tempsdf = SDF(path.join(root, file_))
                    if tempsdf.dimensions()[0] == 25 * 25 * 25:
                        sdf_files.append(path.join(root, file_))
        self.all_files_ += sdf_files

    def addfile(self, file_to_add):
        """add only one file to all_files"""
        self.all_files_.append(file_to_add)

    def make_train_test(self, num_train, num_test):
        """
        populates the list of training files and testing files with filepaths
        based on a random number generator seeded with np.random.seed(100)

        Sample Usage:
            test = testing_suite()
            test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
            num_train = 12
            num_test = 4
            test.make_train_test(num_train, num_test)
        """
        assert num_train + num_test <= len(self.all_files_)
        np.random.seed(100)
        permuted_indices = np.random.permutation(len(self.all_files_))
        get_training = itemgetter(*permuted_indices[:num_train])
        get_testing = itemgetter(*permuted_indices[num_train:num_train + num_test])
        if num_train > 1:
            self.training_ = get_training(self.all_files_)
        else:
            self.training_ = [get_training(self.all_files_)]
        if num_test > 1:
            self.testing_ = get_testing(self.all_files_)
        else:
            self.testing_ = [get_testing(self.all_files_)]

    def normalize_vector(self, vector, largest_dimension):
        """normalizes smaller sdf vectors to a larger size by vertically
        stacking a column of zeros underneath"""
        return np.vstack((vector, np.zeros((largest_dimension - vector.shape[0], 1))))

    def get_PCA_training_vectors(self):
        """
        gets all training vectors from the set of training files, normalizes
        them using normalize_vector, and adds them all to a numpy array that
        gets returned
        """
        training_sdf = [SDF(i) for i in list(self.training_)]
        self.biggest = 0
        for item in training_sdf:
            self.biggest = max(self.biggest, item.dimensions()[0])
        return_train_vectors = None
        for tempsdf in training_sdf:
            vectorized = np.reshape(tempsdf.data(), (tempsdf.dimensions()[0], 1))
            normal_vector = self.normalize_vector(vectorized, self.biggest)
            if return_train_vectors is None:
                return_train_vectors = normal_vector
            else:
                return_train_vectors = np.concatenate(
                    (return_train_vectors, normal_vector), axis=1)
        return return_train_vectors

    """
    - any function beginning with make creates the sklearn.decomposition
      framework for the specified decomposition type
    - any function beginning with fit fits the training vectors to the
      decomposition framework
    - any function beginning with transform transforms the training vectors
      based on the fitted decomposition framework
    """

    def render_sdf(self, a, thresh=1e-3):
        h = plt.figure()
        ax = h.add_subplot(111, projection='3d')
        surface_points = np.where(np.abs(a) < thresh)
        x = surface_points[0]
        y = surface_points[1]
        z = surface_points[2]
        ax.scatter(x, y, z)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        ax.set_xlim3d(0, a.shape[0])
        ax.set_ylim3d(0, a.shape[1])
        ax.set_zlim3d(0, a.shape[2])
        plt.show()

    def make_PCA(self):
        self.PCA_ = skdec.PCA()  # n_components='mle')

    def fit_PCA(self, training_vectors):
        self.PCA_.fit(training_vectors)

    def make_FA(self):
        self.FA_ = skdec.FactorAnalysis(n_components=len(list(self.training_)))

    def fit_FA(self, training_vectors):
        self.FA_.fit(training_vectors)

    def make_KPCA(self, kernel_option="rbf"):
        self.KPCA_ = skdec.KernelPCA(gamma=0.1, kernel=kernel_option)

    def fit_KPCA(self, training_vectors):
        self.KPCA_.fit(training_vectors)

    def make_FICA(self):
        self.FICA_ = skdec.FastICA(n_components=len(list(self.training_)))

    def fit_FICA(self, training_vectors):
        self.FICA_.fit(training_vectors)

    def make_DL(self, alpha_values):
        self.DL_.append(skdec.DictionaryLearning(
            n_components=len(list(self.training_)),
            alpha=alpha_values, transform_algorithm='omp'))

    def fit_DL(self, training_vectors):
        self.DL_[-1].fit(training_vectors)

    def load_PCA(self, vector_set):
        """reinitializes our engine and loads a numpy set of vectors of
        dimension (self.biggest, 1) into self.engine_"""
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine_ = Engine(self.PCA_.components_.shape[1], lshashes=[rbp])
        transformed_vectors = self.PCA_.transform(vector_set.T)
        for i in range(len(list(self.training_))):
            #vector = vector_set[:, i]
            #vector = np.reshape(vector, (self.biggest, 1))
            #vector = self.PCA_.transform(vector)
            self.engine_.store_vector(transformed_vectors[i, :], self.training_[i])

    def load_FA(self, vector_set):
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine_ = Engine(self.biggest, lshashes=[rbp])
        for i in range(len(list(self.training_))):
            vector = vector_set[:, i]
            vector = np.reshape(vector, (self.biggest, 1))
            vector = self.FA_.transform(vector)
            self.engine_.store_vector(vector[:, 0], self.training_[i])

    def load_KPCA(self, vector_set):
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp])
        transformed_vectors = self.KPCA_.transform(vector_set.T)
        for i in range(len(list(self.training_))):
            #vector = vector_set[:, i]
            #vector = np.reshape(vector, (self.biggest, 1))
            #vector = self.KPCA_.transform(vector)
            self.engine_.store_vector(transformed_vectors[i, :], self.training_[i])

    def load_FICA(self, vector_set):
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine_ = Engine(self.biggest, lshashes=[rbp])
        for i in range(len(list(self.training_))):
            vector = vector_set[:, i]
            vector = np.reshape(vector, (self.biggest, 1))
            vector = self.FICA_.transform(vector)
            self.engine_.store_vector(vector[:, 0], self.training_[i])

    def load_DL(self, vector_set):
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine_ = Engine(self.biggest, lshashes=[rbp])
        for i in range(len(list(self.training_))):
            vector = vector_set[:, i]
            vector = np.reshape(vector, (self.biggest, 1))
            vector = self.DL_[-1].transform(vector)
            self.engine_.store_vector(vector[:, 0], self.training_[i])

    def engine_query(self, test_vector):
        """
        queries the engine with a (self.biggest, 1) dimension vector and
        returns the file_names of nearest neighbors and the results
        """
        #print test_vector
        #reshaped = np.reshape(test_vector, (self.biggest, 1))
        results = self.engine_.neighbours(test_vector.T)
        file_names = [i[1] for i in results]
        return file_names, results

    def setup_confusion(self):
        """
        reinitializes the self.confusion_ confusion matrix variable
        """
        self.confusion_ = {}
        self.confusion_[UNKNOWN_TAG] = {}
        for file_ in self.all_files_:
            category = cat50_file_category(file_)
            self.confusion_[category] = {}
        for query_cat in self.confusion_.keys():
            for pred_cat in self.confusion_.keys():
                self.confusion_[query_cat][pred_cat] = 0

    """
    Makes a test vector by taking in an SDF, reshaping it, normalizing it,
    then returns a transformed version of that vector based on the
    corresponding decomposition model that was already trained
    """

    def make_test_vector(self, sdf_array, vector_type):
        if vector_type == "PCA":
            return self.make_PCA_test_vector(sdf_array)
        elif vector_type == "FA":
            return self.make_FA_test_vector(sdf_array)
        elif vector_type == "KPCA":
            return self.make_KPCA_test_vector(sdf_array)
        elif vector_type == "FICA":
            return self.make_FICA_test_vector(sdf_array)
        elif vector_type == "DL":
            return self.make_DL_test_vector(sdf_array)

    def make_DL_test_vector(self, sdf_array):
        reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        normalized = self.normalize_vector(reshaped, self.biggest)
        return self.DL_[-1].transform(normalized)[:, 0]

    def make_FICA_test_vector(self, sdf_array):
        reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        normalized = self.normalize_vector(reshaped, self.biggest)
        return self.FICA_.transform(normalized)[:, 0]

    def make_KPCA_test_vector(self, sdf_array):
        reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        return self.KPCA_.transform(reshaped.T)
        # reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        # normalized = self.normalize_vector(reshaped, self.biggest)
        # return self.KPCA_.transform(normalized)[:, 0]

    def make_FA_test_vector(self, sdf_array):
        reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        normalized = self.normalize_vector(reshaped, self.biggest)
        return self.FA_.transform(normalized)[:, 0]

    def make_PCA_test_vector(self, sdf_array):
        reshaped = np.reshape(sdf_array.data(), (sdf_array.dimensions()[0], 1))
        return self.PCA_.transform(reshaped.T)
        # IPython.embed()
        # normalized = self.normalize_vector(reshaped, self.biggest)
        # return self.PCA_.transform(normalized)[:, 0]

    """
    queries the loaded and trained engine with each of your test vectors from
    make_train_test

    Returns
        accuracy: float representing the accuracy of querying the nearpy
            engine with the test results
        test_results: dictionary of the results from the "testing" for each
            of the sdf_files
    """

    def perform_tests(self, K, test_type):
        test_results = {}
        for file_ in list(self.testing_):
            query_category = cat50_file_category(file_)
            print "Querying: %s with category %s" % (file_, query_category)

            converted = SDF(file_)
            test_vector = self.make_test_vector(converted, test_type)
            closest_names, closest_vals = self.engine_query(test_vector.T[:, 0])

            pred_category = UNKNOWN_TAG
            if len(closest_names) > 0:
                closest_category = closest_names[0]
                pred_category = cat50_file_category(closest_category)
                for i in range(1, min(K, len(closest_names))):
                    closest_category = closest_names[i]
                    potential_category = cat50_file_category(closest_category)
                    if potential_category == query_category:
                        pred_category = potential_category
            print "Result Category: %s" % (pred_category)

            self.confusion_[query_category][pred_category] += 1
            test_results[file_] = [(closest_names, closest_vals)]

        row_names = self.confusion_.keys()
        confusion_mat = np.zeros([len(row_names), len(row_names)])
        i = 0
        for query_cat in self.confusion_.keys():
            j = 0
            for pred_cat in self.confusion_.keys():
                confusion_mat[i, j] = self.confusion_[query_cat][pred_cat]
                j += 1
            i += 1

        # get true positives, etc for each category
        num_preds = len(self.testing_)
        tp = np.diag(confusion_mat)
        fp = np.sum(confusion_mat, axis=0) - np.diag(confusion_mat)
        fn = np.sum(confusion_mat, axis=1) - np.diag(confusion_mat)
        tn = num_preds * np.ones(tp.shape) - tp - fp - fn

        # compute useful statistics
        recall = tp / (tp + fn)
        tnr = tn / (fp + tn)
        precision = tp / (tp + fp)
        npv = tn / (tn + fn)
        fpr = fp / (fp + tn)
        accuracy = np.sum(tp) / num_preds  # correct predictions over entire dataset

        # remove nans
        recall[np.isnan(recall)] = 0
        tnr[np.isnan(tnr)] = 0
        precision[np.isnan(precision)] = 0
        npv[np.isnan(npv)] = 0
        fpr[np.isnan(fpr)] = 0

        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def vis_pca_components(self, num_comp_vis, thresh=0.01, method='PCA'):
        PCA = self.PCA_
        if method == 'KPCA':
            PCA = self.KPCA_
        num_components = PCA.components_.shape[0]
        num_components = min(num_comp_vis, num_components)
        comp_per_dim = int(math.ceil(math.sqrt(num_components)))

        h = plt.figure()
        for i in range(num_components):
            ax = h.add_subplot(comp_per_dim, comp_per_dim, i + 1, projection='3d')
            components = PCA.components_[i, :]
            comp_grid = components.reshape(25, 25, 25)

            surface_points = np.where(np.abs(comp_grid) < thresh)
            x = surface_points[0]
            y = surface_points[1]
            z = surface_points[2]

            ax.scatter(x, y, z)
            ax.set_xlabel('X')
            ax.set_ylabel('Y')
            ax.set_zlabel('Z')
            ax.set_xlim3d(0, 25)
            ax.set_ylim3d(0, 25)
            ax.set_zlim3d(0, 25)
            ax.set_title('Component %d' % (i))
        plt.show()

    def vis_pca_component_slices(self, num_comp_vis, method='PCA'):
        PCA = self.PCA_
        if method == 'KPCA':
            PCA = self.KPCA_
        num_components = PCA.components_.shape[0]
        num_components = min(num_comp_vis, num_components)
        comp_per_dim = int(math.ceil(math.sqrt(num_components)))

        plt.figure()
        for i in range(num_components):
            plt.subplot(comp_per_dim, comp_per_dim, i + 1)
            components = PCA.components_[i, :]
            comp_grid = components.reshape(25, 25, 25)
            comp_slice = comp_grid[:, :, 12]
            plt.imshow(comp_slice)
            plt.title('Component %d XY Plane' % (i))

        plt.figure()
        for i in range(num_components):
            plt.subplot(comp_per_dim, comp_per_dim, i + 1)
            components = PCA.components_[i, :]
            comp_grid = components.reshape(25, 25, 25)
            comp_slice = comp_grid[:, 12, :]
            plt.imshow(comp_slice)
            plt.title('Component %d XZ Plane' % (i))

        plt.figure()
        for i in range(num_components):
            plt.subplot(comp_per_dim, comp_per_dim, i + 1)
            components = PCA.components_[i, :]
            comp_grid = components.reshape(25, 25, 25)
            comp_slice = comp_grid[12, :, :]
            plt.imshow(comp_slice)
            plt.title('Component %d YZ Plane' % (i))
        plt.show()

    """
    runs perform_tests on a specific type of decomposition after creating
    that decomposition type framework with the training vectors and loading
    those training vectors into the engine

    K is the number of neighbors to check
    """

    def perform_PCA_tests(self, K):
        train_vectors = self.get_PCA_training_vectors()
        self.make_PCA()
        print 'Fitting PCA'
        self.fit_PCA(train_vectors.T)
        print 'Loading PCA'
        self.load_PCA(train_vectors)
        print 'Setup confusion'
        self.setup_confusion()
        print 'Eval accuracy'
        #IPython.embed()
        accuracy, test_results, recall, tnr, precision, npv, fpr = self.perform_tests(K, "PCA")
        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def perform_FA_tests(self, K):
        train_vectors = self.get_PCA_training_vectors()
        self.make_FA()
        self.fit_FA(train_vectors)
        self.load_FA(train_vectors)
        self.setup_confusion()
        accuracy, test_results, recall, tnr, precision, npv, fpr = self.perform_tests(K, "FA")
        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def perform_KPCA_tests(self, K, kernel="rbf"):
        train_vectors = self.get_PCA_training_vectors()
        self.make_KPCA(kernel_option=kernel)
        print 'Fitting KPCA'
        self.fit_KPCA(train_vectors.T)
        print 'Loading KPCA'
        self.load_KPCA(train_vectors)
        self.setup_confusion()
        accuracy, test_results, recall, tnr, precision, npv, fpr = self.perform_tests(K, "KPCA")
        IPython.embed()
        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def perform_FICA_tests(self, K):
        train_vectors = self.get_PCA_training_vectors()
        self.make_FICA()
        self.fit_FICA(train_vectors)
        self.load_FICA(train_vectors)
        self.setup_confusion()
        accuracy, test_results, recall, tnr, precision, npv, fpr = self.perform_tests(K, "FICA")
        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def perform_DL_tests(self, K, alpha):
        train_vectors = self.get_PCA_training_vectors()
        self.make_DL(alpha_values=alpha)
        self.fit_DL(train_vectors)
        self.load_DL(train_vectors)
        self.setup_confusion()
        accuracy, test_results, recall, tnr, precision, npv, fpr = self.perform_tests(K, "DL")
        return accuracy, test_results, recall, tnr, precision, npv, fpr

    def get_engine(self):
        return self.engine_

    def get_PCA(self):
        return self.PCA_

    def get_FA(self):
        return self.FA_

    def get_KPCA(self):
        return self.KPCA_

    def get_FICA(self):
        return self.FICA_

    def get_DL(self):
        return self.DL_

    def get_explained_variance_ratio(self):
        return self.PCA_.explained_variance_ratio_
# Create a random binary hash with 10 bits
rbp = RandomBinaryProjections('rbp', 10)

# Create engine with pipeline configuration.
# Note: `dimension` must match the length of the feature rows and is defined
# elsewhere in the script.
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))
engine = Engine(dimension, lshashes=[rbp], storage=redis_storage)

index = 0
with open("Adele.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index + 1
        x = numpy.array(row, dtype='|S4')
        y = x.astype(numpy.float)
        engine.store_vector(y, 'Adele - Hello_%d' % index)

index = 0
with open("BlurredLines.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index + 1
        x = numpy.array(row, dtype='|S4')
        y = x.astype(numpy.float)
        engine.store_vector(y, 'Robin Thicke - Blurred Lines ft. T.I. Pharrell_%d' % index)

index = 0
with open("CallMeMaybe.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index + 1
class LSHSearch:

    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix: ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2],
                             distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1
        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)

    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print ' -> Candidate count is %d' % self.engine.candidate_count(query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key=lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [x for x in zip_seq
                if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
import pickle

import h5py
import numpy

from nearpy import Engine
from nearpy.distances import EuclideanDistance
from nearpy.filters import NearestFilter
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import MemoryStorage
#from redis import Redis
#from nearpy.storage import RedisStorage
from nearpy.storage import GonzaloStorage  # project-specific backend, not part of stock NearPy

# Load the visual features of all the images from the dataset
featIN = h5py.File('featIN.mat')['featIN']

# Create binary projections and save them to disk
rbp = RandomBinaryProjections('rbp', 10)
dimension = 4096

# Use the custom storage backend
gonzalo_storage = GonzaloStorage()
engine = Engine(dimension, lshashes=[rbp], distance=EuclideanDistance(),
                vector_filters=[NearestFilter(20)], storage=gonzalo_storage)

fp = open('engine.txt', 'w')
pickle.dump(engine, fp)
fp.close()

#engine = Engine(dimension, lshashes=[rbp])

for index in range(1000000):
    v = featIN[range(dimension), index]
    #v = numpy.float16(featIN[range(dimension), index])
    engine.store_vector(v, 'data_%d' % index)
# save_all() is provided by the custom storage wrapper, not by stock NearPy
engine.save_all()
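The pickled engine above can, in principle, be restored for querying. A minimal sketch, assuming the engine and its custom GonzaloStorage survive the pickle round-trip (which depends on that backend's implementation); the random query vector is a stand-in for a real 4096-d feature:

# Hypothetical query side: unpickle the saved engine and look up neighbours
import pickle

import numpy

with open('engine.txt') as fp:
    engine = pickle.load(fp)

query = numpy.random.randn(4096)  # stand-in for a real feature vector
for vector, data, distance in engine.neighbours(query):
    print data, distance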
def example1():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't use too many, because of the exact search)
    POINTS = 10000

    print 'Creating engines'

    # We want 20 projections and at least 20 results
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print 'Indexing %d random vectors of dimension %d' % (POINTS, DIM)

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print 'Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys())
    print 'Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys())

    print 'Building permuted index for HashPermutations'

    # Then update permuted index
    permutations.build_permuted_index()

    print 'Generating random query vector'

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print '\nNeighbour distances with RandomBinaryProjectionTree:'
    print ' -> Candidate count is %d' % engine_rbpt.candidate_count(query)
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 2
    print '\nNeighbour distances with RandomBinaryProjections:'
    print ' -> Candidate count is %d' % engine.candidate_count(query)
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print ' -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutationMapper:'
    print ' -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours, by exact search over all stored vectors
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
import numpy as np
# Assumed import for the `distance` call below; the original fragment omits
# its imports but its comments name scipy's Chebyshev distance
from scipy.spatial.distance import chebyshev as distance

from nearpy import Engine
from nearpy.distances import ChebyshevDistance
from nearpy.filters import NearestFilter
from nearpy.hashes import RandomBinaryProjections


# Representation is provided by the surrounding RL framework; the fragment
# omits its import
class RMAX_repr(Representation):
    """
    Identical to the Tabular representation (i.e., it assigns a binary feature
    function f_d() to each possible discrete state *d* in the domain, with
    f_d(s) = 1 when d == s and 0 elsewhere). HOWEVER, unlike *Tabular*,
    feature functions are only created for states *s* that have actually been
    encountered in the domain, rather than instantiated for every single state
    at the outset.
    """

    def __init__(self, domain, Rmax, LQ, k=1, epsilon_d=0.01):
        # LQ is the Lipschitz constant - 10**3 according to the paper
        # (found by cross-validation)
        self.LQ = LQ
        self.gamma = domain.discount_factor
        self.rmax = Rmax
        self.qmax = Rmax / (1 - self.gamma)
        self.qmax_tilda = Rmax + self.gamma * self.qmax
        self.epsilon = epsilon_d
        # Approximate k-NN is used when finding the Q value of a point
        self.k = k
        # We also keep track of the states sampled so far
        self.sample_list = [0] * (2 * 100000)
        self.list_idx = 0
        # And a dictionary for quick lookups of already computed values
        self.sample_values = {}
        # And we use an LSH to find the approximate k-nearest neighbours
        # by training it on every (s, a, r, s') tuple we see
        self.init_randomization()
        super(RMAX_repr, self).__init__(domain)

    def init_randomization(self):
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine = Engine(7, lshashes=[rbp],
                             vector_filters=[NearestFilter(self.k)],
                             distance=ChebyshevDistance())

    def is_known(self, s, a):
        # An (s, a) pair is 'known' if LQ * d(s, a, s', a') < epsilon_d
        # for all of its approximate nearest neighbours
        indices = self.approx_nn(s, a)
        if not indices:
            return False
        for idx in indices:
            s_p, a_p = self.sample_list[idx]
            if self.LQ * self.d(s, a, s_p, a_p) > self.epsilon:
                return False
        return True

    def pre_discover(self, s, p_terminal, a, r, ns, terminal):
        # In the learning stage, if (s, a) is not 'known', add it to the
        # sample list and its value to sample_values
        if not self.is_known(s, a):
            x = r + self.gamma * max(self.Q_tilda(ns, a_p)
                                     for a_p in range(self.actions_num))
            self.engine.store_vector(np.append(s, a), self.list_idx)
            self.sample_list[self.list_idx] = (s, a)
            self.list_idx += 1
            self.sample_values[self.sa_tuple(s, a)] = x
            #self.LSH.partial_fit(np.append(s, a))
        super(RMAX_repr, self).pre_discover(s, p_terminal, a, ns, terminal)

    def d(self, s, a, ns, na):
        # Compute a distance metric between (s, a) and (ns, na).
        # Using the max-norm (Chebyshev distance) as in the paper for now.
        sa = np.append(s, a)
        nsa = np.append(ns, na)
        return distance(sa, nsa)

    def approx_nn(self, s, a):
        #dist, indices = self.LSH.kneighbors(np.append(s, a))
        # Returns the sample-list indices of the approximate nearest neighbours
        l = self.engine.neighbours(np.append(s, a))
        indices = [elem[1] for elem in l]
        return indices

    def sa_tuple(self, s, a):
        return tuple(np.append(s, a))

    def Q_tilda(self, s, a):
        # The approximate Q function
        k = self.k
        q = 0.0
        # First get the k nearest sampled neighbours to this point using LSH
        indices = self.approx_nn(s, a)
        num_neighbors = 0
        for index in indices:
            sj, aj = self.sample_list[index]
            dij = self.d(s, a, sj, aj)
            if dij <= (self.qmax / self.LQ):
                xj = self.sample_values[self.sa_tuple(sj, aj)]
                q += dij * self.LQ + xj
                num_neighbors += 1
        # In case there were fewer than k neighbours, use qmax_tilda
        # for the remainder
        for i in range(num_neighbors, k):
            q += self.qmax_tilda
        # Return the average Q
        return q / k

    def Qs(self, s, terminal, phi_s=None):
        # Q -> array of Q(s, a) values for this state
        # A -> corresponding IDs
        # Before any learning is done, the experiment calls the policy to
        # estimate prior performance. In that case, the LSHF would throw a
        # ValueError. We pre-empt that here.
        Q = np.zeros((self.actions_num))
        #try:
        #    self.LSH.kneighbors(np.append(s, 0))
        #except ValueError:
        #    return Q
        for a in range(self.actions_num):
            Q[a] = self.Q_tilda(s, a)
        return Q
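The approximate-kNN machinery of this representation can be exercised on its own. A minimal sketch using only the NearPy pieces the class already relies on; the 7-dimensional vectors match init_randomization above, and the random data is purely illustrative:

import numpy as np

from nearpy import Engine
from nearpy.distances import ChebyshevDistance
from nearpy.filters import NearestFilter
from nearpy.hashes import RandomBinaryProjections

# Same configuration as init_randomization, with k = 1
engine = Engine(7, lshashes=[RandomBinaryProjections('rbp', 10)],
                vector_filters=[NearestFilter(1)],
                distance=ChebyshevDistance())

# Store a few (s, a) vectors keyed by their sample-list index
for idx in range(5):
    engine.store_vector(np.random.randn(7), idx)

# Retrieve the indices of the nearest stored (s, a) pairs, as approx_nn does
neighbours = engine.neighbours(np.random.randn(7))
print [item[1] for item in neighbours]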
import json

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis

dimension = 100
lshash = RandomBinaryProjections('DocHash', 12, rand_seed=123)
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=1))
engine = Engine(dimension, lshashes=[lshash], storage=redis_storage)

with open("ids.json") as f:
    ids = json.load(f)

docvecs = np.load("hndbow.docvecs.doctag_syn0.npy", mmap_mode='r')

for i, doc_id in enumerate(ids):
    vec = docvecs[i]  # 1x100 nparray
    engine.store_vector(vec, doc_id)

# Persist the projection matrix so queries can reuse the same hash
redis_storage.store_hash_configuration(lshash)
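The matching query side rebuilds the same hash from the stored configuration instead of generating fresh random projections, using the load_hash_configuration/apply_config round-trip that the storage tests later in this document rely on. A sketch; the choice of query document is arbitrary:

import numpy as np

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=1))

# Restore the persisted projections instead of generating new random ones
lshash = RandomBinaryProjections(None, None)
lshash.apply_config(redis_storage.load_hash_configuration('DocHash'))

engine = Engine(100, lshashes=[lshash], storage=redis_storage)

# Query with any 100-dimensional vector, e.g. an indexed document's own vector
docvecs = np.load("hndbow.docvecs.doctag_syn0.npy", mmap_mode='r')
for vector, data, distance in engine.neighbours(docvecs[0]):
    print data, distance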
def example2():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't use too many, because of the exact search)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the HashPermutations engine
    print '\nNeighbour distances with HashPermutations:'
    print ' -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours, by exact search
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the HashPermutationMapper engine
    print '\nNeighbour distances with HashPermutationMapper:'
    print ' -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours, by exact search
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on the multi-hash engine
    print '\nNeighbour distances with multiple binary hashes:'
    print ' -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours, by exact search
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
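Neither example function is invoked by the snippets themselves; assuming example1 and example2 live in the same module, a minimal driver would be:

if __name__ == '__main__':
    example1()
    example2()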
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        self.permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix size: ', len(matrix)

        rbp_perm2 = PCABinaryProjections(
            'testPCABPHash', lsh_project_num, matrix)
        self.permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(
            self.dimension, lshashes=[self.permutations2],
            distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1
        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)

    def update(self, person, feature):
        print feature
        v = map(float, feature.split(','))
        epoch_time = long(time.time())
        f_name = person + '_' + str(epoch_time)
        print f_name
        self.engine.store_vector(v, f_name)

    def query(self, person_feature):
        dists = []
        scores = []
        query = map(float, person_feature.split(','))
        # print '\nNeighbour distances with multiple binary hashes:'
        # print ' -> Candidate count is %d' % self.engine.candidate_count(query)
        results = self.engine.neighbours(query)
        # Each result is (vector, data, distance); x[1] is the stored name
        dists = dists + [x[1] for x in results]
        scores = scores + [x[2] for x in results]
        res = zip(dists, scores)
        res.sort(key=lambda t: t[1])
        return res[:self.neighbour]
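A hedged usage sketch for this class; the feature file name, the 128-dimensional descriptor, and the identity name are placeholders, not part of the original:

# Hypothetical driver: index a feature file, register one new identity,
# then query with its descriptor.
feature = ','.join(['0.1'] * 128)  # stand-in for a real face descriptor
searcher = LSHSearch('features.txt', 128, 10, 4)
searcher.build()
searcher.update('alice', feature)
for name, score in searcher.query(feature):
    print name, score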
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections and at least 20 results
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100-dimensional feature space; do not forget to
        # set the nearest filter to 20, because the default is 10
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections and at least 20 results
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check that both hashes produce
        # identical keys
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections and at least 20 results
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check that both hashes produce
        # identical keys
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
import json

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis

from ne import CosineSim  # CosineSim is a local module, not part of NearPy

dimension = 100

with open("hndbow.index2word", 'r') as f:
    index2words = json.load(f)

wordvecs = np.load("hndbow.syn0.npy")

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=3))
lshash = RandomBinaryProjections('WordHash', 5, rand_seed=123)
engine = Engine(dimension, distance=CosineSim(), lshashes=[lshash],
                storage=redis_storage)

for i, w in enumerate(index2words):
    vec = wordvecs[i]  # 1x100 nparray
    engine.store_vector(vec, w)

# Persist the projection matrix so queries can reuse the same hash
redis_storage.store_hash_configuration(lshash)