def loadHashmap(self, feature_size, result_n):
    # Create redis storage adapter
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    try:
        # Get hash config from redis
        config = redis_storage.load_hash_configuration('test')
        # Config exists: create hash with None parameters and
        # apply the configuration loaded from redis
        lshash = RandomBinaryProjections(None, None)
        lshash.apply_config(config)
    except Exception:
        # Config does not exist: create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    # Create engine for the feature space and use our hash. This sets the
    # dimension of the lshash only the first time, not when using the
    # configuration loaded from redis. Use redis storage to store buckets.
    nearest = NearestFilter(result_n)
    self.engine = Engine(feature_size, lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage,
                         distance=EuclideanDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
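# A minimal sketch of the "indexing or querying" step referenced in
# loadHashmap() above. `indexer` stands in for an instance of the
# surrounding class and the feature matrix is made up for illustration.
import numpy as np

features = np.random.randn(1000, 100)
indexer.loadHashmap(feature_size=100, result_n=10)
for i, row in enumerate(features):
    indexer.engine.store_vector(row, 'row_%d' % i)
# neighbours() returns (vector, data, distance) tuples
for vec, data, dist in indexer.engine.neighbours(features[0]):
    print(data, dist)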
def knn(data, k):
    assert k <= len(data) - 1, \
        'The number of neighbors must be smaller than the data cardinality (minus one)'
    # The query point is returned as its own first neighbour, so fetch k + 1
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])
    for i in range(n):
        engine.store_vector(data[i], i)
    for i in range(n):
        N = engine.neighbours(data[i])
        # Drop the query point itself (the first result)
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])
    return N, dist, ind
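# Minimal usage sketch for knn() above; the data values are illustrative.
import numpy as np

data = np.random.randn(500, 32)
_, dist, ind = knn(data, k=5)
# ind[i] holds the stored keys of the 5 approximate neighbours of row i
# (the query point itself is dropped); dist[i] holds their distances.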
def RunAnnNearpy(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
    train, label = SplitTrainData(self.dataset)

    with totalTimer:
        try:
            # Perform Approximate Nearest-Neighbors search.
            dimension = train.shape[1]
            rbp = RandomBinaryProjections('rbp', 10)
            engine = Engine(dimension, lshashes=[rbp])
            for i in range(len(train)):
                engine.store_vector(train[i], 'data_%d' % i)
            for i in range(len(queryData)):
                v = engine.neighbours(queryData[i])
        except Exception as e:
            Log.Info(e)
            q.put(e)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def test_retrieval(self):
    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

    # Create engine for 100 dimensional feature space, do not forget to set
    # nearest filter to 20, because default is 10
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 200000 random vectors
    for k in range(200000):
        x = numpy.random.randn(100)
        x_data = 'data'
        self.engine.store_vector(x, x_data)

    # Now do random queries and check result set size
    for k in range(10):
        x = numpy.random.randn(100)
        n = self.engine.neighbours(x)
        self.assertEqual(len(n), 20)
def index_in_text_engine(nid_gen, tfidf, lsh_projections, tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features, lshashes=[lsh_projections],
                         distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            array = tfidf[row_idx]
        else:
            # Densify one sparse row at a time
            sparse_row = tfidf.getrow(row_idx)
            array = sparse_row.todense().A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str(et - st))
    return text_engine
def index_user_vectors():
    global engine_perm

    t0 = time.time()

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)
    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
def load_engine(sdf_files, feature_matrix, dimension):
    """
    Converts the given sdf_files into instances of the sdf_class, then loads
    them into a nearpy Engine.

    Parameters
        sdf_files: list of sdf file paths relative to the current directory,
            intended to be fed in from `find_sdf(root_dir)`
        feature_matrix: matrix of training data features to be loaded into the engine
        dimension: dimensionality of the feature vectors used for LSH
            (here: number of cluster centers)

    Returns
        engine: instance of a nearpy Engine with all of sdf_files loaded

    Sample Usage
        >>> engine = load_engine(sdf_files, feature_matrix, dimension)
    """
    # The projection count here can be altered as well
    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dimension, lshashes=[rbp])

    for index, file_ in enumerate(sdf_files):
        if index % 100 == 0:
            print('Converted %d files' % index)
        converted = SDF(file_)
        converted.set_feature_vector(feature_matrix[index])
        converted.add_to_nearpy_engine(engine)
    return engine
def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
    self.feature_file = feature_file
    self.dimension = dimension
    self.neighbour = neighbour
    self.face_feature = defaultdict(str)
    self.ground_truth = defaultdict(int)

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    tmp_feature = defaultdict(str)
    with open(feature_file, 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        for name, feature in reader:
            tmp_feature[name] = feature

    matrix = []
    label = []
    for item in tmp_feature.keys():
        v = list(map(float, tmp_feature[item].split(',')))
        matrix.append(np.array(v))
        label.append(item)
    random.shuffle(matrix)
    print('PCA matrix size: %d' % len(matrix))

    rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    nearest = NearestFilter(self.neighbour)
    self.engine = Engine(self.dimension, lshashes=[permutations2],
                         distance=CosineDistance(), vector_filters=[nearest])
def LSH(Layers, K):
    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    # One random binary projection hash per layer
    rbp = [RandomBinaryProjections(str(i), K) for i in range(Layers)]

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index the vectors (set their data to a unique metadata string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]
        meta_data = (str(index) + ',' + str(int(video_data[index, 0])) + ', ' +
                     str(int(video_data[index, 1])) + ', ' +
                     str(int(video_data[index, 2])) + ', ' +
                     str(video_data[index, 3]) + ', ' + str(video_data[index, 4]))
        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # Hash the normalised training examples, storing each label as data
    for train_example, label in zip(X_train, y_train):
        normalized = train_example / np.linalg.norm(train_example)
        engine.store_vector(normalized, label.tolist())

    labels = []
    for test_example in X_test:
        # neighbours() yields (vector, data, distance) tuples
        neighbors = engine.neighbours(test_example)
        labels.append([data for _, data, _ in neighbors])
    return labels
def __init__(self, num_features, projection_count=30):
    self.num_features = num_features
    # Alternatives: RandomDiscretizedProjections('default', projection_count, bin_width=100)
    # or RandomBinaryProjectionTree('default', projection_count, 1)
    self.rbp = RandomBinaryProjections('default', projection_count)
    self.text_engine = Engine(num_features, lshashes=[self.rbp],
                              distance=CosineDistance())
def __init__(self, x):
    self.n, self.f = x.shape
    # Use NearPy LSH for fast approximate nearest neighbours
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = x[i, :]
        self.engine.store_vector(v, i)
def main(args):
    """ Main entry. """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the top-k closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in range(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in range(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, num))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def fit(self, X):
    b = self.params['b']
    self.n, self.f = X.shape
    # Use NearPy LSH for fast approximate nearest neighbours
    rbp = RandomBinaryProjections('rbp', b)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = np.squeeze(np.copy(X[i, :]))
        self.engine.store_vector(v, i)
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows, num_cols = decade_matrix.get_shape()
    print("the number of rows: " + str(num_rows))

    # The engine dimension must be the vector length (columns), not the row count
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
def __init__(self, dimension, n_bit, alpha):
    self.n_bit = n_bit
    self.dim = dimension
    self.alpha = alpha
    self.sample_space = 2 ** n_bit
    self.rbp = RandomBinaryProjections('rbp', self.n_bit)
    self.engine = Engine(dimension, lshashes=[self.rbp])
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]
    rbp = RandomBinaryProjections('rbp', 32)
    engine = Engine(f, lshashes=[rbp])
    for i in range(n):
        engine.store_vector(X[i], 'data_%d' % i)
    return engine
def __init__(self, hasher, number_of_tables=8, length_of_tables=32,
             bin_width=1.0, match_thresh=0.2):
    """
    :param hasher: @type hasher: Hasher
    """
    LSHIndex.__init__(self, hasher, match_thresh=match_thresh)
    self.setName(number_of_tables=number_of_tables,
                 length_of_tables=length_of_tables,
                 match_thresh=match_thresh,
                 bin_width=bin_width)
    self.tables = [None] * number_of_tables
    for i in range(number_of_tables):
        self.tables[i] = RandomDiscretizedProjections(str(i), length_of_tables, bin_width)
    self.engine = Engine(self.hasher.dims(), lshashes=self.tables,
                         fetch_vector_filters=[NoVectorFilter()])
def __configure_calculator(self, point_list, point):
    # Dimension of our vector space
    self.__dimension__ = 2
    # Create a random binary hash with 10 bits
    self.__rbp__ = RandomBinaryProjections('rbp', 10)
    # Create engine with pipeline configuration
    self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
    self.set_searching_point_list(point_list)
    self.set_query_point(point)
def __init__(self, emb_path, feature='title'):
    self.emb_path = emb_path
    self.feature = feature
    self.data_df = None
    self.tfidf = Vectorizer(**get_tfidf_params())
    self.fasttext_embedder = None
    self.fasttext_tfidf = None
    self.dimension = 300
    rbp = RandomBinaryProjections('rbp', 2)
    self.engine = Engine(self.dimension, lshashes=[rbp])
def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
    self.data_points = data_points
    self.point_num = self.data_points.shape[0]
    # The first column is skipped when storing, so it is excluded here
    self.dimension = self.data_points.shape[1] - 1

    # Create a random binary hash with num_vectors bits
    self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
    self.engine = Engine(
        self.dimension,
        lshashes=[self.rbp],
        vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
    for i in range(self.point_num):
        self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
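# A minimal query sketch for the constructor above; `LSHIndexClass` stands
# in for whatever the surrounding class is actually called, and the data
# layout (id column followed by coordinates) follows the constructor.
import numpy as np

points = np.hstack([np.arange(100).reshape(-1, 1), np.random.randn(100, 8)])
index = LSHIndexClass(points, sim_threshold=0.7, num_vectors=4)
# neighbours() returns (vector, data, distance) tuples; the filter keeps
# only results with distance below 1 - sim_threshold
for vec, data, dist in index.engine.neighbours(points[0, 1:]):
    print('point %s at distance %.3f' % (data, dist))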
def _create_engine(self, k, lshashes=None):
    self.k_ = k
    self.engine_ = Engine(self.dimension_, lshashes,
                          distance=self.dist_metric_,
                          vector_filters=[NearestFilter(k)])

    for i, feature in enumerate(self.featurized_):
        if self.transpose_:
            self.engine_.store_vector(feature.T, i)
        else:
            self.engine_.store_vector(feature, i)
def fit(self, X, y=None, hash="randbinary"):
    X = np.array(X)
    assert len(X.shape) == 2, "X not 2-rank"
    dimension = X.shape[-1]
    if hash == "randbinary":
        rbp = RandomBinaryProjections('rbp', 10)
    elif hash == "pcabinary":
        rbp = PCABinaryProjections('rbp', 10, training_set=X)
    self.engine = Engine(dimension, lshashes=[rbp])
    for index, x in enumerate(X):
        self.engine.store_vector(x, str(index))
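# A possible query counterpart to fit() above; kneighbors() is a
# hypothetical helper, not part of the original class.
def kneighbors(self, x, n_neighbors=10):
    # neighbours() returns (vector, data, distance) tuples; the stored
    # data is the row index as a string (see fit() above)
    results = self.engine.neighbours(np.asarray(x))[:n_neighbors]
    return [(int(data), dist) for _, data, dist in results]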
def test_experiment_with_unibucket_1(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def startEngine():
    archive = redis.StrictRedis(host='login-node03', port=6380)
    redis_storage = RedisStorage(archive)
    config = redis_storage.load_hash_configuration('pcahash')
    if not config:
        logging.error("LSHash not configured")
        sys.exit(0)  # TODO: Gracefully exit

    # Create the hash with empty parameters, then apply the stored config
    # lshash = RandomBinaryProjections(None, None)
    lshash = PCABinaryProjections(None, None, None)
    lshash.apply_config(config)

    eng = Engine(num_pc * 454, lshashes=[lshash], storage=redis_storage)
    return eng
def data_for_layer(basic_path, layer_name, num_folds, experiment,
                   projection_count, start_pc_component, end_pc_component):
    # Read datasets
    basic_path_layer = os.path.join(basic_path, layer_name)

    dataset_files = "ALOI_train_20400.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset_aloi = hd['dataset_1']
    dataset_train_aloi, dataset_test_aloi = split_data_to_test_train(
        dataset_aloi, num_folds, experiment)
    del dataset_aloi

    transformer = TransformImagesPCA(n_components=500)
    transformer.learn_pcs(dataset_train_aloi)
    del dataset_train_aloi

    dataset_files = "Google_train_6675.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset_google = hd['dataset_1']
    dataset_train_google, dataset_test_google = split_data_to_test_train(
        dataset_google, num_folds, experiment)
    del dataset_google

    transformer.learn_pcs(dataset_train_google)
    del dataset_train_google

    dataset_files = "Nexus_train_1180.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset = hd['dataset_1']
    dataset_train, dataset_test = split_data_to_test_train(
        dataset, num_folds, experiment)
    del dataset

    transformer.learn_pcs(dataset_train)
    del dataset_train

    pc_test_nexus = transformer.transform(dataset_test)[:, start_pc_component:end_pc_component]
    pc_test_aloi = transformer.transform(dataset_test_aloi)[:, start_pc_component:end_pc_component]
    pc_test_google = transformer.transform(dataset_test_google)[:, start_pc_component:end_pc_component]

    # Find the LSH vectors
    rbp = RandomBinaryProjections('rbp', projection_count, rand_seed=723657345)
    engine = Engine(end_pc_component - start_pc_component, lshashes=[rbp])
    pc_test_nexus = project_LSH(pc_test_nexus, rbp)
    pc_test_aloi = project_LSH(pc_test_aloi, rbp)
    pc_test_google = project_LSH(pc_test_google, rbp)
    return pc_test_nexus, pc_test_aloi, pc_test_google
def start(dataset, test_vector, num_nearest=5):
    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration; the dimension is the
    # vector length, not the full shape tuple
    engine = Engine(dataset.shape[1], lshashes=[rbp])

    # Index the vectors (set their data to a unique string)
    for i, v in enumerate(dataset):
        engine.store_vector(v, 'data_%d' % i)

    # Get nearest neighbours
    N = engine.neighbours(test_vector)
    return N[:num_nearest]
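# Example invocation of start() above (the data is illustrative; the return
# of the top num_nearest results follows the fix applied in the function).
import numpy as np

dataset = np.random.randn(10000, 64)
query = np.random.randn(64)
top5 = start(dataset, query, num_nearest=5)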
def __init__(self, distanceMeasure="EuclideanDistance"):
    self.res_similar = ResnetSimilarity()
    dimension = 2048
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine = Engine(dimension, lshashes=[rbp])

    # Load the prebuilt hash table matching the requested distance measure
    if distanceMeasure == "EuclideanDistance":
        self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb')
    elif distanceMeasure == "Test":
        self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb')
    else:
        self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb')

    # The pickled engine replaces the freshly constructed one
    self.engine = pickle.load(self.filehandler)
    self.filehandler.close()
    print("Hash Table Loaded")
def get_engine(self, vocab, vecs):
    logging.info('{} hash functions'.format(self.args.projections))
    hashes = [PCABinaryProjections('ne1v', self.args.projections,
                                   vecs[:1000, :].T)]
    engine = Engine(vecs.shape[1], lshashes=hashes, distance=[],
                    vector_filters=[])
    for ind, vec in enumerate(vecs):
        if not ind % 100000:
            logging.info('{} words added to nearpy engine'.format(ind))
        engine.store_vector(vec, ind)
    return engine
def setUp(self):
    logging.basicConfig(level=logging.WARNING)

    # Create permutations meta-hash
    self.permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 4)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    self.permutations.add_child_hash(rbp, rbp_conf)

    # Create engine with meta hash and cosine distance
    self.engine_perm = Engine(200, lshashes=[self.permutations],
                              distance=CosineDistance())

    # Create engine without permutation meta-hash
    self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
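# A minimal sketch of how the two engines from setUp() might be exercised
# inside the same TestCase; the vector count and assertions are illustrative.
def test_permuted_lookup_sketch(self):
    vectors = [numpy.random.randn(200) for _ in range(1000)]
    for i, v in enumerate(vectors):
        self.engine_perm.store_vector(v, i)
        self.engine.store_vector(v, i)

    # The permuted index must be (re)built after storing vectors
    self.permutations.build_permuted_index()

    # Both engines can now answer approximate neighbour queries
    query = numpy.random.randn(200)
    self.assertIsInstance(self.engine_perm.neighbours(query), list)
    self.assertIsInstance(self.engine.neighbours(query), list)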