def example2():
    """Compare three LSH set-ups against exact cosine search on random data.

    For each of HashPermutations, HashPermutationMapper and a bank of 20
    RandomBinaryProjections hashes, this indexes POINTS random vectors,
    prints the indexing time, runs one random query and prints the
    approximate neighbour distances next to the ten smallest exact ones.
    """
    # Dimension of feature space
    DIM = 100
    # Number of data points (kept moderate because the exact search is brute force)
    POINTS = 20000

    def index_random_vectors(target):
        """Store POINTS random vectors in *target*; return them as a matrix."""
        stored = numpy.zeros((POINTS, DIM))
        for row in xrange(POINTS):
            vec = numpy.random.randn(DIM)
            stored[row] = vec
            target.store_vector(vec)
        return stored

    def report_indexing_time(started):
        """Print the wall-clock time elapsed since *started*."""
        finished = time.time()
        print('Indexing took %f seconds' % (finished - started))

    def query_and_compare(target, stored, title):
        """Run one random query on *target* and print LSH vs. exact distances."""
        probe = numpy.random.randn(DIM)
        print(title)
        print(' -> Candidate count is %d' % target.candidate_count(probe))
        approx = [hit[2] for hit in target.neighbours(probe)]
        print(approx)

        # Exact distances of the probe to every stored vector
        print('\nReal neighbour distances:')
        exact = CosineDistance().distance_matrix(stored, probe.reshape((1, DIM)))
        exact = sorted(exact.reshape((-1,)))
        print(exact[:10])

    ##########################################################

    print('Performing indexing with HashPermutations...')
    started = time.time()

    # Permutations meta-hash wrapping one binary projection child hash
    permutations = HashPermutations('permut')
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}
    permutations.add_child_hash(rbp_perm, rbp_conf)

    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())
    matrix = index_random_vectors(engine_perm)

    # The permuted index has to be built once all vectors are stored
    permutations.build_permuted_index()
    report_indexing_time(started)

    query_and_compare(engine_perm, matrix,
                      '\nNeighbour distances with HashPermutations:')

    ##########################################################

    print('\nPerforming indexing with HashPermutationMapper...')
    started = time.time()

    # Permutation mapper meta-hash wrapping one binary projection child hash
    permutations2 = HashPermutationMapper('permut2')
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)
    permutations2.add_child_hash(rbp_perm2)

    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())
    matrix = index_random_vectors(engine_perm2)
    report_indexing_time(started)

    query_and_compare(engine_perm2, matrix,
                      '\nNeighbour distances with HashPermutationMapper:')

    ##########################################################

    print('\nPerforming indexing with mutliple binary hashes...')
    started = time.time()

    # Twenty independent binary projection hashes in one engine
    hashes = [RandomBinaryProjections('rbp_%d' % k, 10) for k in range(20)]

    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())
    matrix = index_random_vectors(engine_rbps)
    report_indexing_time(started)

    query_and_compare(engine_rbps, matrix,
                      '\nNeighbour distances with mutliple binary hashes:')
class LSHSearch:
    """Nearest-neighbour search over feature vectors via a PCA-based LSH engine.

    The feature file is space-delimited with two columns per row: an item
    name and a comma-separated list of floats. Names are expected to look
    like ``<person>_<suffix>``; everything before the last underscore is
    used as the ground-truth key.
    """

    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        """Train the PCA hash on the feature file and create the engine.

        Args:
            feature_file: path of the space-delimited feature file.
            dimension: dimension passed to the engine.
            neighbour: number of results kept by the NearestFilter and
                returned by ``query``.
            lsh_project_num: projection count for PCABinaryProjections.
        """
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        # Later rows with the same name override earlier ones.
        tmp_feature = {}
        for name, feature in self._read_rows():
            tmp_feature[name] = feature

        matrix = [np.array(self._parse_feature(feature))
                  for feature in tmp_feature.values()]
        random.shuffle(matrix)
        print('PCA matric :  %d' % len(matrix))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2],
                             distance=CosineDistance(), vector_filters=[nearest])

    def _read_rows(self):
        """Return the (name, feature_string) rows of the feature file."""
        # csv wants a binary file on Python 2 and a text file on Python 3.
        mode = 'rb' if str is bytes else 'r'
        with open(self.feature_file, mode) as f:
            return [(name, feature) for name, feature in csv.reader(f, delimiter=' ')]

    @staticmethod
    def _parse_feature(feature):
        """Turn a comma-separated string into a list of floats."""
        # A real list (not a lazy map object) so it works on Python 2 and 3.
        return [float(value) for value in feature.split(',')]

    def build(self):
        """Load all features, count items per person and index every vector."""
        for name, feature in self._read_rows():
            self.face_feature[name] = feature
            person = '_'.join(name.split('_')[:-1])
            self.ground_truth[person] += 1

        for name, feature in self.face_feature.items():
            self.engine.store_vector(self._parse_feature(feature), name)

    def query(self, person_list):
        """Return the closest distinct items to the given query items.

        Args:
            person_list: names of already-indexed items to query with.

        Returns:
            Up to ``self.neighbour`` tuples ``(name, distance, true_count)``
            sorted by ascending distance, without duplicates and without
            the query items themselves.
        """
        names = []
        scores = []
        for person in person_list:
            query = self._parse_feature(self.face_feature[person])
            print('\nNeighbour distances with mutliple binary hashes:')
            print(' -> Candidate count is %d' % self.engine.candidate_count(query))
            results = self.engine.neighbours(query)
            names.extend(x[1] for x in results)
            scores.extend(x[2] for x in results)

        t_num = [self.ground_truth['_'.join(name.split('_')[:-1])] for name in names]
        # sorted() is stable like list.sort() and also accepts a lazy zip.
        res = sorted(zip(names, scores, t_num), key=lambda t: t[1])
        return self.f7(res, person_list)[:self.neighbour]

    def true_num(self, person):
        """Return how many indexed items belong to *person*."""
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        """Keep the first row per name, dropping names in *person_list*.

        Order of *zip_seq* is preserved.
        """
        excluded = set(person_list)
        seen = set()
        unique = []
        for row in zip_seq:
            name = row[0]
            if name in seen or name in excluded:
                continue
            seen.add(name)
            unique.append(row)
        return unique
def example1():
    """Index random vectors in four LSH engines and compare one query.

    The engines use RandomBinaryProjectionTree, RandomBinaryProjections,
    HashPermutations and HashPermutationMapper. After indexing, a single
    random query is run against each engine and against an exact
    cosine-distance scan over the unit-normalised vectors.
    """
    # Dimension of feature space
    DIM = 100
    # Number of data points (kept moderate because the exact search is brute force)
    POINTS = 10000

    print('Creating engines')

    # Engine 1: projection tree hash
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Engine 2: plain binary projections
    rbp = RandomBinaryProjections('rbp1', 20)
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Engine 3: permutations meta-hash with a binary projection child
    permutations = HashPermutations('permut')
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}
    permutations.add_child_hash(rbp_perm, rbp_conf)
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Engine 4: permutation mapper meta-hash with a binary projection child
    permutations2 = HashPermutationMapper('permut2')
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)
    permutations2.add_child_hash(rbp_perm2)
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # Every engine stores the raw vector; the matrix keeps the unit vector
    matrix = numpy.zeros((POINTS, DIM))
    storage_order = (engine, engine_rbpt, engine_perm, engine_perm2)
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i, :] = nearpy.utils.utils.unitvec(v)
        for target in storage_order:
            target.store_vector(v, i)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # The permuted index has to be built once all vectors are stored
    permutations.build_permuted_index()

    print('Generate random data')

    # Random query vector shared by all engines
    query = numpy.random.randn(DIM)

    query_plan = (
        ('\nNeighbour distances with RandomBinaryProjectionTree:', engine_rbpt),
        ('\nNeighbour distances with RandomBinaryProjections:', engine),
        ('\nNeighbour distances with HashPermutations:', engine_perm),
        ('\nNeighbour distances with HashPermutations2:', engine_perm2),
    )
    for title, target in query_plan:
        print(title)
        print(' -> Candidate count is %d' % target.candidate_count(query))
        print_results(target.neighbours(query))

    # Exact neighbours: distance of the unit query to every unit vector
    print('\nReal neighbour distances:')
    unit_query = nearpy.utils.utils.unitvec(query).reshape((DIM, 1))
    dists = CosineDistance().distance(matrix, unit_query).reshape((-1, ))
    closest = numpy.argsort(dists)[:10]
    print_results([(None, d, dists[d]) for d in closest])
class ClusterAnalyser:
    """Incrementally assigns tweets to clusters using an LSH index of centres.

    Each cluster is stored in ``self.clusters`` under a numeric id and its
    centre vector is stored in ``self.lsh_engine`` with that id as data.

    NOTE(review): the source this was reconstructed from had lost its
    indentation; nesting choices that could not be derived from data flow
    are marked with NOTE(review) below and should be confirmed.
    """

    def __init__(self):
        from multiprocessing import Pool
        self.resetClusters()
        # Number of input lines between two tuneClustersCall() runs.
        self.TUNE_INTERVAL = 1000
        # Running sum / count of LSH candidate counts (reset on every status print).
        self.ncnttot = 0
        self.ncntq = 0
        # How many split / join cases may still be pickled to disk.
        self.store_cnt = 20
        self.store_join_cnt = 20
        self.p = Pool(20)
        self.entropyLikelyhood = True
        self.tuneClusters = True
        self.cutters = [[0, None, 'getRandomContractionsMinCut']]
        self.simgraphparams = dict(usedropout=False)
        # Compared against time.time() differences in construct_clusters.
        self.max_runtime = 1200
        self.start_time = None
        self.overrun = False
        self.min_lines_per_second = 20

    def resetClusters(self):
        """Drop all clusters and create an empty LSH engine."""
        # Every new cluster gets an unique id which is the key for this dictionary
        self.clusters = {}
        self.next_cluster_id = FI_CLUSTER_ID_OFFSET if opt_lang == 'fi' else 0

        # Locality Sensitive Hashing
        self.lsh_engine = Engine(
            vecs.dim,
            lshashes=[RandomBinaryProjections('rpb', HYPERPLANE_COUNT)
                      for i in range(HASH_LAYERS)],
            distance=lsh_distance_func)

    def query_clusters(self, query, idfs):
        """Return ``(distance, cluster)`` tuples for *query*, closest first.

        Returns None when the query cannot be turned into a vector.
        """
        # The original signature lacked ``self`` although the body reads
        # self.clusters, so the method could not be called on an instance.
        doc_vec = document_to_vector(query.split(' '), idfs)

        if doc_vec is None:
            return None

        return sorted([(1 - doc_vec.dot(c.center) / c.norm, c)
                       for c in self.clusters.values()])

    def lookupNearest(self, doc_vec, keywords=None, similarity=None):
        """Return the id of the best matching cluster, or -1 if none passes.

        Candidates come from the LSH engine, are filtered by keywords
        (``filterKeywordSimilarity`` when *similarity* is not None, else
        ``filterKeywords``) and ranked by distance divided by cluster power.
        """
        lowest_index = -1
        ncnt = self.lsh_engine.candidate_count(doc_vec)
        self.ncnttot += ncnt
        self.ncntq += 1

        keyword_filter = filterKeywordSimilarity if similarity is not None else filterKeywords
        nearest_neighbours = [x for x in self.lsh_engine.neighbours(doc_vec)
                              if keyword_filter(self.clusters[x[1]], keywords)]

        if len(nearest_neighbours) > 0:
            # get closest one from tuple (cluster vector, cluster index, distance)
            nn = min(nearest_neighbours, key=lambda x: (x[2] / self.clusters[x[1]].power))
            lowest_index = nn[1]
        return lowest_index

    def initNewCluster(self, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time):
        """Create a cluster through makeNewCluster and register it."""
        c = makeNewCluster(self.next_cluster_id, doc_vec, tweet_data, line,
                           tweet_time, lang, tweet_post_time)
        self.addCluster(c)

    def addCluster(self, c):
        """Index *c* under the next free cluster id and advance the id."""
        self.lsh_engine.store_vector(c.center, self.next_cluster_id)
        self.clusters[self.next_cluster_id] = c
        self.next_cluster_id += 1

    def tuneClustersCall(self):
        """Split recently updated clusters and try to merge near neighbours."""
        line = self.line
        deleted_clusters = []

        print('parallel preprocessing ... ')
        # parallel preprocessing: one split analysis per cluster
        dlist = list(self.clusters.items())
        params = [[self.line - self.TUNE_INTERVAL, self.entropyLikelyhood,
                   self.cutters, self.simgraphparams]] * len(dlist)
        split_test_out = dict(self.p.map(doAnalyseSplit, zip(dlist, params)))
        print('done')

        # Iterate over a snapshot because clusters are added and removed below.
        for c_idx, c in list(self.clusters.items()):
            if c_idx in deleted_clusters:
                continue

            # NOTE(review): both the split and the merge test are assumed to
            # apply only to clusters updated since the last tuning run.
            if c.last_update > line - self.TUNE_INTERVAL:
                if len(c.documents) > 10 and split_test_out[c_idx]['result']:
                    a = split_test_out[c_idx]['a']
                    b = split_test_out[c_idx]['b']
                    probJoin = split_test_out[c_idx]['probJoin']
                    probSplit = split_test_out[c_idx]['probSplit']

                    # Part a stays in this cluster ...
                    c.documents = list(map(lambda x: x[0], a))
                    c.text_data = list(map(lambda x: x[1], a))
                    c.word_index = dict()
                    for t in c.text_data:
                        for w in list(filter(lambda x: len(x) > 3, t[0][0].split(' ')[2:])):
                            c.word_index[w] = ''

                    self.lsh_engine.delete_vector(c_idx)
                    c.center = np.mean(c.documents, axis=0)
                    c.norm = np.linalg.norm(c.center)
                    c.updatePower()
                    self.lsh_engine.store_vector(c.center, c_idx)

                    # ... part b becomes a new cluster; copy time parameters for now
                    print("Split cluster %d into %d and %d  %f < %f" % (c_idx, len(a), len(b), probJoin, probSplit))
                    self.initNewCluster(list(map(lambda x: x[0], b)),
                                        list(map(lambda x: x[1][0], b)),
                                        c.last_update, c.created_at, c.lang,
                                        list(map(lambda x: x[1][1], b)))
                    if self.store_cnt > 0:
                        # Close the dump file instead of leaking the handle.
                        with open('stored_split_cases_%d.pckl' % self.store_cnt, 'wb') as dump_file:
                            pickle.dump(dict(a=a, b=b, probJoin=probJoin, probSplit=probSplit), dump_file)
                        self.store_cnt -= 1

                if len(c.documents) > 30:
                    # Test merge with random nearest
                    nearest_neighbour_clusters = list(filter(
                        lambda x: filterKeywordSimilarity(self.clusters[x[1]], c.word_index),
                        self.lsh_engine.neighbours(c.center)))
                    nearest_neighbour_clusters.sort(key=lambda x: x[2])
                    maxrnd = min(len(nearest_neighbour_clusters), 6)
                    if len(nearest_neighbour_clusters) > 1:
                        ann, bnn = random.sample(nearest_neighbour_clusters[:maxrnd], 2)

                        # list() keeps len() and "+" working when zip is lazy.
                        a = list(zip(self.clusters[ann[1]].documents, self.clusters[ann[1]].text_data))
                        b = list(zip(self.clusters[bnn[1]].documents, self.clusters[bnn[1]].text_data))

                        if len(a) < 20 and (not self.entropyLikelyhood):
                            continue

                        if len(b) < 20 and (not self.entropyLikelyhood):
                            continue

                        if self.clusters[ann[1]].lang != self.clusters[bnn[1]].lang:
                            continue

                        if self.entropyLikelyhood:
                            # NOTE: ``idfs`` is read from module scope here.
                            c = makeNewCluster(self.next_cluster_id,
                                               list(map(lambda x: x[0], a + b)),
                                               list(map(lambda x: x[1][0], a + b)),
                                               max(self.clusters[bnn[1]].last_update, self.clusters[ann[1]].last_update),
                                               max(self.clusters[bnn[1]].created_at, self.clusters[ann[1]].created_at),
                                               self.clusters[ann[1]].lang,
                                               list(map(lambda x: x[1][1], a + b)))
                            probJoin = computeEntropyLikelyhood(c, idfs)
                            wa = len(a) / (float(len(a)) + len(b))
                            probSplit = (wa*computeEntropyLikelyhood(self.clusters[ann[1]],idfs)+(1-wa)*computeEntropyLikelyhood(self.clusters[bnn[1]],idfs))+(wa*math.log(wa)/math.log(2)+(1-wa)*math.log((1-wa))/math.log(2))+random.random()
                        else:
                            probJoin = computeNormalLikelyhood(a + b)
                            probSplit = computeNormalLikelyhood(a) * computeNormalLikelyhood(b)

                        if probJoin > probSplit:
                            deleted_clusters.append(ann[1])
                            deleted_clusters.append(bnn[1])
                            print("Join clusters %d (%d) and %d (%d) %f > %f" % (ann[1], len(a), bnn[1], len(b), probJoin, probSplit))
                            if self.store_join_cnt > 0:
                                with open('stored_join_cases_%d.pckl' % self.store_join_cnt, 'wb') as dump_file:
                                    pickle.dump(dict(a=a, b=b, probJoin=probJoin, probSplit=probSplit), dump_file)
                                self.store_join_cnt -= 1

                            if self.entropyLikelyhood:
                                self.addCluster(c)
                            else:
                                self.initNewCluster(list(map(lambda x: x[0], a + b)),
                                                    list(map(lambda x: x[1][0], a + b)),
                                                    max(self.clusters[bnn[1]].last_update, self.clusters[ann[1]].last_update),
                                                    max(self.clusters[bnn[1]].created_at, self.clusters[ann[1]].created_at),
                                                    self.clusters[ann[1]].lang,
                                                    list(map(lambda x: x[1][1], a + b)))

                            self.lsh_engine.delete_vector(ann[1])
                            self.clusters.pop(ann[1])
                            self.lsh_engine.delete_vector(bnn[1])
                            self.clusters.pop(bnn[1])

    def purgeClusters(self):
        """Remove small clusters that have not been updated for a long time."""
        line = self.line
        stale_ids = [k for k, c in self.clusters.items()
                     if line - c.last_update > (100000 * len(c.documents))
                     and len(c.documents) < 10]

        for cluster_id in stale_ids:
            self.lsh_engine.delete_vector(cluster_id)
            self.clusters.pop(cluster_id)

        if len(stale_ids) > 0:
            print("Removed %d stagnant clusters" % len(stale_ids))

    def calcGrowthRate(self):
        """Ask every cluster to update its growth and sentiment figures."""
        # The original also read self.line, self.tweet_time and
        # self.time_since_last_growth into unused locals; the last one is
        # never assigned in this class, so the reads were dropped.
        for c in self.clusters.values():
            c.calculateGrowthAndSentiment()

    # Every input line is split on commas: field 0 is a timestamp in
    # '%a %b %d %X +0000 %Y' format, fields 1 and 4 are joined (with the
    # timestamp in ms) into the space-separated document, and fields 2 and 3
    # are kept as extra tweet data.
    #
    # Note: minimum word frequency is often implemented by the vector model already
    def construct_clusters(self, filename, from_line=0, from_date=None, to_date=None, idfs=None, lang=None):
        """Stream tweets from *filename* and assign each one to a cluster.

        Args:
            filename: path of the comma-separated tweet file.
            from_line: lines before this line number are skipped.
            from_date: tweets before this date are skipped (None = no limit).
            to_date: processing stops at the first tweet after this date.
            idfs: passed through to document_to_vector.
            lang: must be 'ru' or 'fi'; passed to newly created clusters.

        Sets ``self.overrun`` when the run is cut short by ``max_runtime``
        or by throughput dropping below ``min_lines_per_second``.
        """
        self.start_time = time.time()
        if lang != 'ru' and lang != 'fi':
            print("Lang must be 'ru' or 'fi'")
            return

        tweet_file = open(filename)

        try:
            self.line = 0

            # performance counter
            self.last_print_line = 0
            self.last_print_time = time.time()

            # used for calculating hourly growth in tweet time
            self.last_growth_calc = 0
            self.tweet_time = 0

            self.tweet_time_notz = datetime.utcfromtimestamp(0)

            for twlineesc in tweet_file:
                if time.time() - self.start_time > self.max_runtime:
                    self.overrun = True
                    break

                # NOTE(review): str.decode only exists on Python 2.
                twline = twlineesc.decode('unicode-escape').encode('utf-8')
                if len(twline) < 2:
                    continue
                twsplit = twline.split(',')
                try:
                    unix_tweet_time = int(time.mktime(datetime.strptime(twsplit[0], '%a %b %d %X +0000 %Y').timetuple()) * 1000)
                except Exception:
                    # Show the offending line, then re-raise the real error.
                    # A bare except here used to swallow KeyboardInterrupt too.
                    print(twline)
                    print(twsplit[0])
                    raise
                tweet = " ".join([str(unix_tweet_time), twsplit[1], twsplit[4]])

                self.line += 1

                if self.line < from_line:
                    continue

                if self.tuneClusters:
                    if self.line % self.TUNE_INTERVAL == 0:
                        self.tuneClustersCall()

                # save periodically (currently disabled)
                if False:  # self.line % 1000000 == 0 and self.line != 0:
                    save_results(filename + '_' + str(self.line))

                # remove really old clusters with a small amount of documents
                if self.line % 100000 == 0:
                    self.purgeClusters()

                # print status
                if self.line % 1000 == 0:
                    new_time = time.time()
                    # Guard against a zero interval on coarse clocks.
                    elapsed = max(new_time - self.last_print_time, 0.0000001)
                    lps = int((self.line - self.last_print_line) / elapsed)
                    print("Line: %d, Date: %s, Clusters: %d, %d lines/s AVG candidates: %d" % (self.line, self.tweet_time_notz, len(self.clusters), lps, int(self.ncnttot/(self.ncntq+0.0000001))))
                    self.last_print_line = self.line
                    self.last_print_time = new_time
                    self.ncnttot = 0
                    self.ncntq = 0
                    if time.time() - self.start_time > self.max_runtime or lps < self.min_lines_per_second:
                        self.overrun = True
                        break

                self.tweet_time = unix_tweet_time
                tweet_parts = tweet.strip().split(' ')

                self.tweet_time_notz = datetime.utcfromtimestamp(self.tweet_time * 0.001)
                tweet_time_utc = utc.localize(self.tweet_time_notz)

                if from_date is not None and tweet_time_utc < from_date:
                    continue

                if to_date is not None and tweet_time_utc > to_date:
                    break

                # TEMP ignore gameinsight spam and short tweets
                if len(tweet_parts) < 6 or tweet.find('gameinsight') != -1:
                    continue

                # allocate tweet to cluster
                doc_vec = document_to_vector(tweet_parts[2:], idfs)

                if doc_vec is None:
                    continue

                keywords = list(filter(lambda x: len(x) > 4, tweet.strip().split(' ')[2:]))

                # ignore short tweets
                if len(keywords) < 6:
                    continue

                lowest_index = self.lookupNearest(doc_vec, keywords, similarity=True)

                if lowest_index != -1:
                    c = self.clusters[lowest_index]
                    c.appendTweet(doc_vec, [[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time], self.line)

                    # update the cluster center if the cluster is small
                    # NOTE(review): the power update is assumed to be the
                    # else-branch of the "< 5" test, not of the "> 0" test.
                    if len(c.documents) > 0:
                        if len(c.documents) < 5:
                            self.lsh_engine.delete_vector(lowest_index)
                            c.center = np.mean(c.documents, axis=0)
                            c.norm = np.linalg.norm(c.center)
                            self.lsh_engine.store_vector(c.center, lowest_index)
                        else:
                            if len(c.documents) < 100:
                                c.power = np.mean(np.std(c.documents, axis=0))
                else:
                    # no cluster found, construct new one
                    self.initNewCluster([doc_vec], [[tweet.strip(), twsplit[3], twsplit[2]]],
                                        self.line, self.tweet_time, lang, [self.tweet_time])

        except KeyboardInterrupt:
            print("Line: %d Clusters: %d" % (self.line, len(self.clusters)))
            print("Cancelled")
        finally:
            # The input file was never closed before.
            tweet_file.close()

        # NOTE(review): assumed to run after the try block (normal end and
        # Ctrl-C), not only inside the KeyboardInterrupt handler - confirm.
        self.p.close()
        self.p.join()
def example2():
    """Compare three LSH set-ups against exact cosine search on random data.

    For each of HashPermutations, HashPermutationMapper and a bank of 20
    RandomBinaryProjections hashes, this indexes POINTS random vectors,
    prints the indexing time, runs one random query and prints the
    approximate neighbour distances next to the ten smallest exact ones.
    """
    # Dimension of feature space
    DIM = 100
    # Number of data points (kept moderate because the exact search is brute force)
    POINTS = 20000

    def index_random_vectors(target):
        """Store POINTS random vectors in *target*; return them as a matrix."""
        stored = numpy.zeros((POINTS, DIM))
        for row in xrange(POINTS):
            vec = numpy.random.randn(DIM)
            stored[row] = vec
            target.store_vector(vec)
        return stored

    def report_indexing_time(started):
        """Print the wall-clock time elapsed since *started*."""
        finished = time.time()
        print('Indexing took %f seconds' % (finished - started))

    def query_and_compare(target, stored, title):
        """Run one random query on *target* and print LSH vs. exact distances."""
        probe = numpy.random.randn(DIM)
        print(title)
        print(' -> Candidate count is %d' % target.candidate_count(probe))
        approx = [hit[2] for hit in target.neighbours(probe)]
        print(approx)

        # Exact distances of the probe to every stored vector
        print('\nReal neighbour distances:')
        exact = CosineDistance().distance_matrix(stored, probe.reshape((1, DIM)))
        exact = sorted(exact.reshape((-1, )))
        print(exact[:10])

    ##########################################################

    print('Performing indexing with HashPermutations...')
    started = time.time()

    # Permutations meta-hash wrapping one binary projection child hash
    permutations = HashPermutations('permut')
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}
    permutations.add_child_hash(rbp_perm, rbp_conf)

    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())
    matrix = index_random_vectors(engine_perm)

    # The permuted index has to be built once all vectors are stored
    permutations.build_permuted_index()
    report_indexing_time(started)

    query_and_compare(engine_perm, matrix,
                      '\nNeighbour distances with HashPermutations:')

    ##########################################################

    print('\nPerforming indexing with HashPermutationMapper...')
    started = time.time()

    # Permutation mapper meta-hash wrapping one binary projection child hash
    permutations2 = HashPermutationMapper('permut2')
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)
    permutations2.add_child_hash(rbp_perm2)

    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())
    matrix = index_random_vectors(engine_perm2)
    report_indexing_time(started)

    query_and_compare(engine_perm2, matrix,
                      '\nNeighbour distances with HashPermutationMapper:')

    ##########################################################

    print('\nPerforming indexing with mutliple binary hashes...')
    started = time.time()

    # Twenty independent binary projection hashes in one engine
    hashes = [RandomBinaryProjections('rbp_%d' % k, 10) for k in range(20)]

    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())
    matrix = index_random_vectors(engine_rbps)
    report_indexing_time(started)

    query_and_compare(engine_rbps, matrix,
                      '\nNeighbour distances with mutliple binary hashes:')
def worker(A, start):
    """Count queries that find a candidate inside their matched cluster.

    For each of the ``kkkk`` queries of this worker's slice, the global
    ``engine`` is asked for neighbours, the first neighbour is matched
    against ``CC`` to pick an entry of ``clusresult``, and a throw-away
    engine built from that entry is probed. The number of probes with a
    non-zero candidate count is appended to *A*.

    Args:
        A: list-like result collector; receives one integer.
        start: index of this worker's slice of ``newcomparearrt``.
    """
    hits = 0
    for j in xrange(kkkk):
        nearest = engine.neighbours(newcomparearrt[j + start * kkkk])

        # Index into CC of the first entry "equal" to the nearest vector;
        # stays 0 when there is no neighbour or no match.
        # NOTE(review): tuple.__eq__ yields NotImplemented (truthy) for
        # non-tuple operands - confirm what neighbours() stores here.
        cluster_idx = 0
        if nearest:
            for f in xrange(len(CC)):
                if tuple(CC[f]).__eq__(nearest[0][0]):
                    cluster_idx = f
                    break

        members = clusresult[cluster_idx]

        # Second-level search restricted to the chosen cluster (nearpy)
        local_hash = RandomBinaryProjections('rbp2', 10)
        local_dim = 3
        local_engine = Engine(local_dim,
                              lshashes=[local_hash],
                              distance=CosineDistance(),
                              vector_filters=[NearestFilter(1)])
        for ff in xrange(len(members)):
            local_engine.store_vector(members[ff], ff)

        # NOTE(review): this probes newcomparearrt[j] while the first-level
        # query used index j + start * kkkk - verify that is intended.
        if local_engine.candidate_count(newcomparearrt[j]):
            hits = hits + 1
        results = local_engine.neighbours(newcomparearrt[j])
        del local_engine

    A.append(hits)
def example1():
    """Index random vectors in four LSH engines and compare one query.

    The engines use RandomBinaryProjectionTree, RandomBinaryProjections,
    HashPermutations and HashPermutationMapper. After indexing, a single
    random query is run against each engine and its neighbour distances
    are printed, followed by the ten smallest exact cosine distances.
    """
    # Dimension of feature space
    DIM = 100
    # Number of data points (kept moderate because the exact search is brute force)
    POINTS = 10000

    print('Creating engines')

    # Engine 1: projection tree hash
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Engine 2: plain binary projections
    rbp = RandomBinaryProjections('rbp1', 20)
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Engine 3: permutations meta-hash with a binary projection child
    permutations = HashPermutations('permut')
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}
    permutations.add_child_hash(rbp_perm, rbp_conf)
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Engine 4: permutation mapper meta-hash with a binary projection child
    permutations2 = HashPermutationMapper('permut2')
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)
    permutations2.add_child_hash(rbp_perm2)
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # Every engine stores the same vector; the matrix keeps a copy for
    # the exact search at the end
    matrix = numpy.zeros((POINTS, DIM))
    storage_order = (engine, engine_rbpt, engine_perm, engine_perm2)
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        for target in storage_order:
            target.store_vector(v)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # The permuted index has to be built once all vectors are stored
    permutations.build_permuted_index()

    print('Generate random data')

    # Random query vector shared by all engines
    query = numpy.random.randn(DIM)

    query_plan = (
        ('\nNeighbour distances with RandomBinaryProjectionTree:', engine_rbpt),
        ('\nNeighbour distances with RandomBinaryProjections:', engine),
        ('\nNeighbour distances with HashPermutations:', engine_perm),
        ('\nNeighbour distances with HashPermutations2:', engine_perm2),
    )
    for title, target in query_plan:
        print(title)
        print(' -> Candidate count is %d' % target.candidate_count(query))
        approx = [hit[2] for hit in target.neighbours(query)]
        print(approx)

    # Exact distances of the query to every stored vector
    print('\nReal neighbour distances:')
    exact = CosineDistance().distance_matrix(matrix, query.reshape((1, DIM)))
    exact = sorted(exact.reshape((-1,)))
    print(exact[:10])
class TestRandomBinaryProjectionTree(unittest.TestCase):
    """Tests for RandomBinaryProjectionTree retrieval and config storage."""

    def setUp(self):
        # One storage of each kind; the Redis client is pointed at
        # localhost:6379, database 0.
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def _new_engine(self, tree):
        """Create self.engine for a 100 dimensional feature space."""
        # Do not forget to set nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[tree],
                             vector_filters=[NearestFilter(20)])

    def _index_random_vectors(self, how_many):
        """Store `how_many` random 100-dimensional vectors in self.engine."""
        for _ in range(how_many):
            vector = numpy.random.randn(100)
            self.engine.store_vector(vector, 'data')

    def _assert_config_round_trip(self, storage):
        """Store a hash configuration in `storage`, load it, compare both."""
        # Constructed with (10, 20)
        original = RandomBinaryProjectionTree('testHash', 10, 20)
        self._new_engine(original)

        # First insert 2000 random vectors
        self._index_random_vectors(2000)

        storage.store_hash_configuration(original)

        restored = RandomBinaryProjectionTree(None, None, None)
        restored.apply_config(storage.load_hash_configuration('testHash'))

        for attribute in ('dim', 'hash_name', 'projection_count'):
            self.assertEqual(getattr(original, attribute),
                             getattr(restored, attribute))

        # The projection normals must match element by element
        shape = original.normals.shape
        for row in range(shape[0]):
            for col in range(shape[1]):
                self.assertEqual(original.normals[row, col],
                                 restored.normals[row, col])

        # Random queries must hash to the same keys with both instances
        for _ in range(10):
            probe = numpy.random.randn(100)
            expected_keys = original.hash_vector(probe, querying=True)
            actual_keys = restored.hash_vector(probe, querying=True)

            self.assertEqual(len(expected_keys), len(actual_keys))
            for expected, actual in zip(expected_keys, actual_keys):
                self.assertEqual(expected, actual)

    def test_retrieval(self):
        # Constructed with (12, 20)
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)
        self._new_engine(rbpt)

        # First insert 200000 random vectors
        print('Indexing...')
        self._index_random_vectors(200000)

        # Now do random queries and check result set size
        print('Querying...')
        for _ in range(10):
            probe = numpy.random.randn(100)
            found = self.engine.neighbours(probe)
            print("Candidate count = %d" % self.engine.candidate_count(probe))
            print("Result size = %d" % len(found))
            self.assertEqual(len(found), 20)

    def test_storage_memory(self):
        self._assert_config_round_trip(self.memory)

    def test_storage_redis(self):
        self._assert_config_round_trip(self.redis_storage)
class LSHSearch:
    """Nearest-neighbour search over named feature vectors.

    The feature file is read with the csv module using a single space as
    delimiter.  Every row must have exactly two fields: a name and a
    comma-separated list of floats.  The "person" a name belongs to is the
    name with its last '_'-separated token removed.
    """

    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        """Read the training vectors and set up the search engine.

        Args:
            feature_file: path of the feature file described on the class.
            dimension: passed to Engine as the vector dimension.
            neighbour: passed to NearestFilter; also the maximum number of
                entries `query` returns.
            lsh_project_num: second argument of PCABinaryProjections.

        Raises:
            ValueError: if a row does not have exactly two fields or a
                feature value is not a float.
        """
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        # Keyed by name, so a later row replaces an earlier one.
        training = {}
        for name, feature in self._read_rows():
            training[name] = feature
        matrix = [np.array(self._parse_feature(feature))
                  for feature in training.values()]
        random.shuffle(matrix)

        # Same output as the Python 2 statement `print 'PCA matric : ', n`.
        print('%s %s' % ('PCA matric : ', len(matrix)))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])

    def _read_rows(self):
        """Yield (name, feature_string) for every row of the feature file."""
        # Binary mode is what the Python 2 csv module expects.
        with open(self.feature_file, 'rb') as f:
            for name, feature in csv.reader(f, delimiter=' '):
                yield name, feature

    @staticmethod
    def _parse_feature(feature):
        """Turn a comma-separated feature string into a list of floats."""
        return [float(token) for token in feature.split(',')]

    @staticmethod
    def _person_of(name):
        """Return `name` without its last '_'-separated token."""
        return '_'.join(name.split('_')[:-1])

    def build(self):
        """Load the feature file and store every vector in the engine.

        Fills `face_feature` (name -> feature string) and increments
        `ground_truth` (person -> count) once per row, then stores each
        vector with its name as the attached data.
        """
        for name, feature in self._read_rows():
            self.face_feature[name] = feature
            self.ground_truth[self._person_of(name)] += 1

        for name, feature in self.face_feature.items():
            self.engine.store_vector(self._parse_feature(feature), name)

    def query(self, person_list):
        """Return the closest stored entries for the given names.

        Args:
            person_list: names whose vectors are used as queries.  The
                names themselves are excluded from the result.

        Returns:
            At most `self.neighbour` tuples (name, score, count), sorted by
            ascending score and with each name appearing once.  `count` is
            the `ground_truth` value of the name's person.

        Raises:
            ValueError: if a name has no stored feature string.
        """
        names = []
        scores = []
        for person in person_list:
            # .get() instead of indexing: indexing the defaultdict would
            # insert an empty entry for an unknown name and then fail with
            # an unhelpful float conversion error.
            feature = self.face_feature.get(person)
            if not feature:
                raise ValueError(
                    'no feature vector stored for %r' % (person,))
            vector = self._parse_feature(feature)

            print('\nNeighbour distances with mutliple binary hashes:')
            print(' -> Candidate count is %d'
                  % self.engine.candidate_count(vector))
            results = self.engine.neighbours(vector)
            # NOTE(review): assumes result tuples are (vector, data,
            # distance), with data being the name given to store_vector.
            names.extend(hit[1] for hit in results)
            scores.extend(hit[2] for hit in results)

        # .get() keeps this lookup from adding keys to ground_truth.
        counts = [self.ground_truth.get(self._person_of(name), 0)
                  for name in names]
        ranked = sorted(zip(names, scores, counts), key=lambda t: t[1])
        return self.f7(ranked, person_list)[:self.neighbour]

    def true_num(self, person):
        """Return the number of rows counted for `person`, 0 if unknown."""
        # .get() so that asking about an unknown person does not add a key.
        return self.ground_truth.get(person, 0)

    def f7(self, zip_seq, person_list):
        """Drop repeated names and the queried names, keeping order.

        Args:
            zip_seq: iterable of tuples whose first element is a name.
            person_list: names that must not appear in the result.

        Returns:
            List with the first tuple seen for every name that is not in
            `person_list`.
        """
        seen = set()
        unique = []
        for entry in zip_seq:
            name = entry[0]
            if name in seen:
                continue
            seen.add(name)
            if name not in person_list:
                unique.append(entry)
        return unique
class TestRandomBinaryProjectionTree(unittest.TestCase):
    """Tests for RandomBinaryProjectionTree retrieval and config storage."""

    def setUp(self):
        # One storage of each kind; the Redis client is pointed at
        # localhost:6379, database 0.
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def _new_engine(self, tree):
        """Create self.engine for a 100 dimensional feature space."""
        # Do not forget to set nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[tree],
                             vector_filters=[NearestFilter(20)])

    def _index_random_vectors(self, how_many):
        """Store `how_many` random 100-dimensional vectors in self.engine."""
        for _ in range(how_many):
            vector = numpy.random.randn(100)
            self.engine.store_vector(vector, 'data')

    def _assert_config_round_trip(self, storage):
        """Store a hash configuration in `storage`, load it, compare both."""
        # Constructed with (10, 20)
        original = RandomBinaryProjectionTree('testHash', 10, 20)
        self._new_engine(original)

        # First insert 2000 random vectors
        self._index_random_vectors(2000)

        storage.store_hash_configuration(original)

        restored = RandomBinaryProjectionTree(None, None, None)
        restored.apply_config(storage.load_hash_configuration('testHash'))

        for attribute in ('dim', 'hash_name', 'projection_count'):
            self.assertEqual(getattr(original, attribute),
                             getattr(restored, attribute))

        # The projection normals must match element by element
        shape = original.normals.shape
        for row in range(shape[0]):
            for col in range(shape[1]):
                self.assertEqual(original.normals[row, col],
                                 restored.normals[row, col])

        # Random queries must hash to the same keys with both instances
        for _ in range(10):
            probe = numpy.random.randn(100)
            expected_keys = original.hash_vector(probe, querying=True)
            actual_keys = restored.hash_vector(probe, querying=True)

            self.assertEqual(len(expected_keys), len(actual_keys))
            for expected, actual in zip(expected_keys, actual_keys):
                self.assertEqual(expected, actual)

    def test_retrieval(self):
        # Constructed with (12, 20)
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)
        self._new_engine(rbpt)

        # First insert 200000 random vectors
        print('Indexing...')
        self._index_random_vectors(200000)

        # Now do random queries and check result set size
        print('Querying...')
        for _ in range(10):
            probe = numpy.random.randn(100)
            found = self.engine.neighbours(probe)
            print("Candidate count = %d" % self.engine.candidate_count(probe))
            print("Result size = %d" % len(found))
            self.assertEqual(len(found), 20)

    def test_storage_memory(self):
        self._assert_config_round_trip(self.memory)

    def test_storage_redis(self):
        self._assert_config_round_trip(self.redis_storage)