class TestRandomBinaryProjections(unittest.TestCase):
    """Tests for RandomBinaryProjections on dense and sparse input vectors."""

    def setUp(self):
        # A 10-bit binary hash over 100-dimensional vectors.
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def test_hash_format(self):
        keys = self.rbp.hash_vector(numpy.random.randn(100))
        # Exactly one bucket key: a 10-character string of '0'/'1'.
        self.assertEqual(len(keys), 1)
        self.assertEqual(type(keys[0]), type(''))
        self.assertEqual(len(keys[0]), 10)
        for bit in keys[0]:
            self.assertTrue(bit == '1' or bit == '0')

    def test_hash_deterministic(self):
        vec = numpy.random.randn(100)
        reference = self.rbp.hash_vector(vec)[0]
        # Re-hashing the same vector must always yield the same key.
        for _ in range(100):
            self.assertEqual(reference, self.rbp.hash_vector(vec)[0])

    def test_hash_format_sparse(self):
        keys = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(keys), 1)
        self.assertEqual(type(keys[0]), type(''))
        self.assertEqual(len(keys[0]), 10)
        for bit in keys[0]:
            self.assertTrue(bit == '1' or bit == '0')

    def test_hash_deterministic_sparse(self):
        vec = scipy.sparse.rand(100, 1, density=0.1)
        reference = self.rbp.hash_vector(vec)[0]
        for _ in range(100):
            self.assertEqual(reference, self.rbp.hash_vector(vec)[0])
class TestRandomBinaryProjections(unittest.TestCase):
    """Exercises the RandomBinaryProjections LSH with dense and sparse data."""

    def setUp(self):
        # 10 projections over a 100-dimensional space.
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def _assert_valid_bucket_keys(self, keys):
        # Expect a single 10-character binary string key.
        self.assertEqual(len(keys), 1)
        self.assertEqual(type(keys[0]), type(''))
        self.assertEqual(len(keys[0]), 10)
        for ch in keys[0]:
            self.assertTrue(ch == '1' or ch == '0')

    def test_hash_format(self):
        self._assert_valid_bucket_keys(
            self.rbp.hash_vector(numpy.random.randn(100)))

    def test_hash_deterministic(self):
        vec = numpy.random.randn(100)
        expected = self.rbp.hash_vector(vec)[0]
        for _ in range(100):
            self.assertEqual(expected, self.rbp.hash_vector(vec)[0])

    def test_hash_format_sparse(self):
        self._assert_valid_bucket_keys(
            self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1)))

    def test_hash_deterministic_sparse(self):
        vec = scipy.sparse.rand(100, 1, density=0.1)
        expected = self.rbp.hash_vector(vec)[0]
        for _ in range(100):
            self.assertEqual(expected, self.rbp.hash_vector(vec)[0])
def build_environment(config):
    """Assemble the LSH sum-beam search environment from `config`.

    Loads word2vec embeddings, builds a frequency-filtered word list,
    embeds it as a matrix, and indexes it in a permute engine.

    Returns:
        (lsh, engine, matrix, wordlist) tuple.
    """
    searcher = LSH_sumbeam()
    embeddings = MyWord2Vec()
    embeddings.load(config)
    searcher.w2v = embeddings

    # Combine top 20k noun and 20k adj into a single wordlist.
    top_n = config.getint('space', 'topn')
    vocabulary = WordList()
    vocabulary.words = embeddings.model.vocab.keys()
    vocabulary.filter_frequency(embeddings, top_n)
    vocabulary.build_index()

    # Embed the word list into a dense matrix.
    matrix = searcher._list2matrix_w2v(vocabulary, searcher.w2v)

    # Index the matrix in a permute engine over a 15-bit binary hash.
    hasher = RandomBinaryProjections('rbp', 15)
    hasher.reset(np.shape(matrix)[1])
    engine = searcher._build_rbp_permute_engine(matrix, hasher)
    # 50 permutations, beam size 50, 100 neighbours per query.
    engine.build_permute_index(50, 50, 100)

    return searcher, engine, matrix, vocabulary
def index_user_vectors(): print 'Performing indexing with HashPermutations...' global engine_perm t0 = time.time() print k_dimen, d_dimen rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen) rbp_perm.reset(k_dimen) # Create permutations meta-hash permutations = HashPermutations('permut') rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':250} # Add rbp as child hash of permutations hash permutations.add_child_hash(rbp_perm, rbp_conf) # Create engine engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance()) for u in user_vector: engine_perm.store_vector(user_vector[u], data=u) # Then update permuted index permutations.build_permuted_index() t1 = time.time() print 'Indexing took %f seconds', (t1-t0)
def index_user_vectors():
    """Populate the global `engine_perm` with every vector in user_vector.

    A RandomBinaryProjections hash (d_dimen bits over k_dimen dimensions)
    is wrapped in a HashPermutations meta-hash; after storing all vectors
    the permuted index is built for approximate neighbour lookups.
    """
    global engine_perm
    start = time.time()
    # Base hash: d_dimen random binary projections over k_dimen dimensions.
    projections = RandomBinaryProjections('rbp_perm', d_dimen)
    projections.reset(k_dimen)
    # Meta-hash that permutes the binary keys for neighbour search.
    meta_hash = HashPermutations('permut')
    child_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}
    meta_hash.add_child_hash(projections, child_conf)
    # Engine over cosine distance, using only the permutation meta-hash.
    engine_perm = Engine(k_dimen, lshashes=[meta_hash],
                         distance=CosineDistance())
    for user_id in user_vector:
        engine_perm.store_vector(user_vector[user_id], data=user_id)
    # The permuted index must be (re)built after all vectors are stored.
    meta_hash.build_permuted_index()
    end = time.time()
def build_index_sumbeam(self, num_bits):
    """Index matrix1 and matrix2 into two permute engines sharing one hash.

    Both engines use the same RandomBinaryProjections instance so their
    bucket keys are directly comparable.

    :param num_bits: number of projection bits for the binary hash.
    """
    # Hash the original vectors in matrix1 and matrix2 into engine1/engine2.
    self.dim = np.shape(self.matrix1)[1]
    hasher = RandomBinaryProjections('rbp', num_bits)
    hasher.reset(self.dim)
    self.rbp = hasher
    self.engine1 = self._build_rbp_permute_engine(self.matrix1, hasher)
    self.engine2 = self._build_rbp_permute_engine(self.matrix2, hasher)
def generate_lsh_fn(self):
    """Build self.lsh_fn: one hash function per LSH table (self.L tables).

    Each table gets its own RandomBinaryProjections hash with self.K bits
    over self.dim-dimensional input.
    """
    self.lsh_fn = []
    for i in range(self.L):
        rbp = RandomBinaryProjections('rbp', self.K)
        rbp.reset(self.dim)
        # Original (disabled) implementation: murmur-hash the binary bucket
        # key into one of self.R buckets.
        # def fn(x):
        #     mm = mmh3.hash(rbp.hash_vector(x)[0])
        #     return mm % self.R
        # NOTE(review): the fn below is a constant stub -- every vector
        # hashes to 1 and `rbp` is created but never used, so all items
        # land in one bucket. Confirm this placeholder is intentional.
        def fn(x):
            return 1
        self.lsh_fn.append(fn)
def __init__(self, *args):
    """
    Initializing dictionary with reduced vectors which represents conferences's members.

    :param args[0] - data: Data class, it represents data from dblp.xml
    :param args[1] - dim: Output dimension for LSH.
    """
    print('Initialization recommender...')
    self.data = args[0]
    # Maps conferences to their LSH-reduced member vectors (filled later).
    self.reduced_conferences = {}
    # Binary projection hash: args[1] bits over a space whose dimensionality
    # equals the number of distinct members in the data set.
    rbp = RandomBinaryProjections('rbp', args[1])
    rbp.reset(self.data.members_set.__len__())
    # NOTE(review): `rbp` and `cnt` are unused in the visible portion of
    # this constructor -- the snippet appears truncated here; the loop that
    # uses them presumably follows. Verify against the full source.
    cnt = 0
def test_hash_memory_storage_rbp(self):
    """Round-trip an RBP hash configuration through memory storage."""
    original = RandomBinaryProjections('testRBPHash', 10)
    original.reset(100)
    self.memory.store_hash_configuration(original)
    # Restore into a blank hash instance from the stored configuration.
    restored = RandomBinaryProjections(None, None)
    restored.apply_config(self.memory.load_hash_configuration('testRBPHash'))
    self.assertEqual(original.dim, restored.dim)
    self.assertEqual(original.hash_name, restored.hash_name)
    self.assertEqual(original.projection_count, restored.projection_count)
    # The random projection normals must survive the round trip exactly.
    rows, cols = original.normals.shape
    for r in range(rows):
        for c in range(cols):
            self.assertEqual(original.normals[r, c], restored.normals[r, c])
def test_hash_memory_storage_rbp(self):
    """Store an RBP hash config, reload it, and verify it is identical."""
    saved = RandomBinaryProjections('testRBPHash', 10)
    saved.reset(100)
    self.memory.store_hash_configuration(saved)
    loaded = RandomBinaryProjections(None, None)
    loaded.apply_config(self.memory.load_hash_configuration('testRBPHash'))
    # Scalar configuration fields.
    self.assertEqual(saved.dim, loaded.dim)
    self.assertEqual(saved.hash_name, loaded.hash_name)
    self.assertEqual(saved.projection_count, loaded.projection_count)
    # Element-wise comparison of the projection normal matrix.
    for row in range(saved.normals.shape[0]):
        for col in range(saved.normals.shape[1]):
            self.assertEqual(saved.normals[row, col], loaded.normals[row, col])
class DoubleEngine:
    """Approximate nearest-neighbour search over pairwise SUMS of vectors.

    Two NearPy engines (one per vector set) share the same random normals
    via RandomDiscretizedProjections; a combined index (self.rbdp) maps the
    binary sign pattern of summed discretized bucket keys back to the
    contributing (key1, key2) bucket pairs.
    NOTE(review): Python 2 code (print statements, xrange). Depends on
    module-level np, logging, Engine, MemoryStorage, Permutation, bitarray
    and hamming_distance -- confirm these imports exist in the full file.
    """

    def _build_rdp_engine(self, matrix, rdp, normals):
        """Store every row of `matrix` in an Engine hashed by `rdp`.

        `rdp.vectors` is overwritten with `normals` so both engines project
        onto identical random directions. Each row's payload is its row
        index formatted as a string.
        """
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        rdp.vectors = normals
        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
        return engine

    def process2(self, vectors1, vectors2, num_bit, bin_width):
        """Index both vector sets and build the combined sum-key buckets.

        For every pair of rdp bucket keys (one per engine) the element-wise
        sum of the '_'-separated integer key components is reduced to a
        binary string ('1' where the sum >= 0) and used as a key into
        self.rbdp. NOTE(review): the pairing loop is
        O(len(buckets1) * len(buckets2)).
        """
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp', num_bit, bin_width)
        self.rbp = RandomBinaryProjections('rbp', num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        # Share the rdp normals with the binary hash so both agree.
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1, self.rdp, self.normals)
        self.engine2 = self._build_rdp_engine(vectors2, self.rdp, self.normals)
        # create new key: combine each bucket of engine1 with each of engine2
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        self.rbdp = {}
        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)
        # Pre-parse the integer components of every bucket key once.
        keys_int1 = []
        keys_int2 = []
        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)
        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)
        # NOTE(review): correctness relies on the dict iteration order here
        # matching the parsing loops above -- true within a single process.
        for idx1, key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1, idx1, len(buckets1)))
            for idx2, key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                # Sign pattern of the summed keys becomes the combined key.
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if not new_key in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1, key2))

    def build_permute_index(self, num_permutation, beam_size, hamming_beam_size):
        """Build sorted permuted key lists for Hamming-neighbour lookups.

        Creates `num_permutation` random bit permutations of the combined
        keys in self.rbdp; each sorted permuted list supports beam search
        for near keys in get_neighbour_keys().
        """
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count
        # add permutations
        self.permutations = []
        for i in xrange(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)
        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)
        # build permutation lists
        self.permuted_lists = []
        i = 0
        for p in self.permutations:
            logging.info('Creating Permutation Index: #{}/{}'.format(i, len(self.permutations)))
            i += 1
            permuted_list = []
            for ba in original_keys:
                # Keep (permuted, original) pairs so the original key can be
                # recovered after the sorted search.
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c, ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)

    def get_neighbour_keys(self, bucket_key, k):
        """Return up to `k` combined keys nearest to `bucket_key` in Hamming distance.

        Searches every permutation's sorted list with a beam, unions the
        candidates, then ranks them by Hamming distance to the query.
        """
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100 Still really fast
        query_key = bitarray(bucket_key)
        topk = set()
        for i in xrange(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist, query_key, self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key=lambda x: hamming_distance(x, query_key))
        # Return the keys as '0'/'1' strings, matching self.rbdp's keys.
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self, key1, key2, v):
        """Score all (a, b) pairs from buckets key1/key2 against query `v`.

        Computes cos((a + b), v) for every a in engine1's bucket and b in
        engine2's bucket using vectorized numpy operations.
        Returns a list of (cos_dist, (idx1, idx2)) tuples.
        """
        #return [(cos_dist,(idx1,idx2))]
        def matrix_list(engine, key):
            # return a matrix and a list of keys
            # NOTE(review): the loop variables shadow this helper's `key`
            # parameter (and the outer `v`) -- harmless here but fragile.
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for v, key in items:
                m.append(v)
                l.append(int(key))
            m = np.array(m)
            return m, l
        m1, l1 = matrix_list(self.engine1, key1)
        m2, l2 = matrix_list(self.engine2, key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v
        av = np.dot(m1, v)
        av = np.repeat(av, len2).reshape(len1, len2)
        # b . v
        bv = np.dot(m2, v)
        bv = np.repeat(bv, len1).reshape(len2, len1).T
        # nominator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v, 2)
        # a.a
        aa = np.sum(m1 * m1, axis=1)
        aa = np.repeat(aa, len2).reshape(len1, len2)
        # b.b
        bb = np.sum(m2 * m2, axis=1)
        bb = np.repeat(bb, len1).reshape(len2, len1).T
        # a.b
        ab = np.dot(m1, m2.T)
        # denominator: |a + b| * |v|, since |a+b|^2 = a.a + b.b + 2 a.b
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # distance matrix
        dism = nomi / deno
        dist = []
        for i in xrange(len1):
            for j in xrange(len2):
                dis = dism[i, j]
                dist.append((dis, (l1[i], l2[j])))
        return dist

    def neighbours2(self, v, n):
        """Return the top-n (cosine, (idx1, idx2)) sum-pairs nearest to `v`.

        Hashes `v`, expands to Hamming-neighbour combined keys, scores all
        bucket pairs under those keys, and returns the n best by descending
        cosine similarity.
        """
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        # NOTE(review): `candidates` is never used, and the trailing
        # `return dists` after the first return is unreachable dead code.
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)
        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key, self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print bucket_key, len(comb)
            for key1, key2 in comb:
                dist = self.n2(key1, key2, v)
                dists.extend(dist)
        dists = sorted(dists, key=lambda x: -x[0])
        return dists[:n]
        # If there is no vector filter, just return list of candidates
        return dists
train_set = SingleLabelTextDataset('dataset/{}'.format(dataset), subset='train', bow_format=data_fmt, download=True) test_set = SingleLabelTextDataset('dataset/{}'.format(dataset), subset='test', bow_format=data_fmt, download=True) else: train_set = MultiLabelTextDataset('dataset/{}'.format(dataset), subset='train', bow_format=data_fmt, download=True) test_set = MultiLabelTextDataset('dataset/{}'.format(dataset), subset='test', bow_format=data_fmt, download=True) ######################################################################################################### def LSH_hash(bow): return [int(b) for b in lshash.hash_vector(bow.toarray().squeeze())[0]] with torch.no_grad(): prec_results = [] for nbits in [8, 16, 32, 64, 128]: lshash = RandomBinaryProjections('MyHash', nbits) lshash.reset(train_set.num_features()) # get hash code train_hash = train_set.df.bow.apply(LSH_hash) test_hash = test_set.df.bow.apply(LSH_hash) # convert hash to Tensor train_b = torch.Tensor(list(train_hash)).type(torch.cuda.ByteTensor) test_b = torch.Tensor(list(test_hash)).type(torch.cuda.ByteTensor) if single_label: train_y = torch.Tensor(list(train_set.df.label)) test_y = torch.Tensor(list(test_set.df.label)) else: train_y = torch.from_numpy(sparse.vstack(list(train_set.df.label)).toarray()) test_y = torch.from_numpy(sparse.vstack(list(test_set.df.label)).toarray())
import sys
import pandas as pd
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections


def update(d, val):
    """Increment the tally for `val` in dict `d`.

    NOTE(review): the first sighting stores 0 rather than 1, so every tally
    is one lower than a true occurrence count -- confirm this off-by-one is
    intentional before relying on the counts.
    """
    if val in d:
        d[val] += 1
    else:
        d[val] = 0


# 5-bit binary hash over scalar (1-dimensional) inputs.
rbp = RandomBinaryProjections('rbp', 5)
rbp.reset(1)
#engine = Engine(10, lshashes=[rbp])

# Input file path is the first CLI argument; one record per line.
file = open(sys.argv[1]).readlines()
df = pd.DataFrame()
bigdict = dict()
i = 0
# For each line, tally how often each hash bucket key occurs among the
# second components of its comma-separated triples (first token skipped).
# NOTE(review): df, bigdict and i are unused in this visible chunk -- the
# script may continue beyond this excerpt.
for line in file:
    data = dict()
    triples = line.split(" ")[1:]
    for t in triples:
        update(data, rbp.hash_vector([float(t.split(",")[1])])[0])
    # for p in range(0, 200-len(data.values())):
    #     data[p]=0