def test_random_discretized_projections(self):
    """Precision should improve when the hash uses more projections.

    Runs the same recall/precision experiment twice on random data: once
    with a 1-projection hash, once with a 2-projection hash, and asserts
    the second configuration is more precise.
    """
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)
    # First get recall and precision for one 1-dim random hash
    rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])
    # result rows are (recall, precision, search_time) per engine
    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]
    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
        (recall1, precision1, searchtime1))
    # Then get recall and precision for a 2-projection random hash
    # (comment used to say "4-dim", but only 2 projections are created here)
    rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    result = exp.perform_experiment([engine])
    recall2 = result[0][0]
    precision2 = result[0][1]
    searchtime2 = result[0][2]
    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
        (recall2, precision2, searchtime2))
    # Many things are random here, but the precision should increase
    # with the number of projections
    self.assertTrue(precision2 > precision1)
def __init__(self, level):
    """Set up a feature builder that hashes ancestor paths of depth *level*.

    :param level: maximum ancestor-path depth; also the hash input dimension.
    """
    self.parentLevel = int(level)
    print(self.parentLevel)  # debug trace of the configured depth
    self.feature_vector = {}
    self.hashes = {}
    # Fixed seed keeps bucket assignment reproducible across runs.
    self.rdp = RandomDiscretizedProjections('rdp', 5, 6, rand_seed=98412194)
    self.rdp.reset(self.parentLevel)
def __init__(self, stage, bucket):
    """Initialize the LSH feature aggregator.

    :param stage: number of projections for the discretized hash.
    :param bucket: bin width for the discretized hash.
    """
    # Expected input-vector dimension; previously duplicated as a bare 5
    # in the reset() call below — use the attribute so the two can't drift.
    self.parentLevel = 5
    # Fixed seed keeps bucket assignment reproducible across runs.
    self.rdp = RandomDiscretizedProjections('rdp', stage, bucket, rand_seed=98412194)
    self.rdp.reset(self.parentLevel)
    self.hash_dict = {}            # original key -> LSH bucket key
    self.data = defaultdict(list)  # bucket key -> accumulated values
def createLSH(dimensions):
    """Build a nearpy Engine over *dimensions*-dim vectors.

    Uses four independent 50-projection discretized hashes (bin width 10)
    and keeps the 5 nearest neighbours per query.
    """
    bin_width = 10
    projections = 50
    # Four independently-seeded hash tables, named as in the original setup.
    tables = [RandomDiscretizedProjections(label, projections, bin_width)
              for label in ('rbp', 'rbp2', 'rbp3', 'rbp4')]
    return Engine(dimensions, lshashes=tables,
                  vector_filters=[NearestFilter(5)])
class featureLsh():
    """Aggregates per-program feature dicts under LSH-bucketed keys.

    Column names (underscore-separated integer paths such as "1_2_3") are
    mapped once to LSH bucket keys by set_hash(); update_dict() then merges
    values that land in the same bucket by summation and appends the merged
    row into self.data.
    """

    def __init__(self, stage, bucket):
        """:param stage: projection count; :param bucket: bin width."""
        # Expected input-vector dimension (was a duplicated literal 5).
        self.parentLevel = 5
        # Fixed seed keeps bucket assignment reproducible across runs.
        self.rdp = RandomDiscretizedProjections('rdp', stage, bucket, rand_seed=98412194)
        self.rdp.reset(self.parentLevel)
        self.hash_dict = {}            # original key -> bucket key
        self.data = defaultdict(list)  # bucket key -> list of merged values

    def get_hash(self, vector):
        """Return the (single) LSH bucket key for *vector*."""
        return self.rdp.hash_vector(vector)[0]

    def set_hash(self, header):
        """Precompute bucket keys for every column name in *header*.

        The special "program" column maps to itself.
        """
        self.hash_dict["program"] = "program"
        for column in header:
            vec = [int(part) for part in column.split("_")]
            self.hash_dict[column] = self.get_hash(vec)
        print("Setting hash done. Running lsh...")

    def update_dict(self, dicts):
        """Merge each dict in *dicts* by bucket key and append into self.data.

        Values sharing a bucket are summed. NaN counts as 0 the first time a
        bucket is seen and is skipped on later additions (original behavior).
        String values are coerced with float().
        """
        print("updating_dict")
        for row in dicts:  # renamed: previously shadowed the builtin `dict`
            merged = {}
            for key, value in row.items():
                bucket = self.hash_dict[key]
                if bucket == "program":
                    merged[bucket] = value
                    continue
                # Normalize to a float, or None when the value is NaN.
                if isinstance(value, str):
                    numeric = float(value)
                elif isnan(value):
                    numeric = None
                else:
                    numeric = float(value)
                if bucket not in merged:
                    merged[bucket] = 0 if numeric is None else numeric
                elif numeric is not None:
                    merged[bucket] += numeric
            for bucket, value in merged.items():
                self.data[bucket].append(value)
class TestRandomDiscretizedProjections(unittest.TestCase):
    """Hash-key format and determinism tests for RandomDiscretizedProjections."""

    def setUp(self):
        # 10 projections, bin width 0.1, over a 100-dimensional space.
        self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
        self.rbp.reset(100)

    def test_hash_format(self):
        """A dense vector hashes to exactly one string bucket key."""
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        # assertIsInstance replaces the old type(h[0]) == type('') comparison.
        self.assertIsInstance(h[0], str)

    def test_hash_deterministic(self):
        """Re-hashing the same dense vector always yields the same key."""
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for _ in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        """A sparse vector hashes to exactly one string bucket key."""
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertIsInstance(h[0], str)

    def test_hash_deterministic_sparse(self):
        """Re-hashing the same sparse vector always yields the same key."""
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for _ in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
class TestRandomDiscretizedProjections(unittest.TestCase):
    """Checks bucket-key format and determinism for dense and sparse input."""

    def setUp(self):
        # Hash under test: 10 projections, bin width 0.1, dimension 100.
        self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
        self.rbp.reset(100)

    def test_hash_format(self):
        keys = self.rbp.hash_vector(numpy.random.randn(100))
        # Exactly one bucket key, and it is a plain string.
        self.assertEqual(1, len(keys))
        self.assertEqual(type(''), type(keys[0]))

    def test_hash_deterministic(self):
        vec = numpy.random.randn(100)
        expected = self.rbp.hash_vector(vec)[0]
        for _ in range(100):
            self.assertEqual(expected, self.rbp.hash_vector(vec)[0])

    def test_hash_format_sparse(self):
        keys = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(1, len(keys))
        self.assertEqual(type(''), type(keys[0]))

    def test_hash_deterministic_sparse(self):
        vec = scipy.sparse.rand(100, 1, density=0.1)
        expected = self.rbp.hash_vector(vec)[0]
        for _ in range(100):
            self.assertEqual(expected, self.rbp.hash_vector(vec)[0])
def __init__(
    self,
    data: np.ndarray = None,
    labels: np.ndarray = None,
    k: int = 100,
    projections: int = 3,
    bin_width: int = 10,
    tables: int = 3,
    verbose: bool = True,
    dummy: bool = False,
):
    """Build an LSH index over *data* (skipped entirely when dummy=True).

    :param data: vectors to index.
    :param labels: one label per vector.
    :param k: neighbours returned per query.
    :param projections: projections per hash table.
    :param bin_width: discretization bin width.
    :param tables: number of independent hash tables.
    :param verbose: forwarded to the engine.
    :param dummy: when True, record the parameters but build nothing.
    """
    self.k = k
    self.projections = projections
    self.tables = tables
    if dummy:
        return
    # NOTE(review): this only rejects when BOTH arguments are missing; a
    # call with data but no labels slips through — confirm that is intended.
    if data is None and labels is None:
        raise Exception('data and labels must be numpy.ndarray when not using dummy indexer')
    t0 = time.time()
    hashes = [
        RandomDiscretizedProjections(f'rbp_{i}', projections, bin_width=bin_width)
        for i in range(tables)
    ]
    self.engine = HashEngine(
        vectors=data,
        labels=labels,
        lshashes=hashes,
        k=k,
        verbose=verbose,
    )
    self.build_time = time.time() - t0  # seconds spent building the index
def __init__(self, hasher, number_of_tables=8, length_of_tables=32, bin_width=1.0, match_thresh=0.2):
    """Create an LSH index backed by discretized random projections.

    :param hasher: @type hasher: Hasher
    :param number_of_tables: independent hash tables to build.
    :param length_of_tables: projections per table.
    :param bin_width: discretization bin width.
    :param match_thresh: match threshold forwarded to LSHIndex.
    """
    LSHIndex.__init__(self, hasher, match_thresh=match_thresh)
    self.setName(number_of_tables=number_of_tables, length_of_tables=length_of_tables,
                 match_thresh=match_thresh, bin_width=bin_width)
    # One hash per table, named by its table index.
    self.tables = [RandomDiscretizedProjections(str(i), length_of_tables, bin_width)
                   for i in range(number_of_tables)]
    self.engine = Engine(self.hasher.dims(), lshashes=self.tables,
                         fetch_vector_filters=[NoVectorFilter()])
def test_hash_memory_storage_rdp(self):
    """Round-trip an RDP hash configuration through memory storage."""
    original = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
    original.reset(100)
    self.memory.store_hash_configuration(original)

    # Rebuild an empty hash and load the stored configuration into it.
    restored = RandomDiscretizedProjections(None, None, None)
    restored.apply_config(self.memory.load_hash_configuration('testRDPHash'))

    self.assertEqual(original.dim, restored.dim)
    self.assertEqual(original.hash_name, restored.hash_name)
    self.assertEqual(original.bin_width, restored.bin_width)
    self.assertEqual(original.projection_count, restored.projection_count)
    # The projection matrices must match element-wise.
    rows, cols = original.normals.shape
    for i in range(rows):
        for j in range(cols):
            self.assertEqual(original.normals[i, j], restored.normals[i, j])
def build_content_sim_relation_text(network, signatures):
    """Index signature text under LSH and add CONTENT_SIM edges to *network*.

    :param network: graph to receive the similarity edges.
    :param signatures: re-iterable of (nid, token-sequence) pairs.
    """
    def iter_nids(sigs):
        # Yield only the node ids, preserving order.
        for nid, _ in sigs:
            yield nid

    documents = [' '.join(tokens) for _, tokens in signatures]
    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(documents)
    lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)
    text_engine = index_in_text_engine(iter_nids(signatures), tfidf, lsh_projections)
    create_sim_graph_text(iter_nids(signatures), network, text_engine, tfidf,
                          Relation.CONTENT_SIM)
def process2(self,vectors1,vectors2,num_bit,bin_width):
    """Index both vector sets with one shared RDP hash and pre-combine buckets.

    Builds one engine per input matrix using the SAME projection normals, then
    for every pair of (bucket-from-set-1, bucket-from-set-2) derives a binary
    key from the sign of the summed bucket coordinates and records the pair in
    self.rbdp under that key.  Python 2 code (print statements, xrange).
    """
    # build engine
    self.dimension = np.shape(vectors1)[1]
    self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
    self.rbp = RandomBinaryProjections('rbp',num_bit)
    self.rdp.reset(self.dimension)
    self.rbp.reset(self.dimension)
    # Share the RDP projection normals with the binary hash so both hashes
    # project onto the same random directions.
    self.normals = self.rdp.vectors
    self.rbp.normals = self.normals
    self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
    self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
    # create new key: bucket keys are '_'-joined integer bin coordinates
    buckets1 = self.engine1.storage.buckets['rdp']
    buckets2 = self.engine2.storage.buckets['rdp']
    self.rbdp = {}  # binary key -> list of (bucket1 key, bucket2 key) pairs
    print 'len of buckets1', len(buckets1)
    print 'len of buckets2', len(buckets2)
    # Parse each bucket key once into its integer coordinate vector.
    keys_int1 = []
    keys_int2 = []
    for key in buckets1:
        ks = [int(x) for x in key.split('_')]
        keys_int1.append(ks)
    for key in buckets2:
        ks = [int(x) for x in key.split('_')]
        keys_int2.append(ks)
    # Cross-product over bucket pairs: O(len(buckets1) * len(buckets2)).
    for idx1,key1 in enumerate(buckets1):
        if idx1 % 100 == 0:
            logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
        for idx2,key2 in enumerate(buckets2):
            ks1 = keys_int1[idx1]
            ks2 = keys_int2[idx2]
            # Sign of the coordinate-wise sum becomes one bit per projection.
            new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
            new_key = ''.join(['1' if x>=0 else '0' for x in new_key])
            if not new_key in self.rbdp:
                self.rbdp[new_key] = []
            self.rbdp[new_key].append((key1,key2))
def test_hash_memory_storage_rdp(self):
    """Storing and reloading an RDP hash config must preserve every field."""
    hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
    hash1.reset(100)
    self.memory.store_hash_configuration(hash1)
    # Rebuild an empty hash and load the stored configuration into it.
    hash2 = RandomDiscretizedProjections(None, None, None)
    hash2.apply_config(self.memory.load_hash_configuration('testRDPHash'))
    self.assertEqual(hash1.dim, hash2.dim)
    self.assertEqual(hash1.hash_name, hash2.hash_name)
    self.assertEqual(hash1.bin_width, hash2.bin_width)
    self.assertEqual(hash1.projection_count, hash2.projection_count)
    # The random projection matrices must match element-wise.
    for i in range(hash1.normals.shape[0]):
        for j in range(hash1.normals.shape[1]):
            self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
def setUp(self):
    """Create the hash under test: 10 projections, bin width 0.1, dim 100."""
    name, projection_count, bin_width = 'testHash', 10, 0.1
    self.rbp = RandomDiscretizedProjections(name, projection_count, bin_width)
    self.rbp.reset(100)
import sys import hashlib import antlr4 from antlr4 import ParseTreeWalker, ParserRuleContext from template.Template2Lexer import Template2Lexer from template.Template2Listener import Template2Listener from template.Template2Parser import Template2Parser from nearpy.hashes import RandomBinaryProjections, RandomDiscretizedProjections MAX_PATH_LENGTH = 100 stacks = dict() #rdp = RandomBinaryProjections('rbp', 100, rand_seed=98412194) rdp = RandomDiscretizedProjections('rdp', 10, 1000, rand_seed=98412194) rdp.reset(MAX_PATH_LENGTH) def getHash(vector): # if len(vector) < MAX_PATH_LENGTH: # vector = vector + (MAX_PATH_LENGTH-len(vector))*[0] h = rdp.hash_vector(vector)[0] return h def update(d, entry): if entry in d: d[entry] += 1 else:
def setUp(self):
    """Create the hash under test: 10 projections, bin width 0.1, dim 100."""
    self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
    self.rbp.reset(100)
class FeatureBuilder(Template2Listener):
    """ANTLR listener that accumulates a bag-of-features over a parse tree.

    For parentLevel <= 1 the feature is the literal (parent, node) class-name
    pair; otherwise it is an LSH bucket key computed from the hashed names of
    the node's ancestor path (padded with zeros up to parentLevel).
    """

    def __init__(self, level):
        self.parentLevel = int(level)
        print(self.parentLevel)  # debug trace of the configured depth
        self.feature_vector = {}
        self.hashes = {}
        # Fixed seed keeps bucket assignment reproducible across runs.
        self.rdp = RandomDiscretizedProjections('rdp', 5, 6, rand_seed=98412194)
        self.rdp.reset(self.parentLevel)

    def getHash(self, vector):
        """Hash an ancestor path (zero-padded to parentLevel) to a bucket key."""
        shortfall = self.parentLevel - len(vector)
        if shortfall > 0:
            vector = vector + [0] * shortfall
        return self.rdp.hash_vector(vector)[0]

    def getParents(self, ctx):
        """Collect hashed class names of ctx and up to parentLevel-1 ancestors."""
        path = []
        node = ctx
        for _ in range(self.parentLevel):
            if node is None:
                break
            path.append(fixed_hashes[type(node).__name__])
            node = node.parentCtx
        return path

    def update_vector(self, ctx):
        """Count one occurrence of the feature derived from *ctx*."""
        if self.parentLevel <= 1:
            parent = ctx.parentCtx
            if parent is None:
                return  # root node: no (parent, child) pair to count
            key = 't_' + type(parent).__name__ + '_' + type(ctx).__name__
        else:
            key = self.getHash(self.getParents(ctx))
        self.feature_vector[key] = self.feature_vector.get(key, 0) + 1

    def _count(self, ctx):
        """Shared handler: every tracked grammar rule just records its feature."""
        self.update_vector(ctx)

    # One listener hook per grammar rule; all delegate to the same counter.
    enterAddop = _count
    enterAnd = _count
    enterArray = _count
    enterArray_access = _count
    enterAssign = _count
    enterBlock = _count
    enterBrackets = _count
    enterData = _count
    enterDecl = _count
    enterPrimitive = _count
    enterNumber = _count
    enterDtype = _count
    enterVector = _count
    enterDims = _count
    enterVectorDIMS = _count
    enterLimits = _count
    enterPrior = _count
    enterParam = _count
    enterParams = _count
    enterDistexpr = _count
    enterLoopcomp = _count
    enterFor_loop = _count
    enterIf_stmt = _count
    enterElse_blk = _count
    enterFunction_call = _count
    enterFparam = _count
    enterFparams = _count
    enterReturn_or_param_type = _count
    enterFunction_decl = _count
    enterTransformedparam = _count
    enterTransformeddata = _count
    enterGeneratedquantities = _count
    enterFunctions = _count
    enterVal = _count
    enterDivop = _count
    enterString = _count
    enterExponop = _count
    enterMinusop = _count
    enterLt = _count
    enterUnary = _count
    enterEq = _count
    enterGt = _count
    enterRef = _count
    enterGeq = _count
    enterMulop = _count
    enterFunction = _count
    enterVecmulop = _count
    enterNe = _count
    enterLeq = _count
    enterTranspose = _count
    enterVecdivop = _count
    enterTernary = _count
    enterSubset = _count
    enterObserve = _count
    enterStatement = _count
    enterQuery = _count
    enterTemplate = _count
# We are looking for the N closest neighbours N = 20 nearest = NearestFilter(N) # We will fill this array with all the engines we want to test engines = [] print 'Creating engines...' # We are going to test these bin widths bin_widths = [0.01 * x for x in range(1, 5)] # Create engines for all configurations for bin_width in bin_widths: # Use four random 1-dim discretized projections rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width) rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width) rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width) rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width) #ub1 = UniBucket('uni') # Create engine with this configuration #engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4], # vector_filters=[unique, nearest]) engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4], vector_filters=[nearest]) # Add engine to list of engines to evaluate engines.append(engine)
class DoubleEngine:
    """Pairs two RDP-hashed vector sets and queries the combined buckets.

    process2() indexes both sets with a shared hash and pre-combines their
    buckets into sign-based binary keys; build_permute_index() builds a
    permutation index over those keys for approximate Hamming search;
    neighbours2() scores candidate pairs against a query vector.
    Python 2 code (print statements, xrange).
    """

    def _build_rdp_engine(self,matrix,rdp,normals):
        """Index every row of *matrix* in a fresh Engine using hash *rdp*.

        The row index (as a string) is stored as each vector's key, and the
        hash's projection normals are overwritten with *normals* so multiple
        engines share identical projections.
        """
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create a random binary hash with 10 bits
        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp],storage = MemoryStorage())
        rdp.vectors = normals
        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
        return engine

    def process2(self,vectors1,vectors2,num_bit,bin_width):
        """Index both sets with one shared RDP hash and pre-combine buckets.

        Every (bucket1, bucket2) pair is recorded in self.rbdp under a binary
        key derived from the sign of the summed bucket coordinates.
        """
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        # Share projection normals between the discretized and binary hashes.
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        # create new key: bucket keys are '_'-joined integer bin coordinates
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        self.rbdp = {}  # binary key -> list of (bucket1 key, bucket2 key)
        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)
        # Parse each bucket key once into its integer coordinate vector.
        keys_int1 = []
        keys_int2 = []
        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)
        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)
        # Cross-product over bucket pairs: O(|buckets1| * |buckets2|).
        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                # Sign of the coordinate-wise sum gives one bit per projection.
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x>=0 else '0' for x in new_key])
                if not new_key in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))

    def build_permute_index(self,num_permutation,beam_size,hamming_beam_size):
        """Build sorted permutation lists over the combined binary keys.

        Must run after process2() (reads self.rbdp and self.rbp).
        """
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count
        # add permutations
        self.permutations = []
        for i in xrange(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)
        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)
        # build permutation lists: one sorted list of (permuted, original)
        # key pairs per permutation, enabling binary-search lookups later.
        self.permuted_lists = []
        i = 0
        for p in self.permutations:
            logging.info('Creating Permutation Index: #{}/{}'.format(i,len(self.permutations)))
            i+=1
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c,ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)

    def get_neighbour_keys(self,bucket_key,k):
        """Return up to *k* stored binary keys nearest to *bucket_key* in Hamming distance.

        Candidates are gathered from every permutation list, deduplicated,
        then ranked by exact Hamming distance.
        """
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100 Still really fast
        query_key = bitarray(bucket_key)
        topk = set()
        for i in xrange(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist,query_key,self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key = lambda x : hamming_distance(x,query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self,key1,key2,v):
        """Score every (vector-from-bucket1, vector-from-bucket2) pair against *v*.

        Computes cosine((a + b), v) for each pair via vectorized numpy ops.
        #return [(cos_dist,(idx1,idx2))]
        """
        def matrix_list(engine,key):
            # return a matrix and a list of keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for v,key in items:
                m.append(v)
                l.append(int(key))
            m = np.array(m)
            return m,l
        m1,l1 = matrix_list(self.engine1,key1)
        m2,l2 = matrix_list(self.engine2,key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v
        av = np.dot(m1,v)
        av = np.repeat(av,len2).reshape(len1,len2)
        # b . v
        bv = np.dot(m2,v)
        bv = np.repeat(bv,len1).reshape(len2,len1).T
        # nominator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v,2)
        # a.a
        aa = np.sum(m1*m1,axis = 1)
        aa = np.repeat(aa,len2).reshape(len1,len2)
        # b.b
        bb = np.sum(m2*m2,axis = 1)
        bb = np.repeat(bb,len1).reshape(len2,len1).T
        # a.b
        ab = np.dot(m1,m2.T)
        # denominator: |a + b| * |v| since |a+b|^2 = a.a + b.b + 2 a.b
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # distance matrix
        dism = nomi / deno
        dist = []
        for i in xrange(len1):
            for j in xrange(len2):
                dis = dism[i,j]
                dist.append((dis,(l1[i],l2[j])))
        return dist

    def neighbours2(self,v,n):
        """Return the *n* best-scoring index pairs for query vector *v*."""
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)
        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key,self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print bucket_key, len(comb)
            for key1,key2 in comb:
                dist = self.n2(key1,key2,v)
                dists.extend(dist)
        # Highest cosine score first.
        dists = sorted(dists,key = lambda x: -x[0])
        return dists[:n]
        # If there is no vector filter, just return list of candidates
        # (unreachable: the return above always fires)
        return dists