Example #1
# Imports required by this excerpt
import unittest

import numpy
import scipy.sparse

from nearpy.hashes import RandomDiscretizedProjections


class TestRandomDiscretizedProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
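For reference, a minimal standalone sketch of the constructor exercised above: the first argument is the hash name, the second the number of random projections, the third the bin width used to discretize each projection. Names below are illustrative, not taken from the test suite; the snippet assumes nearpy and numpy are installed.

import numpy
from nearpy.hashes import RandomDiscretizedProjections

# hash name, projection count, bin width
rdp = RandomDiscretizedProjections('myHash', 10, 0.1)
rdp.reset(100)  # dimension of the vectors that will be hashed
bucket_key = rdp.hash_vector(numpy.random.randn(100))[0]
print(bucket_key)  # a single underscore-joined key, e.g. '3_-1_0_...'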
Example #2
File: featurelsh.py  Project: zhekunz2/c4pp
# Imports inferred from the usage in this excerpt
from collections import defaultdict
from math import isnan

from nearpy.hashes import RandomDiscretizedProjections


class featureLsh():
    def __init__(self, stage, bucket):
        self.parentLevel = 5
        self.rdp = RandomDiscretizedProjections('rdp',
                                                stage,
                                                bucket,
                                                rand_seed=98412194)
        self.rdp.reset(5)
        self.hash_dict = {}
        self.data = defaultdict(list)

    def get_hash(self, vector):
        h = self.rdp.hash_vector(vector)[0]
        return h

    def set_hash(self, header):
        self.hash_dict["program"] = "program"
        for i in header:
            key_vec = i.split("_")
            vec = []
            for j in key_vec:
                vec.append(int(j))
            newkey = self.get_hash(vec)
            self.hash_dict[i] = newkey
        print("Setting hash done. Running lsh...")

    def update_dict(self, dicts):
        print("updating_dict")
        for record in dicts:
            newdict = {}
            for key, value in record.items():
                newkey = self.hash_dict[key]
                if newkey == "program":
                    newdict[newkey] = value
                else:
                    if newkey not in newdict:
                        if type(value) == str:
                            newdict[newkey] = float(value)
                        else:
                            if isnan(value):
                                newdict[newkey] = 0
                            else:
                                newdict[newkey] = float(value)
                    else:
                        if type(value) == str:
                            newdict[newkey] += float(value)
                        else:
                            if not isnan(value):
                                newdict[newkey] += float(value)
            for key, value in newdict.items():
                self.data[key].append(value)
Example #3
    def test_hash_memory_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)

        self.memory.store_hash_configuration(hash1)

        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRDPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
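The same store/reload round trip can be sketched standalone. This assumes the test's self.memory fixture is a nearpy MemoryStorage; any storage backend exposing store_hash_configuration / load_hash_configuration would work the same way.

from nearpy.hashes import RandomDiscretizedProjections
from nearpy.storage import MemoryStorage

memory = MemoryStorage()

# Configure a hash and persist its configuration
hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
hash1.reset(100)
memory.store_hash_configuration(hash1)

# Rebuild an equivalent hash purely from the stored configuration
hash2 = RandomDiscretizedProjections(None, None, None)
hash2.apply_config(memory.load_hash_configuration('testRDPHash'))
assert hash1.dim == hash2.dim and hash1.bin_width == hash2.bin_width
# (the projection matrix hash1.normals is restored as well, as the test above verifies)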
Example #4
File: path.py  Project: zhekunz2/c4pp
import sys
import hashlib
import antlr4
from antlr4 import ParseTreeWalker, ParserRuleContext

from template.Template2Lexer import Template2Lexer
from template.Template2Listener import Template2Listener
from template.Template2Parser import Template2Parser
from nearpy.hashes import RandomBinaryProjections, RandomDiscretizedProjections

MAX_PATH_LENGTH = 100

stacks = dict()
#rdp = RandomBinaryProjections('rbp', 100, rand_seed=98412194)
rdp = RandomDiscretizedProjections('rdp', 10, 1000, rand_seed=98412194)
rdp.reset(MAX_PATH_LENGTH)


def getHash(vector):
    # if len(vector) < MAX_PATH_LENGTH:
    #     vector = vector + (MAX_PATH_LENGTH-len(vector))*[0]
    h = rdp.hash_vector(vector)[0]

    return h


def update(d, entry):
    if entry in d:
        d[entry] += 1
    else:
        d[entry] = 1
Example #5
class FeatureBuilder(Template2Listener):

    def __init__(self, level):
        self.feature_vector = {}
        self.parentLevel = int(level)
        print(self.parentLevel)
        self.hashes = dict()

        self.rdp = RandomDiscretizedProjections('rdp', 5, 6, rand_seed=98412194)
        self.rdp.reset(self.parentLevel)

    def getHash(self, vector):
        if len(vector) < self.parentLevel:
            vector = vector + (self.parentLevel - len(vector)) * [0]
        h = self.rdp.hash_vector(vector)[0]
        # h = '_'.join([str(x) for x in vector])

        return h

    def getParents(self, ctx):
        curLevel = 0
        curNode = ctx
        path = []
        while curNode is not None and curLevel < self.parentLevel:
            #path.append(curNode.getRuleIndex())
            nodename = curNode.__class__.__name__
            path.append(fixed_hashes[nodename])
            curLevel += 1
            curNode = curNode.parentCtx
        return path

    def update_vector(self, ctx):
        if self.parentLevel <= 1:
            name = type(ctx).__name__
            if ctx.parentCtx is not None:
                parentName = type(ctx.parentCtx).__name__
                feature_name = 't_' + parentName + '_' + name
                if feature_name not in self.feature_vector:
                    self.feature_vector[feature_name] = 0
                self.feature_vector[feature_name] += 1
        else:
            path = self.getParents(ctx)
            name = self.getHash(path)

        if name not in self.feature_vector:
            self.feature_vector[name] = 0
        self.feature_vector[name] += 1

    def enterAddop(self, ctx):
        self.update_vector(ctx)

    def enterAnd(self, ctx):
        self.update_vector(ctx)

    def enterArray(self, ctx):
        self.update_vector(ctx)

    def enterArray_access(self, ctx):
        self.update_vector(ctx)

    def enterAssign(self, ctx):
        self.update_vector(ctx)

    def enterBlock(self, ctx):
        self.update_vector(ctx)

    def enterBrackets(self, ctx):
        self.update_vector(ctx)

    def enterData(self, ctx):
        self.update_vector(ctx)

    def enterDecl(self, ctx):
        self.update_vector(ctx)

    def enterPrimitive(self, ctx):
        self.update_vector(ctx)

    def enterNumber(self, ctx):
        self.update_vector(ctx)

    def enterDtype(self, ctx):
        self.update_vector(ctx)

    def enterVector(self, ctx):
        self.update_vector(ctx)

    def enterDims(self, ctx):
        self.update_vector(ctx)

    def enterVectorDIMS(self, ctx):
        self.update_vector(ctx)

    def enterLimits(self, ctx):
        self.update_vector(ctx)

    def enterPrior(self, ctx):
        self.update_vector(ctx)

    def enterParam(self, ctx):
        self.update_vector(ctx)

    def enterParams(self, ctx):
        self.update_vector(ctx)

    def enterDistexpr(self, ctx):
        self.update_vector(ctx)

    def enterLoopcomp(self, ctx):
        self.update_vector(ctx)

    def enterFor_loop(self, ctx):
        self.update_vector(ctx)

    def enterIf_stmt(self, ctx):
        self.update_vector(ctx)

    def enterElse_blk(self, ctx):
        self.update_vector(ctx)

    def enterFunction_call(self, ctx):
        self.update_vector(ctx)

    def enterFparam(self, ctx):
        self.update_vector(ctx)

    def enterFparams(self, ctx):
        self.update_vector(ctx)

    def enterReturn_or_param_type(self, ctx):
        self.update_vector(ctx)

    def enterFunction_decl(self, ctx):
        self.update_vector(ctx)

    def enterTransformedparam(self, ctx):
        self.update_vector(ctx)

    def enterTransformeddata(self, ctx):
        self.update_vector(ctx)

    def enterGeneratedquantities(self, ctx):
        self.update_vector(ctx)

    def enterFunctions(self, ctx):
        self.update_vector(ctx)

    def enterVal(self, ctx):
        self.update_vector(ctx)

    def enterDivop(self, ctx):
        self.update_vector(ctx)

    def enterString(self, ctx):
        self.update_vector(ctx)

    def enterExponop(self, ctx):
        self.update_vector(ctx)

    def enterMinusop(self, ctx):
        self.update_vector(ctx)

    def enterLt(self, ctx):
        self.update_vector(ctx)

    def enterUnary(self, ctx):
        self.update_vector(ctx)

    def enterEq(self, ctx):
        self.update_vector(ctx)

    def enterGt(self, ctx):
        self.update_vector(ctx)

    def enterRef(self, ctx):
        self.update_vector(ctx)

    def enterGeq(self, ctx):
        self.update_vector(ctx)

    def enterMulop(self, ctx):
        self.update_vector(ctx)

    def enterFunction(self, ctx):
        self.update_vector(ctx)

    def enterVecmulop(self, ctx):
        self.update_vector(ctx)

    def enterNe(self, ctx):
        self.update_vector(ctx)

    def enterLeq(self, ctx):
        self.update_vector(ctx)

    def enterTranspose(self, ctx):
        self.update_vector(ctx)

    def enterVecdivop(self, ctx):
        self.update_vector(ctx)

    def enterTernary(self, ctx):
        self.update_vector(ctx)

    def enterSubset(self, ctx):
        self.update_vector(ctx)

    def enterObserve(self, ctx):
        self.update_vector(ctx)

    def enterStatement(self, ctx):
        self.update_vector(ctx)

    def enterQuery(self, ctx):
        self.update_vector(ctx)

    def enterTemplate(self, ctx):
        self.update_vector(ctx)
Example #6
class DoubleEngine:

    def _build_rdp_engine(self,matrix,rdp,normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create engine with pipeline configuration, reusing the supplied rdp hash
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        # Overwrite the hash's projection vectors so that the two engines built
        # in process2 share the same projections (and therefore comparable buckets)
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
            
        return engine
    
        

    def process2(self,vectors1,vectors2,num_bit,bin_width):
        
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        
        # create new key
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        
        self.rbdp = {}

        print('len of buckets1', len(buckets1))
        print('len of buckets2', len(buckets2))

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in range(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if new_key not in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))
        
    def build_permute_index(self,num_permutation,beam_size,hamming_beam_size):
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count
        
        # add permutations
        self.permutations = []
        for i in range(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)

        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)

        # build permutation lists
        self.permuted_lists = []
        i = 0
        for p in self.permutations:
            logging.info('Creating Permutation Index: #{}/{}'.format(i,len(self.permutations)))
            i+=1
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c,ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)
        

    def get_neighbour_keys(self,bucket_key,k):
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100 Still really fast

        query_key = bitarray(bucket_key)
        topk = set()
        for i in range(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist,query_key,self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key = lambda x : hamming_distance(x,query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self,key1,key2,v):
        #return [(cos_dist,(idx1,idx2))]
        def matrix_list(engine,key):
            # return a matrix and a list of keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for v,key in items:
                m.append(v)
                l.append(int(key))
            m = np.array(m)    
            return m,l
        m1,l1 = matrix_list(self.engine1,key1)
        m2,l2 = matrix_list(self.engine2,key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v 
        av = np.dot(m1,v)
        av = np.repeat(av,len2).reshape(len1,len2)
        # b . v
        bv = np.dot(m2,v)
        bv = np.repeat(bv,len1).reshape(len2,len1).T
        # numerator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v,2)
        # a.a
        aa = np.sum(m1*m1,axis = 1)
        aa = np.repeat(aa,len2).reshape(len1,len2)
        # b.b
        bb = np.sum(m2*m2,axis = 1)
        bb = np.repeat(bb,len1).reshape(len2,len1).T
        # a.b
        ab = np.dot(m1,m2.T)
        # denominator 
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # distance matrix 
        dism = nomi / deno
        dist = []
        for i in range(len1):
            for j in range(len2):
                dis = dism[i,j]
                dist.append((dis,(l1[i],l2[j])))
        return dist

    def neighbours2(self,v,n):
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)

        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key,self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print(bucket_key, len(comb))
            for key1,key2 in comb:
                dist = self.n2(key1,key2,v)
                dists.extend(dist)

        dists = sorted(dists,key = lambda x: -x[0])
        return dists[:n]
        # If there is no vector filter, just return list of candidates
        return dists