Example No. 1
class TestRandomBinaryProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
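A minimal end-to-end sketch of the same objects outside the test harness, using the 100-dimension / 10-bit setup the tests assume; the engine size (1000 vectors) and integer payloads are illustrative.

import numpy
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

rbp = RandomBinaryProjections('testHash', 10)
engine = Engine(100, lshashes=[rbp])   # Engine calls rbp.reset(100) at construction

for i in range(1000):
    engine.store_vector(numpy.random.randn(100), data=i)

# neighbours() returns (vector, data, distance) tuples from matching buckets
for vector, data, distance in engine.neighbours(numpy.random.randn(100)):
    print(data, distance)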
Example No. 3
def build_environment(config):
    lsh = LSH_sumbeam()
    w2v = MyWord2Vec()
    w2v.load(config)
    lsh.w2v = w2v

    # combine the top 20k nouns and 20k adjectives into a single wordlist
    topn = config.getint('space', 'topn')
    words = w2v.model.vocab.keys()
    wordlist = WordList()
    wordlist.words = words
    wordlist.filter_frequency(w2v, topn)
    wordlist.build_index()

    # build a matrix
    matrix = lsh._list2matrix_w2v(wordlist, lsh.w2v)

    # build an engine
    dim = np.shape(matrix)[1]
    num_bits = 15
    rbp = RandomBinaryProjections('rbp', num_bits)
    rbp.reset(dim)
    engine = lsh._build_rbp_permute_engine(matrix, rbp)
    num_permutation = 50
    beam_size = 50
    num_neighbour = 100
    engine.build_permute_index(num_permutation, beam_size, num_neighbour)

    return lsh, engine, matrix, wordlist
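A hypothetical call site, shown only to make the config contract visible: the one option build_environment itself reads is config.getint('space', 'topn'), so a ConfigParser with a [space] section suffices. LSH_sumbeam, MyWord2Vec and WordList are project-specific classes, and whatever w2v.load(config) additionally needs (model paths etc.) is omitted here.

from configparser import ConfigParser

config = ConfigParser()
config['space'] = {'topn': '20000'}   # the only option build_environment itself reads

lsh, engine, matrix, wordlist = build_environment(config)

# If the permuted engine wraps a plain nearpy Engine, a row of the word
# matrix can serve as a query vector:
print(engine.neighbours(matrix[0])[:3])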
Example No. 4

def index_user_vectors():

    print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()

    print 'Indexing took %f seconds' % (t1 - t0)
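A minimal query sketch to go with the indexing above, assuming the module-level globals (user_vector, k_dimen, d_dimen) are already populated; nearpy's Engine.neighbours() returns (vector, data, distance) tuples, where data carries the user id stored above.

index_user_vectors()

some_user = next(iter(user_vector))
for vector, data, distance in engine_perm.neighbours(user_vector[some_user]):
    print(data, distance)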
Example No. 5

def index_user_vectors():

    #print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    #print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen,
                         lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
Example No. 6
    def build_index_sumbeam(self, num_bits):
        # hash the original vectors in matrix1 and matrix2 into engine1 and engine2
        self.dim = np.shape(self.matrix1)[1]
        rbp = RandomBinaryProjections('rbp', num_bits)
        rbp.reset(self.dim)
        self.rbp = rbp

        engine1 = self._build_rbp_permute_engine(self.matrix1, rbp)
        engine2 = self._build_rbp_permute_engine(self.matrix2, rbp)
        self.engine1 = engine1
        self.engine2 = engine2
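Why both engines share one RandomBinaryProjections instance: a vector's bucket key depends only on the signs of its projections onto the hash's random hyperplanes, so reusing the same hash makes keys comparable across the two matrices. A standalone check with plain nearpy objects (dimensions here are illustrative):

import numpy as np
from nearpy.hashes import RandomBinaryProjections

rbp = RandomBinaryProjections('rbp', 10)
rbp.reset(50)

v = np.random.randn(50)
key = rbp.hash_vector(v)[0]   # same instance, same hyperplanes, same key everywhere

# A second, independently seeded hash almost surely buckets v differently,
# which is why build_index_sumbeam() passes the same rbp to both engines.
other = RandomBinaryProjections('rbp2', 10)
other.reset(50)
print(key, other.hash_vector(v)[0])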
Example No. 7
    def generate_lsh_fn(self):
        self.lsh_fn = []
        for i in range(self.L):
            rbp = RandomBinaryProjections('rbp', self.K)
            rbp.reset(self.dim)

            # Bind rbp as a default argument so each fn keeps its own hash;
            # a plain closure would only ever see the last rbp from the loop.
            def fn(x, rbp=rbp):
                mm = mmh3.hash(rbp.hash_vector(x)[0])
                return mm % self.R
            self.lsh_fn.append(fn)
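A sketch of what each fn computes, assuming the mmh3 package: the K-bit bucket key string from nearpy is rehashed with MurmurHash3 and folded into one of R table slots. The values here (K=8, dim=32, R=1000) are illustrative, not taken from the class above.

import mmh3
import numpy as np
from nearpy.hashes import RandomBinaryProjections

R = 1000
rbp = RandomBinaryProjections('rbp', 8)
rbp.reset(32)

x = np.random.randn(32)
key = rbp.hash_vector(x)[0]   # e.g. '01101001'
slot = mmh3.hash(key) % R     # signed 32-bit hash, folded modulo R
print(key, slot)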
Example No. 8

    def __init__(self, *args):
        """
        Initialize the dictionary of reduced vectors that represent conference members.
        :param args[0] - data: Data class, it represents data from dblp.xml
        :param args[1] - dim: Output dimension for LSH.
        """
        print('Initializing recommender...')
        self.data = args[0]
        self.reduced_conferences = {}
        rbp = RandomBinaryProjections('rbp', args[1])
        rbp.reset(len(self.data.members_set))

        cnt = 0
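A hypothetical sketch of the reduction step this constructor prepares for: each conference is assumed to be a binary membership vector over data.members_set (names follow the docstring above), and rbp.hash_vector() compresses it to a dim-bit key.

import numpy as np
from nearpy.hashes import RandomBinaryProjections

members = ['ada', 'bob', 'eve']          # stand-in for data.members_set
vec = np.array([1.0, 0.0, 1.0])          # conference attended by ada and eve

rbp = RandomBinaryProjections('rbp', 4)  # dim = 4 output bits
rbp.reset(len(members))
print(rbp.hash_vector(vec)[0])           # a 4-character key, e.g. '1010'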
Example No. 9
    def test_hash_memory_storage_rbp(self):
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)

        self.memory.store_hash_configuration(hash1)

        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
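Inside the test, the element-wise comparison of the projection matrices can be collapsed into a single call; numpy's testing helper reports the first mismatching index on failure. hash1 and hash2 refer to the objects built in the test above.

import numpy.testing

numpy.testing.assert_array_equal(hash1.normals, hash2.normals)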
Example No. 11
class DoubleEngine:

    def _build_rdp_engine(self, matrix, rdp, normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)

        return engine

    def process2(self, vectors1, vectors2, num_bit, bin_width):

        # build engines
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp', num_bit, bin_width)
        self.rbp = RandomBinaryProjections('rbp', num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1, self.rdp, self.normals)
        self.engine2 = self._build_rdp_engine(vectors2, self.rdp, self.normals)

        # combine bucket keys from both engines into new keys
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']

        self.rbdp = {}

        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1, key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1, idx1, len(buckets1)))
            for idx2, key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if new_key not in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1, key2))
        
    def build_permute_index(self, num_permutation, beam_size, hamming_beam_size):
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count

        # add permutations
        self.permutations = []
        for i in xrange(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)

        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)

        # build permutation lists
        self.permuted_lists = []
        for i, p in enumerate(self.permutations):
            logging.info('Creating Permutation Index: #{}/{}'.format(i, len(self.permutations)))
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c, ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)
        

    def get_neighbour_keys(self, bucket_key, k):
        # O(np * beam * log(np * beam))
        #   np   = number of permutations
        #   beam = self.beam_size
        #   np * beam == 200 * 100 is still really fast

        query_key = bitarray(bucket_key)
        topk = set()
        for i in xrange(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist, query_key, self.beam_size)
            topk = topk.union(set(candidates))
        topk = sorted(topk, key=lambda x: hamming_distance(x, query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self, key1, key2, v):
        # returns [(cos_sim, (idx1, idx2))], where cos_sim = cos(a + b, v) for
        # each vector a in bucket key1 of engine1 and b in bucket key2 of engine2
        def matrix_list(engine, key):
            # return a matrix of the bucket's vectors and a list of their keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for vec, k in items:
                m.append(vec)
                l.append(int(k))
            m = np.array(m)
            return m, l
        m1,l1 = matrix_list(self.engine1,key1)
        m2,l2 = matrix_list(self.engine2,key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v 
        av = np.dot(m1,v)
        av = np.repeat(av,len2).reshape(len1,len2)
        # b . v
        bv = np.dot(m2,v)
        bv = np.repeat(bv,len1).reshape(len2,len1).T
        # numerator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v,2)
        # a.a
        aa = np.sum(m1*m1,axis = 1)
        aa = np.repeat(aa,len2).reshape(len1,len2)
        # b.b
        bb = np.sum(m2*m2,axis = 1)
        bb = np.repeat(bb,len1).reshape(len2,len1).T
        # a.b
        ab = np.dot(m1,m2.T)
        # denominator
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # cosine similarity matrix
        dism = nomi / deno
        dist = []
        for i in xrange(len1):
            for j in xrange(len2):
                dis = dism[i,j]
                dist.append((dis,(l1[i],l2[j])))
        return dist

    def neighbours2(self,v,n):
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)

        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key,self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print bucket_key, len(comb)
            for key1,key2 in comb:
                dist = self.n2(key1,key2,v)
                dists.extend(dist)

        dists = sorted(dists, key=lambda x: -x[0])
        return dists[:n]
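A quick numeric check of the identity n2() vectorizes: for rows a of m1 and b of m2, cos(a + b, v) = (a.v + b.v) / (sqrt(a.a + b.b + 2 a.b) * |v|), since |a + b|^2 = a.a + b.b + 2 a.b. All names here are local to the sketch.

import numpy as np

rng = np.random.default_rng(0)
a, b, v = rng.standard_normal((3, 16))

lhs = np.dot(a + b, v) / (np.linalg.norm(a + b) * np.linalg.norm(v))
rhs = (a.dot(v) + b.dot(v)) / (np.sqrt(a.dot(a) + b.dot(b) + 2 * a.dot(b)) * np.linalg.norm(v))
assert np.isclose(lhs, rhs)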
Example No. 12
if single_label:
    train_set = SingleLabelTextDataset('dataset/{}'.format(dataset), subset='train', bow_format=data_fmt, download=True)
    test_set = SingleLabelTextDataset('dataset/{}'.format(dataset), subset='test', bow_format=data_fmt, download=True)
else:
    train_set = MultiLabelTextDataset('dataset/{}'.format(dataset), subset='train', bow_format=data_fmt, download=True)
    test_set = MultiLabelTextDataset('dataset/{}'.format(dataset), subset='test', bow_format=data_fmt, download=True)
    
#########################################################################################################
def LSH_hash(bow):
    # relies on the global `lshash` reassigned for each nbits in the loop below
    return [int(b) for b in lshash.hash_vector(bow.toarray().squeeze())[0]]

with torch.no_grad():
    prec_results = []
    
    for nbits in [8, 16, 32, 64, 128]:
        lshash = RandomBinaryProjections('MyHash', nbits)
        lshash.reset(train_set.num_features())

        # get hash code
        train_hash = train_set.df.bow.apply(LSH_hash)
        test_hash = test_set.df.bow.apply(LSH_hash)

        # convert hash to Tensor
        train_b = torch.Tensor(list(train_hash)).type(torch.cuda.ByteTensor)
        test_b = torch.Tensor(list(test_hash)).type(torch.cuda.ByteTensor)
        
        if single_label:
            train_y = torch.Tensor(list(train_set.df.label))
            test_y = torch.Tensor(list(test_set.df.label))
        else:
            train_y = torch.from_numpy(sparse.vstack(list(train_set.df.label)).toarray())
            test_y = torch.from_numpy(sparse.vstack(list(test_set.df.label)).toarray())
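A standalone sketch of what LSH_hash does to one bag-of-words row, assuming CSR input like the rows train_set.df.bow is presumed to hold: nearpy returns the bucket key as a '0'/'1' string, which is unpacked into a list of ints for the Tensor. num_features and the density are illustrative.

import numpy as np
from scipy import sparse
from nearpy.hashes import RandomBinaryProjections

num_features = 1000
lshash = RandomBinaryProjections('MyHash', 8)
lshash.reset(num_features)

bow = sparse.random(1, num_features, density=0.01, format='csr')
bits = [int(b) for b in lshash.hash_vector(bow.toarray().squeeze())[0]]
print(bits)   # e.g. [0, 1, 1, 0, 1, 0, 0, 1]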
Example No. 13
import sys
import pandas as pd
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections


def update(d, val):
    # count occurrences of val; first sighting counts as 1
    if val in d:
        d[val] += 1
    else:
        d[val] = 1


rbp = RandomBinaryProjections('rbp', 5)
rbp.reset(1)
#engine = Engine(10, lshashes=[rbp])

lines = open(sys.argv[1]).readlines()

df = pd.DataFrame()
bigdict = dict()
i = 0
for line in lines:
    data = dict()
    triples = line.split(" ")[1:]

    for t in triples:
        update(data, rbp.hash_vector([float(t.split(",")[1])])[0])
    # for p in range(0, 200-len(data.values())):
    #     data[p]=0
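One property of this setup worth noting: with rbp.reset(1) the input space is one-dimensional, so each of the 5 projections of a scalar x is n_i * x and its sign depends only on sign(x). The 5-bit key can therefore take just two values (plus the all-zero key at x = 0), regardless of magnitude. A quick check:

from nearpy.hashes import RandomBinaryProjections

rbp = RandomBinaryProjections('rbp', 5)
rbp.reset(1)

# All positive scalars share one key; all negative scalars share the
# bitwise-complementary key.
pos_keys = {rbp.hash_vector([x])[0] for x in (0.1, 1.0, 42.0)}
neg_keys = {rbp.hash_vector([-x])[0] for x in (0.1, 1.0, 42.0)}
print(pos_keys, neg_keys)   # one distinct key in each set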