Example No. 1
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the HashPermutations engine
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the HashPermutationMapper engine
    print '\nNeighbour distances with HashPermutationMapper:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the engine with multiple binary hashes
    print '\nNeighbour distances with multiple binary hashes:'
    print '  -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
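
The function above drives three different hash set-ups through the same store-and-query cycle. As a quick reference, here is a minimal sketch of that shared cycle on its own. It uses only calls that already appear above (Engine, RandomBinaryProjections, CosineDistance, store_vector, neighbours); the import paths assume the standard NearPy package layout and are not part of the original snippet.

# Minimal NearPy store/query sketch (assumed imports; Python 3 print syntax)
import numpy
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.distances import CosineDistance

DIM = 100

# One binary-projection hash with 10 hyperplanes
rbp = RandomBinaryProjections('rbp_sketch', 10)
engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

# Index random vectors, storing the loop index as the attached data
for i in range(1000):
    engine.store_vector(numpy.random.randn(DIM), i)

# Query with a random vector; each result is a (vector, data, distance) tuple
query = numpy.random.randn(DIM)
for vector, data, distance in engine.neighbours(query):
    print(data, distance)
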
Example No. 2
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix: ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1 

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)
 
    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print '  -> Candidate count is %d' % self.engine.candidate_count(query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key = lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [ x for x in zip_seq if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
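
For reference, one possible way to drive the class above; the feature file name, vector dimension and query name are hypothetical placeholders, and the snippet assumes the NearPy/numpy/csv imports the class itself relies on are already in scope.

# Hypothetical usage of LSHSearch (all argument values are placeholders)
searcher = LSHSearch('features.txt', dimension=128, neighbour=10,
                     lsh_project_num=4)
searcher.build()                       # index every feature in the file
print(searcher.query(['alice_0001']))  # nearest faces for one made-up name
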
Example No. 3
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too much because of exact search)
    POINTS = 10000

    print('Creating engines')

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM,
                         lshashes=[permutations],
                         distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM,
                          lshashes=[permutations2],
                          distance=CosineDistance())

    print('Indexing %d random vectors of dimension %d' % (POINTS, DIM))

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i, :] = nearpy.utils.utils.unitvec(v)
        engine.store_vector(v, i)
        engine_rbpt.store_vector(v, i)
        engine_perm.store_vector(v, i)
        engine_perm2.store_vector(v, i)

    print('Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys()))
    print('Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys()))

    print('Building permuted index for HashPermutations')

    # Then update permuted index
    permutations.build_permuted_index()

    print('Generate random data')

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print('\nNeighbour distances with RandomBinaryProjectionTree:')
    print('  -> Candidate count is %d' % engine_rbpt.candidate_count(query))
    results = engine_rbpt.neighbours(query)
    print_results(results)

    # Do random query on engine 2
    print('\nNeighbour distances with RandomBinaryProjections:')
    print('  -> Candidate count is %d' % engine.candidate_count(query))
    results = engine.neighbours(query)
    print_results(results)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print('  -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    print_results(results)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutations2:')
    print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    print_results(results)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = nearpy.utils.utils.unitvec(query)
    query = query.reshape((DIM, 1))
    dists = CosineDistance().distance(matrix, query)
    dists = dists.reshape((-1, ))
    # dists = sorted(dists)

    dists_argsort = numpy.argsort(dists)

    results = [(None, d, dists[d]) for d in dists_argsort[:10]]
    print_results(results)
Example No. 4
class ClusterAnalyser:
    def __init__(self):
        from multiprocessing import Pool
        self.resetClusters()
        self.TUNE_INTERVAL = 1000
        self.ncnttot = 0
        self.ncntq = 0
        self.store_cnt = 20
        self.store_join_cnt = 20
        self.p = Pool(20)
        self.entropyLikelyhood = True
        self.tuneClusters = True
        self.cutters = [[0, None, 'getRandomContractionsMinCut']]
        self.simgraphparams = dict(usedropout=False)
        self.max_runtime = 1200
        self.start_time = None
        self.overrun = False
        self.min_lines_per_second = 20
        
    def resetClusters(self):
        # Every new cluster gets a unique id which is the key for this dictionary
        self.clusters = {}
        self.next_cluster_id = FI_CLUSTER_ID_OFFSET if opt_lang == 'fi' else 0
        # Locality Sensitive Hashing
        self.lsh_engine = Engine(vecs.dim, lshashes=[RandomBinaryProjections('rpb', HYPERPLANE_COUNT) for i in range(HASH_LAYERS)], distance=lsh_distance_func)

    # Returns closest clusters to a given sentence, in a sorted list of (distance, cluster) tuples.
    def query_clusters(self, query, idfs):

        doc_vec = document_to_vector(query.split(' '), idfs)

        if doc_vec is None:
            return None

        return sorted([(1 - doc_vec.dot(c.center) / c.norm, c) for id, c in self.clusters.iteritems()])


    # look for nearest cluster
    def lookupNearest(self, doc_vec, keywords=None, similarity=None ):
        lowest_index = -1
        ncnt = self.lsh_engine.candidate_count(doc_vec)
        self.ncnttot += ncnt
        self.ncntq += 1
        
        if similarity is not None:
            nearest_neighbours = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], keywords), self.lsh_engine.neighbours(doc_vec)))
        else:
            nearest_neighbours = list(filter(lambda x: filterKeywords(self.clusters[x[1]], keywords), self.lsh_engine.neighbours(doc_vec)))
        #nearest_neighbours = self.lsh_engine.neighbours(doc_vec)

        
        if len(nearest_neighbours) > 0:
            # get closest one from tuple (cluster vector, cluster index, distance)
            nn = min(nearest_neighbours, key=lambda x: (x[2]/self.clusters[x[1]].power))

            #if nn[2] < (CLUSTER_THRESHOLD*self.clusters[nn[1]].power):
            lowest_index = nn[1]
        return lowest_index
        




    def initNewCluster(self, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time):
        c = makeNewCluster(self.next_cluster_id, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time)
        self.addCluster(c)
        
    def addCluster(self, c):
        self.lsh_engine.store_vector(c.center, self.next_cluster_id)
        self.clusters[self.next_cluster_id] = c
        self.next_cluster_id += 1        
 


    def tuneClustersCall(self):
        line = self.line
        deleted_clusters = []
        print ('parallel preprocessing ... ')
        #parallel preprocessing 
        dlist = list(self.clusters.iteritems())
        params = [[self.line - self.TUNE_INTERVAL, self.entropyLikelyhood, self.cutters, self.simgraphparams]]*len(dlist)
        split_test_out = dict(self.p.map(doAnalyseSplit, zip(dlist, params)))
        

        print ('done')
        for c_idx, c in list(self.clusters.iteritems()): 
            if c_idx in deleted_clusters:
                continue
            #print ([c_idx, c])        
            if c.last_update > line - self.TUNE_INTERVAL:
                if len(c.documents) > 10:  
                    
                    if split_test_out[c_idx]['result']:
                        a = split_test_out[c_idx]['a']
                        b = split_test_out[c_idx]['b']
                        probJoin = split_test_out[c_idx]['probJoin']
                        probSplit = split_test_out[c_idx]['probSplit']
                        c.documents = list(map(lambda x: x[0],a))
                        c.text_data = list(map(lambda x: x[1],a))
                        c.word_index = dict()
                        for t in c.text_data:
                            for w in list(filter(lambda x: len(x) > 3, t[0][0].split(' ')[2:])):
                                c.word_index[w] = ''
                        
                        self.lsh_engine.delete_vector(c_idx)
                        c.center = np.mean(c.documents, axis=0)
                        c.norm   = np.linalg.norm(c.center)
                        c.updatePower()
                        self.lsh_engine.store_vector(c.center, c_idx)
                        # copy time parameters for now
                        print ("Split cluster %d into %d and %d  %f < %f" % (c_idx, len(a), len(b), probJoin, probSplit))
                        self.initNewCluster(list(map(lambda x: x[0],b)), list(map(lambda x: x[1][0],b)), c.last_update, c.created_at, c.lang, list(map(lambda x: x[1][1],b)))
                        if self.store_cnt > 0:
                            pickle.dump(dict(a=a,b=b,probJoin=probJoin,probSplit=probSplit),open('stored_split_cases_%d.pckl'%self.store_cnt,'wb'))
                            self.store_cnt -= 1
                        
            if len(c.documents) > 30:      
                # Test merge with random nearest
                nearest_neighbour_clusters = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], c.word_index), self.lsh_engine.neighbours(c.center)))#self.lsh_engine.neighbours(c.center)
                nearest_neighbour_clusters.sort(key=lambda x: x[2])
                maxrnd = min(len(nearest_neighbour_clusters),6)
                if len(nearest_neighbour_clusters) > 1:
                    ann, bnn = random.sample(nearest_neighbour_clusters[:maxrnd], 2)
                    
                    a= zip(self.clusters[ann[1]].documents, self.clusters[ann[1]].text_data)
                    b= zip(self.clusters[bnn[1]].documents, self.clusters[bnn[1]].text_data)
                    if len(a) < 20 and (not self.entropyLikelyhood): #or len(a) > 500 :
                        continue
                    if len(b) < 20 and (not self.entropyLikelyhood):  #or len(b) > 500 :
                        continue
                    if self.clusters[ann[1]].lang != self.clusters[bnn[1]].lang:
                        continue
                        
                    if self.entropyLikelyhood:
                        c = makeNewCluster(self.next_cluster_id, list(map(lambda x: x[0],a+b)), list(map(lambda x: x[1][0],a+b)), max(self.clusters[bnn[1]].last_update,self.clusters[ann[1]].last_update), max(self.clusters[bnn[1]].created_at,self.clusters[ann[1]].created_at), self.clusters[ann[1]].lang, list(map(lambda x: x[1][1],a+b)))
                        probJoin = computeEntropyLikelyhood(c, idfs)
                        wa = len(a)/(float(len(a))+len(b))
                        probSplit = (wa*computeEntropyLikelyhood(self.clusters[ann[1]],idfs)+(1-wa)*computeEntropyLikelyhood(self.clusters[bnn[1]],idfs))+(wa*math.log(wa)/math.log(2)+(1-wa)*math.log((1-wa))/math.log(2))+random.random()                        
                    else:
                        probJoin = computeNormalLikelyhood(a+b)
                        probSplit = computeNormalLikelyhood(a)*computeNormalLikelyhood(b)
                    if probJoin > probSplit:
                         deleted_clusters.append(ann[1])
                         deleted_clusters.append(bnn[1])
                         print ("Join clusters %d (%d) and %d (%d) %f > %f" % (ann[1], len(a), bnn[1], len(b), probJoin, probSplit))
                         if self.store_join_cnt > 0:
                             pickle.dump(dict(a=a,b=b,probJoin=probJoin,probSplit=probSplit),open('stored_join_cases_%d.pckl'%self.store_join_cnt,'wb'))
                             self.store_join_cnt -= 1
                         if self.entropyLikelyhood:
                             self.addCluster(c)
                         else:
                             self.initNewCluster(list(map(lambda x: x[0],a+b)), list(map(lambda x: x[1][0],a+b)), max(self.clusters[bnn[1]].last_update,self.clusters[ann[1]].last_update), max(self.clusters[bnn[1]].created_at,self.clusters[ann[1]].created_at), self.clusters[ann[1]].lang, list(map(lambda x: x[1][1],a+b)))
                         self.lsh_engine.delete_vector(ann[1])
                         self.clusters.pop(ann[1])
                         self.lsh_engine.delete_vector(bnn[1])
                         self.clusters.pop(bnn[1])
                                 
                                 
                  
                                
    def purgeClusters(self):
        line = self.line
        to_be_removed = []
        for k, c in self.clusters.iteritems():
            if line - c.last_update > (100000 * len(c.documents)) and len(c.documents) < 10:
                to_be_removed.append((k, c.center))

        for t in to_be_removed:
            self.lsh_engine.delete_vector(t[0])
            self.clusters.pop(t[0])

        if len(to_be_removed) > 0:
            print("Removed %d stagnant clusters" % len(to_be_removed))

    def calcGrowthRate(self):
        line = self.line
        tweet_time = self.tweet_time
        time_since_last_growth = self.time_since_last_growth
        for id, c in self.clusters.iteritems():
            #if (c.created_at < 1405555200000): # 17/07/2014 00:00:00
            #    continue

            c.calculateGrowthAndSentiment()
            
            ## calculate growth for first 12h
            #if len(c.hourly_growth_rate) < 12:
                #growth_rate = (len(c.text_data) - c.last_size) / float(time_since_last_growth) * 1000 * 60 * 60
                #if len(c.hourly_growth_rate) == 0:
                    #c.first_growth_time = tweet_time

                #c.hourly_growth_rate.append(growth_rate)

                ## calculate sentiment for new tweets
                #if len(c.documents) > c.last_size:
                    #cluster_vector = np.mean(c.documents[c.last_size:], axis=0)
                    #sentiment = getSentiment(cluster_vector)
                #else:
                    #sentiment = 0

                #c.hourly_sentiment.append(sentiment)

                ## calculate total sentiment so far
                #sentiment = getSentiment(np.mean(c.documents, axis=0))
                #c.hourly_accum_sentiment.append(sentiment)

                #c.last_size = len(c.text_data)
                #c.hourly_keywords.append(cluster_exporter.get_keywords(c, idfs)[:3])#['three','random','words']


                ## print quickly growing ones with high enough entropy
                ##if growth_rate < 10:
                #continue
                
                #entropy = cluster_exporter.calculate_cluster_entropy(c)
                #if entropy < ENTROPY_THRESHOLD:
                    #continue

                #print('Quickly growing cluster %d: %d tweets, %d tweets/h, entropy %.2f\n' % (id, len(c.text_data), int(growth_rate), entropy))
                #print('\n'.join(list(map(lambda x: x[0],random.sample(c.text_data, min(len(c.text), 8))))))
                #print('\n\n')
        
    # Every line in the input file should start with a timestamp in ms and the document id,
    # followed by the whole document, all separated by spaces and without newlines.
    #
    # Note: minimum word frequency is often implemented by the vector model already
    def construct_clusters(self, filename, from_line=0, from_date=None, to_date=None,idfs=None, lang=None):
        
        self.start_time = time.time()
        
        
        if lang != 'ru' and lang != 'fi':
            print("Lang must be 'ru' or 'fi'")
            return

        tweet_file = open(filename)

        try:
            self.line = 0
            

            # performance counter
            self.last_print_line = 0
            self.last_print_time = time.time()

            # used for calculating hourly growth in tweet time
            self.last_growth_calc = 0
            self.tweet_time = 0
            
            self.tweet_time_notz = datetime.utcfromtimestamp(0)
                
            for twlineesc in tweet_file:
                if  time.time() - self.start_time > self.max_runtime:
                    self.overrun = True
                    break
                
                twline = twlineesc.decode('unicode-escape').encode('utf-8')
                if len(twline) < 2:
                    continue
                twsplit = twline.split(',')
                try:
                    unix_tweet_time =int(time.mktime(datetime.strptime(twsplit[0], '%a %b %d %X +0000 %Y').timetuple()) * 1000)
                except:
                    print (twline)
                    print (twsplit[0])
                    raise Exception()
                tweet = " ".join([str(unix_tweet_time),twsplit[1],twsplit[4]])

                self.line += 1
                
                if self.line < from_line:
                    continue
                               
                               
                if self.tuneClusters:
                    if self.line % self.TUNE_INTERVAL == 0:
                        #pr.disable()
                        self.tuneClustersCall()
                        #pr.enable()
                    

                    
                # save periodically
                if False:#self.line % 1000000 == 0 and self.line != 0:
                    save_results(filename + '_' + str(self.line))    
                                   
                # remove really old clusters with a small amount of documents
                if self.line % 100000 == 0:
                    self.purgeClusters()
                    
                # print status
                if self.line % 1000 == 0:
                    #pr.disable()
                    new_time = time.time()
                    lps = int((self.line - self.last_print_line) / (new_time - self.last_print_time))
                    print("Line: %d, Date: %s, Clusters: %d, %d lines/s AVG candidates: %d" % (self.line, self.tweet_time_notz, len(self.clusters), lps, int(self.ncnttot/(self.ncntq+0.0000001))))
                    #if  int((self.line - self.last_print_line) / (new_time - self.last_print_time)) < 50:
                    #    s = StringIO.StringIO()
                    #    sortby = 'cumulative'
                    #    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                    #    ps.print_stats()
                    #    print (s.getvalue())
                    self.last_print_line = self.line
                    self.last_print_time = new_time
                    self.ncnttot = 0
                    self.ncntq = 0
                    if  time.time() - self.start_time > self.max_runtime or lps < self.min_lines_per_second:
                        self.overrun = True
                        break
                    #pr.enable()


                # calculate growth rate
                #self.time_since_last_growth = self.tweet_time - self.last_growth_calc
                #if self.time_since_last_growth > 1000 * 60 * 60:
                #    self.last_growth_calc = self.tweet_time
                #    self.calcGrowthRate()
                

                self.tweet_time = unix_tweet_time
                tweet_parts = tweet.strip().split(' ')
                #try:
                #    self.tweet_time  = int(tweet_parts[0])
                #except ValueError:
                #    print('Invalid document on line %d: %s' % (self.line, tweet))
                #    continue
                
                self.tweet_time_notz = datetime.utcfromtimestamp(self.tweet_time * 0.001)
                tweet_time_utc = utc.localize(self.tweet_time_notz)
                
                if from_date is not None and tweet_time_utc < from_date:
                    continue
                    
                if to_date is not None and tweet_time_utc > to_date:
                    break
                    
                # TEMP ignore gameinsight spam and short tweets
                if len(tweet_parts) < 6 or tweet.find('gameinsight') != -1:
                    continue


                # allocate tweet to cluster
                doc_vec = document_to_vector(tweet_parts[2:], idfs)

                if doc_vec is None:
                    continue
                
                keywords = list(filter(lambda x: len(x) > 4, tweet.strip().split(' ')[2:]))
                
                #ignore short tweets
                if len(keywords) < 6:
                    continue
                    
                lowest_index = self.lookupNearest(doc_vec, keywords, similarity=True)
                
                if lowest_index != -1:
                    c = self.clusters[lowest_index]
 
                    c.appendTweet(doc_vec, [[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time], self.line)
                    #c.documents.append(doc_vec)
                    #c.text_data.append([[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time])
                    #c.last_update = self.line


                    # update the cluster center if the cluster is small
                    if len(c.documents) > 0:
                        if len(c.documents) < 5:
                            self.lsh_engine.delete_vector(lowest_index)

                            c.center = np.mean(c.documents, axis=0)
                            c.norm   = np.linalg.norm(c.center)

                            self.lsh_engine.store_vector(c.center, lowest_index)
                        else:
                            if len(c.documents) < 100:
                                c.power = np.mean(np.std(c.documents, axis=0))
                else:
                    # no cluster found, construct new one
                    self.initNewCluster([doc_vec], [[tweet.strip(), twsplit[3], twsplit[2]]], self.line, self.tweet_time, lang,[self.tweet_time])
            
        except KeyboardInterrupt:
            print("Line: %d Clusters: %d" % (self.line, len(self.clusters)))
            print("Cancelled")
        self.p.close()
        self.p.join()
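
One pattern worth isolating from ClusterAnalyser is how it keeps cluster centers fresh in the LSH index: it removes the stored vector, recomputes the center, and stores it again under the same id. Below is a minimal sketch of that pattern, assuming the standard NearPy import layout; the ids and vectors are illustrative only.

# Sketch of the delete-then-restore update used for small clusters above
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

engine = Engine(50, lshashes=[RandomBinaryProjections('rbp_sketch', 10)])

cluster_id = 1
docs = [np.random.randn(50) for _ in range(3)]
engine.store_vector(np.mean(docs, axis=0), cluster_id)

# A new document joins the cluster: refresh the indexed center
docs.append(np.random.randn(50))
engine.delete_vector(cluster_id)
engine.store_vector(np.mean(docs, axis=0), cluster_id)
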
Example No. 5
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM,
                         lshashes=[permutations],
                         distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the HashPermutations engine
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM,
                          lshashes=[permutations2],
                          distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the HashPermutationMapper engine
    print '\nNeighbour distances with HashPermutationMapper:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Random query against the engine with multiple binary hashes
    print '\nNeighbour distances with multiple binary hashes:'
    print '  -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1, ))
    dists = sorted(dists)
    print dists[:10]
Example No. 6
def worker(A, start):
    # Note: relies on globals defined elsewhere in this script
    # (kkkk, engine, newcomparearrt, CC, clusresult).
    # starttime=datetime.datetime.now()
    # endtime=datetime.datetime.now()
    # timee=endtime-starttime
    timee = 0
    num = 0
    for j in xrange(kkkk):
        k1 = 0
        #for circ in range(1000):
        # starttime=datetime.datetime.now()
        #lshtruple=lsh.query(newcomparearrt[j+start*kkkk],1)
        #print type(engine)
        lshtruple = engine.neighbours(newcomparearrt[j + start * kkkk])
        #print lshtruple
        # endtime=datetime.datetime.now()
        # timee=timee+(endtime-starttime).seconds
        # if lshtruple:
        #     print lshtruple[0]

        for f in xrange(len(CC)):
            #print CC[f]
            if lshtruple:
                if tuple(CC[f]) == lshtruple[0][0]:
                    k1 = f
                    break

        #print k1
        length3 = len(clusresult[k1])
        temp = clusresult[k1]
        #.....................................................................................
        # lsh1=LSHash(6,3)
        # #print temp
        # ff=0
        # for ff in xrange(length3):
        #     lsh1.index(temp[ff])
        # starttime1=datetime.datetime.now()
        # if lsh1.query(newcomparearrt[j],1):
        #     num=num+1
        # endtime1=datetime.datetime.now()
        # timee=timee+(endtime1-starttime1).seconds
        # del lsh1
        #.....................................................................................
        #nearpy
        rbp1 = RandomBinaryProjections('rbp2', 10)
        DIM1 = 3
        engine1 = Engine(DIM1,
                         lshashes=[rbp1],
                         distance=CosineDistance(),
                         vector_filters=[NearestFilter(1)])
        for ff in xrange(length3):
            engine1.store_vector(temp[ff], ff)
        if engine1.candidate_count(newcomparearrt[j]):
            num = num + 1
        #print num
        #del engine
        results = engine1.neighbours(newcomparearrt[j])
        #  print results
        del engine1

        #...........................................................................
    A.append(num)
Example No. 7
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too much because of exact search)
    POINTS = 10000

    print 'Creating engines'

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print 'Indexing %d random vectors of dimension %d' % (POINTS, DIM)

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print 'Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys())
    print 'Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys())

    print 'Building permuted index for HashPermutations'

    # Then update permuted index
    permutations.build_permuted_index()

    print 'Generate random data'

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print '\nNeighbour distances with RandomBinaryProjectionTree:'
    print '  -> Candidate count is %d' % engine_rbpt.candidate_count(query)
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 2
    print '\nNeighbour distances with RandomBinaryProjections:'
    print '  -> Candidate count is %d' % engine.candidate_count(query)
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutations2:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
Example No. 8
class TestRandomBinaryProjectionTree(unittest.TestCase):
    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            print "Candidate count = %d" % self.engine.candidate_count(x)
            print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
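
Both storage tests above follow the same save-and-restore pattern for a hash configuration. Condensed outside of unittest, and assuming the standard NearPy import layout for Engine, the tree hash, the nearest filter and MemoryStorage (RedisStorage works the same way), the pattern looks roughly like this:

# Sketch of persisting and restoring a RandomBinaryProjectionTree configuration
import numpy
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjectionTree
from nearpy.filters import NearestFilter
from nearpy.storage import MemoryStorage

storage = MemoryStorage()

# Attaching the hash to an engine initialises its projection normals
rbpt = RandomBinaryProjectionTree('sketchHash', 10, 20)
engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

# Persist the configuration, then load it into a fresh hash object
storage.store_hash_configuration(rbpt)
restored = RandomBinaryProjectionTree(None, None, None)
restored.apply_config(storage.load_hash_configuration('sketchHash'))

# The restored hash produces the same bucket keys for the same vector
x = numpy.random.randn(100)
assert rbpt.hash_vector(x, querying=True) == restored.hash_vector(x, querying=True)
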
Example No. 9
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix: ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension,
                             lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)

    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print '  -> Candidate count is %d' % self.engine.candidate_count(
                query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key=lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [
            x for x in zip_seq
            if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)
        ]
Example No. 10
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost',
                                  port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            print "Candidate count = %d" % self.engine.candidate_count(x)
            print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)


        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])