Example #1
def test_delete_vector_multiple_hash(self):
    hashes = [UniBucket('name_hash_%d' % k) for k in range(10)]
    engine = Engine(self.dim, lshashes=hashes)
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value)
    self.check_delete(engine)
Example #2
def test_delete_vector_with_provided_value(self):
    engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value, self.removed_vector)
    self.check_delete(engine)
Example #3
def test_delete_vector_single_hash(self):
    engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value)
    self.check_delete(engine)
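The three tests above exercise the same store-then-delete round trip against Engine. Below is a minimal, self-contained sketch of that pattern, assuming the usual nearpy import paths (nearpy.Engine, nearpy.hashes.UniBucket) plus numpy; the fixture helpers fill_engine and check_delete used in the tests are not reproduced here.

import numpy as np
from nearpy import Engine
from nearpy.hashes import UniBucket

dim = 10
engine = Engine(dim, lshashes=[UniBucket('demoHash')])

# store a handful of vectors under integer keys
vectors = {k: np.random.randn(dim) for k in range(5)}
for key, vec in vectors.items():
    engine.store_vector(vec, key)

# delete by key alone, or also pass the stored vector,
# as test_delete_vector_with_provided_value does
engine.delete_vector(3)
engine.delete_vector(4, vectors[4])

# the deleted keys no longer show up among the neighbours
remaining_keys = set(n[1] for n in engine.neighbours(vectors[0]))
assert 3 not in remaining_keys and 4 not in remaining_keys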
Example #4
# Standard-library and third-party imports used below; project-level helpers
# and constants (makeNewCluster, document_to_vector, doAnalyseSplit, the
# filterKeyword*/compute*Likelyhood functions, idfs, vecs, opt_lang,
# lsh_distance_func, FI_CLUSTER_ID_OFFSET, HYPERPLANE_COUNT, HASH_LAYERS, ...)
# are defined elsewhere in the source module.
import math
import pickle
import random
import time
from datetime import datetime

import numpy as np
from pytz import utc  # utc.localize() is used below; assumed to be pytz.utc

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections


class ClusterAnalyser:
    def __init__(self):
        from multiprocessing import Pool
        self.resetClusters()
        self.TUNE_INTERVAL = 1000
        self.ncnttot = 0
        self.ncntq = 0
        self.store_cnt = 20
        self.store_join_cnt = 20
        self.p = Pool(20)
        self.entropyLikelyhood = True
        self.tuneClusters = True
        self.cutters = [[0, None, 'getRandomContractionsMinCut']]
        self.simgraphparams = dict(usedropout=False)
        self.max_runtime = 1200
        self.start_time = None
        self.overrun = False
        self.min_lines_per_second = 20

    def resetClusters(self):
        # Every new cluster gets a unique id which is the key for this dictionary
        self.clusters = {}
        self.next_cluster_id = FI_CLUSTER_ID_OFFSET if opt_lang == 'fi' else 0
        # Locality Sensitive Hashing
        self.lsh_engine = Engine(vecs.dim, lshashes=[RandomBinaryProjections('rpb', HYPERPLANE_COUNT) for i in range(HASH_LAYERS)], distance=lsh_distance_func)

    # Returns closest clusters to a given sentence, in a sorted list of (distance, cluster) tuples.
    def query_clusters(self, query, idfs):

        doc_vec = document_to_vector(query.split(' '), idfs)

        if doc_vec is None:
            return None

        return sorted([(1 - doc_vec.dot(c.center) / c.norm, c) for id, c in self.clusters.iteritems()])


    # look for nearest cluster
    def lookupNearest(self, doc_vec, keywords=None, similarity=None):
        lowest_index = -1
        ncnt = self.lsh_engine.candidate_count(doc_vec)
        self.ncnttot += ncnt
        self.ncntq += 1
        
        if similarity is not None:
            nearest_neighbours = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], keywords), self.lsh_engine.neighbours(doc_vec)))
        else:
            nearest_neighbours = list(filter(lambda x: filterKeywords(self.clusters[x[1]], keywords), self.lsh_engine.neighbours(doc_vec)))
        #nearest_neighbours = self.lsh_engine.neighbours(doc_vec)

        
        if len(nearest_neighbours) > 0:
            # get closest one from tuple (cluster vector, cluster index, distance)
            nn = min(nearest_neighbours, key=lambda x: (x[2]/self.clusters[x[1]].power))

            #if nn[2] < (CLUSTER_THRESHOLD*self.clusters[nn[1]].power):
            lowest_index = nn[1]
        return lowest_index

    def initNewCluster(self, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time):
        c = makeNewCluster(self.next_cluster_id, doc_vec, tweet_data, line, tweet_time, lang, tweet_post_time)
        self.addCluster(c)
        
    def addCluster(self, c):
        self.lsh_engine.store_vector(c.center, self.next_cluster_id)
        self.clusters[self.next_cluster_id] = c
        self.next_cluster_id += 1

    def tuneClustersCall(self):
        line = self.line
        deleted_clusters = []
        print ('parallel preprocessing ... ')
        #parallel preprocessing 
        dlist = list(self.clusters.iteritems())
        params = [[self.line - self.TUNE_INTERVAL, self.entropyLikelyhood, self.cutters, self.simgraphparams]]*len(dlist)
        split_test_out = dict(self.p.map(doAnalyseSplit, zip(dlist, params)))
        

        print ('done')
        for c_idx, c in list(self.clusters.iteritems()): 
            if c_idx in deleted_clusters:
                continue
            #print ([c_idx, c])        
            if c.last_update > line - self.TUNE_INTERVAL:
                if len(c.documents) > 10:  
                    
                    if split_test_out[c_idx]['result']:
                        a = split_test_out[c_idx]['a']
                        b = split_test_out[c_idx]['b']
                        probJoin = split_test_out[c_idx]['probJoin']
                        probSplit = split_test_out[c_idx]['probSplit']
                        c.documents = list(map(lambda x: x[0],a))
                        c.text_data = list(map(lambda x: x[1],a))
                        c.word_index = dict()
                        for t in c.text_data:
                            for w in list(filter(lambda x: len(x) > 3, t[0][0].split(' ')[2:])):
                                c.word_index[w] = ''
                        
                        self.lsh_engine.delete_vector(c_idx)
                        c.center = np.mean(c.documents, axis=0)
                        c.norm   = np.linalg.norm(c.center)
                        c.updatePower()
                        self.lsh_engine.store_vector(c.center, c_idx)
                        # copy time parameters for now
                        print ("Split cluster %d into %d and %d  %f < %f" % (c_idx, len(a), len(b), probJoin, probSplit))
                        self.initNewCluster(list(map(lambda x: x[0],b)), list(map(lambda x: x[1][0],b)), c.last_update, c.created_at, c.lang, list(map(lambda x: x[1][1],b)))
                        if self.store_cnt > 0:
                            pickle.dump(dict(a=a,b=b,probJoin=probJoin,probSplit=probSplit),open('stored_split_cases_%d.pckl'%self.store_cnt,'wb'))
                            self.store_cnt -= 1
                        
            if len(c.documents) > 30:      
                # Test merge with random nearest
                nearest_neighbour_clusters = list(filter(lambda x: filterKeywordSimilarity(self.clusters[x[1]], c.word_index), self.lsh_engine.neighbours(c.center)))#self.lsh_engine.neighbours(c.center)
                nearest_neighbour_clusters.sort(key=lambda x: x[2])
                maxrnd = min(len(nearest_neighbour_clusters),6)
                if len(nearest_neighbour_clusters) > 1:
                    ann, bnn = random.sample(nearest_neighbour_clusters[:maxrnd], 2)
                    
                    a = zip(self.clusters[ann[1]].documents, self.clusters[ann[1]].text_data)
                    b = zip(self.clusters[bnn[1]].documents, self.clusters[bnn[1]].text_data)
                    if len(a) < 20 and (not self.entropyLikelyhood): #or len(a) > 500 :
                        continue
                    if len(b) < 20 and (not self.entropyLikelyhood):  #or len(b) > 500 :
                        continue
                    if self.clusters[ann[1]].lang != self.clusters[bnn[1]].lang:
                        continue
                        
                    if self.entropyLikelyhood:
                        c = makeNewCluster(self.next_cluster_id, list(map(lambda x: x[0],a+b)), list(map(lambda x: x[1][0],a+b)), max(self.clusters[bnn[1]].last_update,self.clusters[ann[1]].last_update), max(self.clusters[bnn[1]].created_at,self.clusters[ann[1]].created_at), self.clusters[ann[1]].lang, list(map(lambda x: x[1][1],a+b)))
                        probJoin = computeEntropyLikelyhood(c, idfs)
                        wa = len(a)/(float(len(a))+len(b))
                        probSplit = (wa*computeEntropyLikelyhood(self.clusters[ann[1]],idfs)+(1-wa)*computeEntropyLikelyhood(self.clusters[bnn[1]],idfs))+(wa*math.log(wa)/math.log(2)+(1-wa)*math.log((1-wa))/math.log(2))+random.random()                        
                    else:
                        probJoin = computeNormalLikelyhood(a+b)
                        probSplit = computeNormalLikelyhood(a)*computeNormalLikelyhood(b)
                    if probJoin > probSplit:
                         deleted_clusters.append(ann[1])
                         deleted_clusters.append(bnn[1])
                         print ("Join clusters %d (%d) and %d (%d) %f > %f" % (ann[1], len(a), bnn[1], len(b), probJoin, probSplit))
                         if self.store_join_cnt > 0:
                             pickle.dump(dict(a=a,b=b,probJoin=probJoin,probSplit=probSplit),open('stored_join_cases_%d.pckl'%self.store_join_cnt,'wb'))
                             self.store_join_cnt -= 1
                         if self.entropyLikelyhood:
                             self.addCluster(c)
                         else:
                             self.initNewCluster(list(map(lambda x: x[0],a+b)), list(map(lambda x: x[1][0],a+b)), max(self.clusters[bnn[1]].last_update,self.clusters[ann[1]].last_update), max(self.clusters[bnn[1]].created_at,self.clusters[ann[1]].created_at), self.clusters[ann[1]].lang, list(map(lambda x: x[1][1],a+b)))
                         self.lsh_engine.delete_vector(ann[1])
                         self.clusters.pop(ann[1])
                         self.lsh_engine.delete_vector(bnn[1])
                         self.clusters.pop(bnn[1])

    def purgeClusters(self):
        line = self.line
        to_be_removed = []
        for k, c in self.clusters.iteritems():
            if line - c.last_update > (100000 * len(c.documents)) and len(c.documents) < 10:
                to_be_removed.append((k, c.center))

        for t in to_be_removed:
            self.lsh_engine.delete_vector(t[0])
            self.clusters.pop(t[0])

        if len(to_be_removed) > 0:
            print("Removed %d stagnant clusters" % len(to_be_removed))

    def calcGrowthRate(self):
        line = self.line
        tweet_time = self.tweet_time
        time_since_last_growth = self.time_since_last_growth
        for id, c in self.clusters.iteritems():
            #if (c.created_at < 1405555200000): # 17/07/2014 00:00:00
            #    continue

            c.calculateGrowthAndSentiment()
            
            ## calculate growth for first 12h
            #if len(c.hourly_growth_rate) < 12:
                #growth_rate = (len(c.text_data) - c.last_size) / float(time_since_last_growth) * 1000 * 60 * 60
                #if len(c.hourly_growth_rate) == 0:
                    #c.first_growth_time = tweet_time

                #c.hourly_growth_rate.append(growth_rate)

                ## calculate sentiment for new tweets
                #if len(c.documents) > c.last_size:
                    #cluster_vector = np.mean(c.documents[c.last_size:], axis=0)
                    #sentiment = getSentiment(cluster_vector)
                #else:
                    #sentiment = 0

                #c.hourly_sentiment.append(sentiment)

                ## calculate total sentiment so far
                #sentiment = getSentiment(np.mean(c.documents, axis=0))
                #c.hourly_accum_sentiment.append(sentiment)

                #c.last_size = len(c.text_data)
                #c.hourly_keywords.append(cluster_exporter.get_keywords(c, idfs)[:3])#['three','random','words']


                ## print quickly growing ones with high enough entropy
                ##if growth_rate < 10:
                #continue
                
                #entropy = cluster_exporter.calculate_cluster_entropy(c)
                #if entropy < ENTROPY_THRESHOLD:
                    #continue

                #print('Quickly growing cluster %d: %d tweets, %d tweets/h, entropy %.2f\n' % (id, len(c.text_data), int(growth_rate), entropy))
                #print('\n'.join(list(map(lambda x: x[0],random.sample(c.text_data, min(len(c.text), 8))))))
                #print('\n\n')
        
    # Every line in the input file should start with a timestamp in ms and id of document,
    # followed by the whole document, all separated with spaces and without newlines.
    #
    # Note: minimum word frequency is often implemented by the vector model already
    def construct_clusters(self, filename, from_line=0, from_date=None, to_date=None, idfs=None, lang=None):
        
        self.start_time = time.time()
        
        
        if lang != 'ru' and lang != 'fi':
            print("Lang must be 'ru' or 'fi'")
            return

        tweet_file = open(filename)

        try:
            self.line = 0
            

            # performance counter
            self.last_print_line = 0
            self.last_print_time = time.time()

            # used for calculating hourly growth in tweet time
            self.last_growth_calc = 0
            self.tweet_time = 0
            
            self.tweet_time_notz = datetime.utcfromtimestamp(0)
                
            for twlineesc in tweet_file:
                if  time.time() - self.start_time > self.max_runtime:
                    self.overrun = True
                    break
                
                twline = twlineesc.decode('unicode-escape').encode('utf-8')
                if len(twline) < 2:
                    continue
                twsplit = twline.split(',')
                try:
                    unix_tweet_time = int(time.mktime(datetime.strptime(twsplit[0], '%a %b %d %X +0000 %Y').timetuple()) * 1000)
                except Exception:
                    # print the offending line, then re-raise the original error
                    print (twline)
                    print (twsplit[0])
                    raise
                tweet = " ".join([str(unix_tweet_time),twsplit[1],twsplit[4]])

                self.line += 1
                
                if self.line < from_line:
                    continue
                               
                               
                if self.tuneClusters:
                    if self.line % self.TUNE_INTERVAL == 0:
                        #pr.disable()
                        self.tuneClustersCall()
                        #pr.enable()
                    

                    
                # save periodically
                if False:#self.line % 1000000 == 0 and self.line != 0:
                    save_results(filename + '_' + str(self.line))    
                                   
                # remove really old clusters with a small amount of documents
                if self.line % 100000 == 0:
                    self.purgeClusters()
                    
                # print status
                if self.line % 1000 == 0:
                    #pr.disable()
                    new_time = time.time()
                    lps = int((self.line - self.last_print_line) / (new_time - self.last_print_time))
                    print("Line: %d, Date: %s, Clusters: %d, %d lines/s AVG candidates: %d" % (self.line, self.tweet_time_notz, len(self.clusters), lps, int(self.ncnttot/(self.ncntq+0.0000001))))
                    #if  int((self.line - self.last_print_line) / (new_time - self.last_print_time)) < 50:
                    #    s = StringIO.StringIO()
                    #    sortby = 'cumulative'
                    #    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                    #    ps.print_stats()
                    #    print (s.getvalue())
                    self.last_print_line = self.line
                    self.last_print_time = new_time
                    self.ncnttot = 0
                    self.ncntq = 0
                    if  time.time() - self.start_time > self.max_runtime or lps < self.min_lines_per_second:
                        self.overrun = True
                        break
                    #pr.enable()


                # calculate growth rate
                #self.time_since_last_growth = self.tweet_time - self.last_growth_calc
                #if self.time_since_last_growth > 1000 * 60 * 60:
                #    self.last_growth_calc = self.tweet_time
                #    self.calcGrowthRate()
                

                self.tweet_time = unix_tweet_time
                tweet_parts = tweet.strip().split(' ')
                #try:
                #    self.tweet_time  = int(tweet_parts[0])
                #except ValueError:
                #    print('Invalid document on line %d: %s' % (self.line, tweet))
                #    continue
                
                self.tweet_time_notz = datetime.utcfromtimestamp(self.tweet_time * 0.001)
                tweet_time_utc = utc.localize(self.tweet_time_notz)
                
                if from_date is not None and tweet_time_utc < from_date:
                    continue
                    
                if to_date is not None and tweet_time_utc > to_date:
                    break
                    
                # TEMP ignore gameinsight spam and short tweets
                if len(tweet_parts) < 6 or tweet.find('gameinsight') != -1:
                    continue


                # allocate tweet to cluster
                doc_vec = document_to_vector(tweet_parts[2:], idfs)

                if doc_vec is None:
                    continue
                
                keywords = list(filter(lambda x: len(x) > 4, tweet.strip().split(' ')[2:]))
                
                #ignore short tweets
                if len(keywords) < 6:
                    continue
                    
                lowest_index = self.lookupNearest(doc_vec, keywords, similarity=True)
                
                if lowest_index != -1:
                    c = self.clusters[lowest_index]
 
                    c.appendTweet(doc_vec, [[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time], self.line)
                    #c.documents.append(doc_vec)
                    #c.text_data.append([[tweet.strip(), twsplit[3], twsplit[2]], self.tweet_time])
                    #c.last_update = self.line


                    # update the cluster center if the cluster is small
                    if len(c.documents) > 0:
                        if len(c.documents) < 5:
                            self.lsh_engine.delete_vector(lowest_index)

                            c.center = np.mean(c.documents, axis=0)
                            c.norm   = np.linalg.norm(c.center)

                            self.lsh_engine.store_vector(c.center, lowest_index)
                        else:
                            if len(c.documents) < 100:
                                c.power = np.mean(np.std(c.documents, axis=0))
                else:
                    # no cluster found, construct new one
                    self.initNewCluster([doc_vec], [[tweet.strip(), twsplit[3], twsplit[2]]], self.line, self.tweet_time, lang,[self.tweet_time])
            
        except KeyboardInterrupt:
            print("Line: %d Clusters: %d" % (self.line, len(self.clusters)))
            print("Cancelled")
        self.p.close()
        self.p.join()
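In the ClusterAnalyser example above, a cluster whose center moves is re-indexed by deleting its old LSH entry and storing the new center under the same id (see the delete_vector/store_vector pairs in tuneClustersCall and in the small-cluster branch of construct_clusters). The sketch below isolates that pattern; the dimension, hyperplane count, layer count and per-layer hash names are illustrative stand-ins for the module-level constants, and the custom distance function from resetClusters is omitted.

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

# illustrative stand-ins for vecs.dim, HYPERPLANE_COUNT and HASH_LAYERS
DIM, HYPERPLANE_COUNT, HASH_LAYERS = 50, 10, 4
engine = Engine(DIM, lshashes=[RandomBinaryProjections('rbp_%d' % i, HYPERPLANE_COUNT)
                               for i in range(HASH_LAYERS)])

cluster_id = 0
documents = [np.random.randn(DIM) for _ in range(3)]
center = np.mean(documents, axis=0)
engine.store_vector(center, cluster_id)

# a new document joins the cluster and the center moves
documents.append(np.random.randn(DIM))
new_center = np.mean(documents, axis=0)

engine.delete_vector(cluster_id)             # drop the stale index entry
engine.store_vector(new_center, cluster_id)  # re-index under the same id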