예제 #1
0
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Store the clustering configuration and build the scoring helpers.

     Args:
         queryFile: Accepted for interface compatibility; this constructor
             does not read it.
         candidatePath: Stored as ``self.candidate``.
         mu: Stored as ``self.mu`` and forwarded to ``Distance``.
         corpusFile: Forwarded to ``Distance`` together with ``mu``.
         sigma: Similarity threshold.
         lamda: Cluster threshold.
     """
     self.query = {}
     self.candidate = candidatePath
     self.tweet = {}
     self.mu = mu
     self.sigma = sigma  # similarity threshold
     self.lamda = lamda  # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)
     # A single parenthesised argument prints the same text on Python 2 and
     # Python 3, whereas the bare print statement is a SyntaxError on 3.
     print("corpus read done!")
예제 #2
0
파일: cluster.py 프로젝트: yaolili/ttg
 def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
     """Store the clustering configuration and build the scoring helpers.

     Args:
         queryFile: Accepted for interface compatibility; this constructor
             does not read it.
         candidatePath: Stored as ``self.candidate``.
         mu: Stored as ``self.mu`` and forwarded to ``Distance``.
         corpusFile: Forwarded to ``Distance`` together with ``mu``.
         sigma: Similarity threshold.
         lamda: Cluster threshold.
     """
     self.query = {}
     self.candidate = candidatePath
     self.tweet = {}
     self.mu = mu
     self.sigma = sigma      # similarity threshold
     self.lamda = lamda      # cluster threshold
     self.jaccInstance = Jaccard()
     self.klInstance = Distance(mu, corpusFile)
     # A single parenthesised argument prints the same text on Python 2 and
     # Python 3, whereas the bare print statement is a SyntaxError on 3.
     print("corpus read done!")
예제 #3
0
파일: cluster.py 프로젝트: yaolili/ttg
class Cluster:
    """Cluster each query's candidate tweets and emit one tweet per cluster.

    For every candidate file ``<candidatePath><n>.res.content.all``
    (n = 1 .. QUERY_COUNT) the tweets are read in file order until one has a
    retrieval score below SCORE_CUTOFF.  A tweet whose ``Distance.kl`` score
    against the query is at most ``sigma`` is kept and greedily assigned to
    the cluster holding its closest already-clustered tweet, or to a new
    cluster when even the closest one scores above ``lamda``.  ``Rank``
    then scores the tweets and the top-scored tweet of each cluster is
    written to the result file.
    """

    # Candidate files are named "1.res.content.all" .. "55.res.content.all".
    QUERY_COUNT = 55
    # Reading of a candidate file stops at the first score below this value.
    SCORE_CUTOFF = 4.59
    # Output file prefixes, in the order write() unpacks the open handles.
    OUTPUT_PREFIXES = ("res", "log", "qidWidKL", "widWidKL", "jaccScore")

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        """Store the configuration and build the scoring helpers.

        Args:
            queryFile: Accepted for interface compatibility; never read here.
            candidatePath: String prefix the candidate file names are
                appended to (plain concatenation, so include the separator).
            mu: Stored and forwarded to ``Distance``.
            corpusFile: Forwarded to ``Distance`` together with ``mu``.
            sigma: Similarity threshold (maximum query-tweet kl score kept).
            lamda: Cluster threshold (maximum tweet-tweet kl score for
                joining an existing cluster).
        """
        self.query = {}
        self.candidate = candidatePath
        self.tweet = {}
        self.mu = mu
        self.sigma = sigma      # similarity threshold
        self.lamda = lamda      # cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        # Single parenthesised argument: same output on Python 2 and 3.
        print("corpus read done!")

    def write(self, writePath, alpha, yibuson):
        """Cluster every query and write the result and diagnostic files.

        Five files named
        ``<prefix>.rmSW4.59S<sigma>.L<lamda>.A<alpha>.NJ<yibuson>`` are
        created under ``writePath`` (plain string concatenation), one per
        entry of OUTPUT_PREFIXES.  All of them are closed when the method
        returns or raises.

        Args:
            writePath: String prefix for the output file names.
            alpha: Passed through to ``Rank.combinedCov``.
            yibuson: Only used in the output file names; the Jaccard filter
                that would compare against it is disabled.

        Raises:
            SystemExit: If a candidate file reaches the score cutoff before
                any tweet has been clustered.
        """
        suffix = (".rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) +
                  ".A" + str(alpha) + ".NJ" + str(yibuson))
        handles = []
        try:
            # Opened inside the try so a failure part-way through still
            # closes the handles that were already opened.
            for prefix in self.OUTPUT_PREFIXES:
                handles.append(open(writePath + prefix + suffix, "w+"))
            result, log, log1, log2, log3 = handles
            log.write("Qid\tclusterCount\ttweetCount\n")

            for num in range(1, self.QUERY_COUNT + 1):
                name = str(num) + ".res.content.all"
                self.__resetQueryState()
                self.__readCandidates(name)
                self.__dumpScores(log1, log2, log3)

                rankInstance = Rank(self.widWidKL, self.widWidMin,
                                    self.widWidMax)
                self.widScore = rankInstance.combinedCov(
                    alpha, self.cluster, self.tweet)

                clusterCount = len(self.cluster)
                tweetCount = self.__selectRepresentatives()

                # NOTE(review): curQid stays -1 (an int) when the candidate
                # file is empty, and a resultList entry stays -1 when no
                # tweet of a cluster scores above 0; either makes the
                # concatenations below raise TypeError.  Left unchanged
                # because the intended fallback is not visible here.
                log.write("MB" + self.curQid + "\t" + str(clusterCount) +
                          "\t" + str(tweetCount) + "\n")
                for wid in self.resultList:
                    result.write("MB" + self.curQid + "\t" + "Q0\t" + wid +
                                 "\t1\t1\tYAO\n")
        finally:
            for handle in handles:
                handle.close()

    def __resetQueryState(self):
        """Reset every per-query accumulator before reading the next file."""
        self.curQid = -1
        self.cluster = []
        self.qidWidKL = {}
        self.qidWidMax = 0
        self.qidWidMin = 999
        self.widWidKL = {}
        self.widWidMax = 0
        self.widWidMin = 999
        self.widScore = {}
        self.jacc = {}
        self.jaccMax = 0
        self.jaccMin = 1
        self.resultList = []

    def __readCandidates(self, name):
        """Read one candidate file, scoring and clustering its tweets.

        Each line must hold eight tab-separated fields: qid, Qid, wid, rank,
        score, runName, tweet content and query content.
        """
        with open(self.candidate + name, "r") as fin:
            for i, line in enumerate(fin):
                (qid, Qid, wid, rank, score, runName,
                 wcontent, qcontent) = line.strip().split("\t")
                self.query[qid] = qcontent

                # First-pass selection on the retrieval score: stop at the
                # first line below the cutoff.
                if float(score) < self.SCORE_CUTOFF:
                    if not self.cluster:
                        print("break out of 4.59,  %s  , empty cluster!"
                              % name)
                        # Same effect as the interactive-only exit() helper
                        # without depending on the site module.
                        raise SystemExit()
                    break

                self.tweet[wid] = wcontent
                self.curQid = qid

                similarity = self.klInstance.kl(qcontent, wcontent)
                jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)

                # The Jaccard score is only logged; the additional
                # "jaccScore >= yibuson" filter is disabled.
                if similarity <= self.sigma:
                    pairKey = qid + "-" + wid

                    self.qidWidKL[pairKey] = similarity
                    self.qidWidMax = max(self.qidWidMax, similarity)
                    self.qidWidMin = min(self.qidWidMin, similarity)

                    self.jacc[pairKey] = jaccScore
                    self.jaccMax = max(self.jaccMax, jaccScore)
                    self.jaccMin = min(self.jaccMin, jaccScore)

                    if not self.cluster:
                        # The first kept tweet seeds the first cluster.
                        self.cluster.append([wid])
                        self.widWidKL[wid] = []
                    else:
                        self.__clustering(wid)

                if i % 100 == 0:
                    print("%s  =>  %s" % (name, i))

    def __dumpScores(self, log1, log2, log3):
        """Write the min/max line and every recorded score to each log."""
        log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
        for key in self.qidWidKL:
            log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")

        log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
        for key in self.widWidKL:
            for pair in self.widWidKL[key]:
                for widKey in pair:
                    log2.write(key + "-" + widKey + "\t" +
                               str(pair[widKey]) + "\n")

        log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
        for key in self.jacc:
            log3.write(key + "\t" + str(self.jacc[key]) + "\n")

    def __selectRepresentatives(self):
        """Append each cluster's top-scored wid to resultList.

        Returns:
            The total number of tweets across all clusters.
        """
        tweetCount = 0
        for members in self.cluster:
            maxScore = 0
            bestWid = -1    # stays -1 when no member scores above 0
            tweetCount += len(members)
            for wid in members:
                if self.widScore[wid] > maxScore:
                    maxScore = self.widScore[wid]
                    bestWid = wid
            self.resultList.append(bestWid)
        return tweetCount

    def __clustering(self, wid):
        """Assign ``wid`` to its closest cluster or start a new one.

        The kl score between ``wid`` and every already-clustered tweet is
        computed and recorded symmetrically in ``widWidKL``.  The tweet
        joins the cluster holding the lowest-scoring (most similar) member
        unless that score exceeds ``lamda``.
        """
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]
        for i, members in enumerate(self.cluster):
            for cwid in members:
                score = self.klInstance.kl(wcontent, self.tweet[cwid])
                self.widWidKL.setdefault(wid, []).append({cwid: score})
                self.widWidKL.setdefault(cwid, []).append({wid: score})

                # The lowest score marks the most similar tweet.
                if score < minScore:
                    minScore = score
                    index = i
                # Track the global tweet-tweet score range for this query.
                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score

        if minScore > self.lamda:
            # Not close enough to anything seen so far: new cluster.
            self.cluster.append([wid])
        else:
            self.cluster[index].append(wid)
예제 #4
0
class Cluster:
    """Cluster each query's candidate tweets and emit one tweet per cluster.

    For every candidate file ``<candidatePath><n>.res.content.all``
    (n = 1 .. QUERY_COUNT) the tweets are read in file order until one has a
    retrieval score below SCORE_CUTOFF.  A tweet whose ``Distance.kl`` score
    against the query is at most ``sigma`` is kept and greedily assigned to
    the cluster holding its closest already-clustered tweet, or to a new
    cluster when even the closest one scores above ``lamda``.  ``Rank``
    then scores the tweets and the top-scored tweet of each cluster is
    written to the result file.
    """

    # Candidate files are named "1.res.content.all" .. "55.res.content.all".
    QUERY_COUNT = 55
    # Reading of a candidate file stops at the first score below this value.
    SCORE_CUTOFF = 4.59
    # Output file prefixes, in the order write() unpacks the open handles.
    OUTPUT_PREFIXES = ("res", "log", "qidWidKL", "widWidKL", "jaccScore")

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        """Store the configuration and build the scoring helpers.

        Args:
            queryFile: Accepted for interface compatibility; never read here.
            candidatePath: String prefix the candidate file names are
                appended to (plain concatenation, so include the separator).
            mu: Stored and forwarded to ``Distance``.
            corpusFile: Forwarded to ``Distance`` together with ``mu``.
            sigma: Similarity threshold (maximum query-tweet kl score kept).
            lamda: Cluster threshold (maximum tweet-tweet kl score for
                joining an existing cluster).
        """
        self.query = {}
        self.candidate = candidatePath
        self.tweet = {}
        self.mu = mu
        self.sigma = sigma  # similarity threshold
        self.lamda = lamda  # cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        # Single parenthesised argument: same output on Python 2 and 3.
        print("corpus read done!")

    def write(self, writePath, alpha, yibuson):
        """Cluster every query and write the result and diagnostic files.

        Five files named
        ``<prefix>.rmSW4.59S<sigma>.L<lamda>.A<alpha>.NJ<yibuson>`` are
        created under ``writePath`` (plain string concatenation), one per
        entry of OUTPUT_PREFIXES.  All of them are closed when the method
        returns or raises.

        Args:
            writePath: String prefix for the output file names.
            alpha: Passed through to ``Rank.combinedCov``.
            yibuson: Only used in the output file names; the Jaccard filter
                that would compare against it is disabled.

        Raises:
            SystemExit: If a candidate file reaches the score cutoff before
                any tweet has been clustered.
        """
        suffix = (".rmSW4.59S" + str(self.sigma) + ".L" + str(self.lamda) +
                  ".A" + str(alpha) + ".NJ" + str(yibuson))
        handles = []
        try:
            # Opened inside the try so a failure part-way through still
            # closes the handles that were already opened.
            for prefix in self.OUTPUT_PREFIXES:
                handles.append(open(writePath + prefix + suffix, "w+"))
            result, log, log1, log2, log3 = handles
            log.write("Qid\tclusterCount\ttweetCount\n")

            for num in range(1, self.QUERY_COUNT + 1):
                name = str(num) + ".res.content.all"
                self.__resetQueryState()
                self.__readCandidates(name)
                self.__dumpScores(log1, log2, log3)

                rankInstance = Rank(self.widWidKL, self.widWidMin,
                                    self.widWidMax)
                self.widScore = rankInstance.combinedCov(
                    alpha, self.cluster, self.tweet)

                clusterCount = len(self.cluster)
                tweetCount = self.__selectRepresentatives()

                # NOTE(review): curQid stays -1 (an int) when the candidate
                # file is empty, and a resultList entry stays -1 when no
                # tweet of a cluster scores above 0; either makes the
                # concatenations below raise TypeError.  Left unchanged
                # because the intended fallback is not visible here.
                log.write("MB" + self.curQid + "\t" + str(clusterCount) +
                          "\t" + str(tweetCount) + "\n")
                for wid in self.resultList:
                    result.write("MB" + self.curQid + "\t" + "Q0\t" + wid +
                                 "\t1\t1\tYAO\n")
        finally:
            for handle in handles:
                handle.close()

    def __resetQueryState(self):
        """Reset every per-query accumulator before reading the next file."""
        self.curQid = -1
        self.cluster = []
        self.qidWidKL = {}
        self.qidWidMax = 0
        self.qidWidMin = 999
        self.widWidKL = {}
        self.widWidMax = 0
        self.widWidMin = 999
        self.widScore = {}
        self.jacc = {}
        self.jaccMax = 0
        self.jaccMin = 1
        self.resultList = []

    def __readCandidates(self, name):
        """Read one candidate file, scoring and clustering its tweets.

        Each line must hold eight tab-separated fields: qid, Qid, wid, rank,
        score, runName, tweet content and query content.
        """
        with open(self.candidate + name, "r") as fin:
            for i, line in enumerate(fin):
                (qid, Qid, wid, rank, score, runName,
                 wcontent, qcontent) = line.strip().split("\t")
                self.query[qid] = qcontent

                # First-pass selection on the retrieval score: stop at the
                # first line below the cutoff.
                if float(score) < self.SCORE_CUTOFF:
                    if not self.cluster:
                        print("break out of 4.59,  %s  , empty cluster!"
                              % name)
                        # Same effect as the interactive-only exit() helper
                        # without depending on the site module.
                        raise SystemExit()
                    break

                self.tweet[wid] = wcontent
                self.curQid = qid

                similarity = self.klInstance.kl(qcontent, wcontent)
                jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)

                # The Jaccard score is only logged; the additional
                # "jaccScore >= yibuson" filter is disabled.
                if similarity <= self.sigma:
                    pairKey = qid + "-" + wid

                    self.qidWidKL[pairKey] = similarity
                    self.qidWidMax = max(self.qidWidMax, similarity)
                    self.qidWidMin = min(self.qidWidMin, similarity)

                    self.jacc[pairKey] = jaccScore
                    self.jaccMax = max(self.jaccMax, jaccScore)
                    self.jaccMin = min(self.jaccMin, jaccScore)

                    if not self.cluster:
                        # The first kept tweet seeds the first cluster.
                        self.cluster.append([wid])
                        self.widWidKL[wid] = []
                    else:
                        self.__clustering(wid)

                if i % 100 == 0:
                    print("%s  =>  %s" % (name, i))

    def __dumpScores(self, log1, log2, log3):
        """Write the min/max line and every recorded score to each log."""
        log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
        for key in self.qidWidKL:
            log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")

        log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
        for key in self.widWidKL:
            for pair in self.widWidKL[key]:
                for widKey in pair:
                    log2.write(key + "-" + widKey + "\t" +
                               str(pair[widKey]) + "\n")

        log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
        for key in self.jacc:
            log3.write(key + "\t" + str(self.jacc[key]) + "\n")

    def __selectRepresentatives(self):
        """Append each cluster's top-scored wid to resultList.

        Returns:
            The total number of tweets across all clusters.
        """
        tweetCount = 0
        for members in self.cluster:
            maxScore = 0
            bestWid = -1  # stays -1 when no member scores above 0
            tweetCount += len(members)
            for wid in members:
                if self.widScore[wid] > maxScore:
                    maxScore = self.widScore[wid]
                    bestWid = wid
            self.resultList.append(bestWid)
        return tweetCount

    def __clustering(self, wid):
        """Assign ``wid`` to its closest cluster or start a new one.

        The kl score between ``wid`` and every already-clustered tweet is
        computed and recorded symmetrically in ``widWidKL``.  The tweet
        joins the cluster holding the lowest-scoring (most similar) member
        unless that score exceeds ``lamda``.
        """
        minScore = 999
        index = -1
        wcontent = self.tweet[wid]
        for i, members in enumerate(self.cluster):
            for cwid in members:
                score = self.klInstance.kl(wcontent, self.tweet[cwid])
                self.widWidKL.setdefault(wid, []).append({cwid: score})
                self.widWidKL.setdefault(cwid, []).append({wid: score})

                # The lowest score marks the most similar tweet.
                if score < minScore:
                    minScore = score
                    index = i
                # Track the global tweet-tweet score range for this query.
                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score

        if minScore > self.lamda:
            # Not close enough to anything seen so far: new cluster.
            self.cluster.append([wid])
        else:
            self.cluster[index].append(wid)