def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
    """Set up the thresholds and scoring helpers used for clustering.

    Args:
        queryFile: Accepted for interface compatibility; not read here.
        candidatePath: Path prefix that candidate file names are appended to.
        mu: Stored on the instance and forwarded to ``Distance``.
        corpusFile: Forwarded to ``Distance`` together with ``mu``.
        sigma: Similarity threshold.
        lamda: Cluster threshold.
    """
    self.query = {}
    self.candidate = candidatePath
    self.tweet = {}
    self.mu = mu
    self.sigma = sigma  # similarity threshold
    self.lamda = lamda  # cluster threshold
    self.jaccInstance = Jaccard()
    self.klInstance = Distance(mu, corpusFile)
    # Single-argument call form prints the same text on Python 2 and 3.
    print("corpus read done!")
class Cluster:
    """Cluster candidate tweets per query and emit one tweet per cluster.

    For each candidate file the rows are scanned in order; tweets whose
    query-tweet KL score is within ``sigma`` are grouped into clusters using
    the tweet-tweet KL score and the ``lamda`` threshold, then the
    best-scoring tweet of every cluster is written to the result file.
    """

    # Candidate files are named "1.res.content.all" .. "55.res.content.all".
    FILE_COUNT = 55
    # The scan of a candidate file stops at the first row scoring below this.
    # NOTE: the same value is baked into the output file names ("rmSW4.59S").
    SCORE_CUTOFF = 4.59

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        """Set up the thresholds and scoring helpers used for clustering.

        Args:
            queryFile: Accepted for interface compatibility; not read here.
            candidatePath: Path prefix that candidate file names are
                appended to.
            mu: Stored on the instance and forwarded to ``Distance``.
            corpusFile: Forwarded to ``Distance`` together with ``mu``.
            sigma: Similarity threshold.
            lamda: Cluster threshold.
        """
        self.query = {}
        self.candidate = candidatePath
        self.tweet = {}
        self.mu = mu
        self.sigma = sigma  # similarity threshold
        self.lamda = lamda  # cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        # Single-argument call form prints the same text on Python 2 and 3.
        print("corpus read done!")

    def write(self, writePath, alpha, yibuson):
        """Process every candidate file and write result and log files.

        Five files are created under ``writePath`` (result, cluster-count
        log, query-tweet KL log, tweet-tweet KL log, Jaccard log). They are
        closed on every exit path, including errors and ``SystemExit``.

        Args:
            writePath: Path prefix for all output files.
            alpha: Forwarded to ``Rank.combinedCov``; also part of the
                output file names.
            yibuson: Only used in the output file names (the Jaccard filter
                that would consume it is disabled).
        """
        handles = []
        try:
            for prefix in ("res", "log", "qidWidKL", "widWidKL", "jaccScore"):
                path = self._outputPath(writePath, prefix, alpha, yibuson)
                handles.append(open(path, "w+"))
            result, log, log1, log2, log3 = handles
            log.write("Qid\tclusterCount\ttweetCount\n")

            for num in range(1, self.FILE_COUNT + 1):
                fileName = str(num) + ".res.content.all"
                # All per-query state must start fresh for each file.
                self._resetQueryState()
                self._readCandidates(fileName)
                self._writeScoreLogs(log1, log2, log3)

                rankInstance = Rank(self.widWidKL, self.widWidMin,
                                    self.widWidMax)
                self.widScore = rankInstance.combinedCov(
                    alpha, self.cluster, self.tweet)

                clusterCount = len(self.cluster)
                tweetCount = sum(len(members) for members in self.cluster)
                self.resultList = self._pickRepresentatives()

                # str() keeps the "no row accepted" case (curQid == -1) from
                # raising TypeError on str + int concatenation.
                qidTag = "MB" + str(self.curQid)
                log.write(qidTag + "\t" + str(clusterCount) + "\t" +
                          str(tweetCount) + "\n")
                for wid in self.resultList:
                    result.write(qidTag + "\t" + "Q0\t" + wid +
                                 "\t1\t1\tYAO\n")
        finally:
            for handle in handles:
                handle.close()

    def _outputPath(self, writePath, prefix, alpha, yibuson):
        """Build the parameter-tagged path of one output file."""
        return (writePath + prefix + ".rmSW4.59S" + str(self.sigma) +
                ".L" + str(self.lamda) + ".A" + str(alpha) +
                ".NJ" + str(yibuson))

    def _resetQueryState(self):
        """Reset everything that is accumulated per candidate file."""
        self.curQid = -1
        self.cluster = []
        self.qidWidKL = {}
        self.qidWidMax = 0
        self.qidWidMin = 999
        self.widWidKL = {}
        self.widWidMax = 0
        self.widWidMin = 999
        self.widScore = {}
        self.jacc = {}
        self.jaccMax = 0
        self.jaccMin = 1
        self.resultList = []

    def _readCandidates(self, fileName):
        """Scan one candidate file, collecting scores and building clusters."""
        import sys

        readPath = self.candidate + fileName
        with open(readPath, "r") as fin:
            for i, line in enumerate(fin):
                # Exactly eight tab-separated fields are expected per row.
                (qid, _Qid, wid, _rank, score, _runName,
                 wcontent, qcontent) = line.strip().split("\t")
                self.query[qid] = qcontent

                # First-pass selection: stop at the first low-scoring row.
                if float(score) < self.SCORE_CUTOFF:
                    if not self.cluster:
                        print("break out of 4.59,  %s  , empty cluster!"
                              % fileName)
                        # Same SystemExit as the site-provided exit(), but
                        # available even when the site module is not loaded.
                        sys.exit()
                    break

                self.tweet[wid] = wcontent
                self.curQid = qid

                similarity = self.klInstance.kl(self.query[qid], wcontent)
                jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)

                if similarity <= self.sigma:
                    pairKey = qid + "-" + wid
                    self.qidWidKL[pairKey] = similarity
                    if similarity > self.qidWidMax:
                        self.qidWidMax = similarity
                    if similarity < self.qidWidMin:
                        self.qidWidMin = similarity

                    self.jacc[pairKey] = jaccScore
                    if self.jaccMax < jaccScore:
                        self.jaccMax = jaccScore
                    if self.jaccMin > jaccScore:
                        self.jaccMin = jaccScore

                    # The first accepted tweet seeds the first cluster.
                    if not self.cluster:
                        self.cluster.append([wid])
                        self.widWidKL[wid] = []
                    else:
                        self.__clustering(wid)

                if (i % 100) == 0:
                    print("%s  =>  %s" % (fileName, i))

    def _writeScoreLogs(self, log1, log2, log3):
        """Dump the min/max header and every recorded score to the logs."""
        log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
        for key in self.qidWidKL:
            log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")

        log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
        for key in self.widWidKL:
            for pair in self.widWidKL[key]:
                for widKey in pair:
                    log2.write(key + "-" + widKey + "\t" +
                               str(pair[widKey]) + "\n")

        log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
        for key in self.jacc:
            log3.write(key + "\t" + str(self.jacc[key]) + "\n")

    def _pickRepresentatives(self):
        """Return the highest-scoring tweet id of every cluster, in order.

        Ties go to the tweet that appears first in the cluster. A cluster
        whose scores are all <= 0 still yields a real tweet id rather than
        a -1 placeholder.
        """
        return [max(members, key=lambda wid: self.widScore[wid])
                for members in self.cluster]

    def __clustering(self, wid):
        """Assign ``wid`` to the closest existing cluster or start a new one.

        The tweet is compared with every already-clustered tweet; each score
        is recorded symmetrically in ``self.widWidKL`` and folded into the
        running ``widWidMin`` / ``widWidMax``.
        """
        minScore = 999
        index = -1
        for i, members in enumerate(self.cluster):
            for cwid in members:
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                self.widWidKL.setdefault(wid, []).append({cwid: score})
                self.widWidKL.setdefault(cwid, []).append({wid: score})

                # The smallest score marks the most similar tweet.
                if score < minScore:
                    minScore = score
                    index = i

                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score

        if minScore > self.lamda:
            # Nothing is close enough: open a new cluster.
            self.cluster.append([wid])
        else:
            # Join the cluster holding the most similar tweet.
            self.cluster[index].append(wid)
class Cluster:
    """Cluster candidate tweets per query and emit one tweet per cluster.

    For each candidate file the rows are scanned in order; tweets whose
    query-tweet KL score is within ``sigma`` are grouped into clusters using
    the tweet-tweet KL score and the ``lamda`` threshold, then the
    best-scoring tweet of every cluster is written to the result file.
    """

    # Candidate files are named "1.res.content.all" .. "55.res.content.all".
    FILE_COUNT = 55
    # The scan of a candidate file stops at the first row scoring below this.
    # NOTE: the same value is baked into the output file names ("rmSW4.59S").
    SCORE_CUTOFF = 4.59

    def __init__(self, queryFile, candidatePath, mu, corpusFile, sigma, lamda):
        """Set up the thresholds and scoring helpers used for clustering.

        Args:
            queryFile: Accepted for interface compatibility; not read here.
            candidatePath: Path prefix that candidate file names are
                appended to.
            mu: Stored on the instance and forwarded to ``Distance``.
            corpusFile: Forwarded to ``Distance`` together with ``mu``.
            sigma: Similarity threshold.
            lamda: Cluster threshold.
        """
        self.query = {}
        self.candidate = candidatePath
        self.tweet = {}
        self.mu = mu
        self.sigma = sigma  # similarity threshold
        self.lamda = lamda  # cluster threshold
        self.jaccInstance = Jaccard()
        self.klInstance = Distance(mu, corpusFile)
        # Single-argument call form prints the same text on Python 2 and 3.
        print("corpus read done!")

    def write(self, writePath, alpha, yibuson):
        """Process every candidate file and write result and log files.

        Five files are created under ``writePath`` (result, cluster-count
        log, query-tweet KL log, tweet-tweet KL log, Jaccard log). They are
        closed on every exit path, including errors and ``SystemExit``.

        Args:
            writePath: Path prefix for all output files.
            alpha: Forwarded to ``Rank.combinedCov``; also part of the
                output file names.
            yibuson: Only used in the output file names (the Jaccard filter
                that would consume it is disabled).
        """
        handles = []
        try:
            for prefix in ("res", "log", "qidWidKL", "widWidKL", "jaccScore"):
                path = self._outputPath(writePath, prefix, alpha, yibuson)
                handles.append(open(path, "w+"))
            result, log, log1, log2, log3 = handles
            log.write("Qid\tclusterCount\ttweetCount\n")

            for num in range(1, self.FILE_COUNT + 1):
                fileName = str(num) + ".res.content.all"
                # All per-query state must start fresh for each file.
                self._resetQueryState()
                self._readCandidates(fileName)
                self._writeScoreLogs(log1, log2, log3)

                rankInstance = Rank(self.widWidKL, self.widWidMin,
                                    self.widWidMax)
                self.widScore = rankInstance.combinedCov(
                    alpha, self.cluster, self.tweet)

                clusterCount = len(self.cluster)
                tweetCount = sum(len(members) for members in self.cluster)
                self.resultList = self._pickRepresentatives()

                # str() keeps the "no row accepted" case (curQid == -1) from
                # raising TypeError on str + int concatenation.
                qidTag = "MB" + str(self.curQid)
                log.write(qidTag + "\t" + str(clusterCount) + "\t" +
                          str(tweetCount) + "\n")
                for wid in self.resultList:
                    result.write(qidTag + "\t" + "Q0\t" + wid +
                                 "\t1\t1\tYAO\n")
        finally:
            for handle in handles:
                handle.close()

    def _outputPath(self, writePath, prefix, alpha, yibuson):
        """Build the parameter-tagged path of one output file."""
        return (writePath + prefix + ".rmSW4.59S" + str(self.sigma) +
                ".L" + str(self.lamda) + ".A" + str(alpha) +
                ".NJ" + str(yibuson))

    def _resetQueryState(self):
        """Reset everything that is accumulated per candidate file."""
        self.curQid = -1
        self.cluster = []
        self.qidWidKL = {}
        self.qidWidMax = 0
        self.qidWidMin = 999
        self.widWidKL = {}
        self.widWidMax = 0
        self.widWidMin = 999
        self.widScore = {}
        self.jacc = {}
        self.jaccMax = 0
        self.jaccMin = 1
        self.resultList = []

    def _readCandidates(self, fileName):
        """Scan one candidate file, collecting scores and building clusters."""
        import sys

        readPath = self.candidate + fileName
        with open(readPath, "r") as fin:
            for i, line in enumerate(fin):
                # Exactly eight tab-separated fields are expected per row.
                (qid, _Qid, wid, _rank, score, _runName,
                 wcontent, qcontent) = line.strip().split("\t")
                self.query[qid] = qcontent

                # First-pass selection: stop at the first low-scoring row.
                if float(score) < self.SCORE_CUTOFF:
                    if not self.cluster:
                        print("break out of 4.59,  %s  , empty cluster!"
                              % fileName)
                        # Same SystemExit as the site-provided exit(), but
                        # available even when the site module is not loaded.
                        sys.exit()
                    break

                self.tweet[wid] = wcontent
                self.curQid = qid

                similarity = self.klInstance.kl(self.query[qid], wcontent)
                jaccScore = self.jaccInstance.jaccardScore(qcontent, wcontent)

                if similarity <= self.sigma:
                    pairKey = qid + "-" + wid
                    self.qidWidKL[pairKey] = similarity
                    if similarity > self.qidWidMax:
                        self.qidWidMax = similarity
                    if similarity < self.qidWidMin:
                        self.qidWidMin = similarity

                    self.jacc[pairKey] = jaccScore
                    if self.jaccMax < jaccScore:
                        self.jaccMax = jaccScore
                    if self.jaccMin > jaccScore:
                        self.jaccMin = jaccScore

                    # The first accepted tweet seeds the first cluster.
                    if not self.cluster:
                        self.cluster.append([wid])
                        self.widWidKL[wid] = []
                    else:
                        self.__clustering(wid)

                if (i % 100) == 0:
                    print("%s  =>  %s" % (fileName, i))

    def _writeScoreLogs(self, log1, log2, log3):
        """Dump the min/max header and every recorded score to the logs."""
        log1.write(str(self.qidWidMin) + "\t" + str(self.qidWidMax) + "\n")
        for key in self.qidWidKL:
            log1.write(key + "\t" + str(self.qidWidKL[key]) + "\n")

        log2.write(str(self.widWidMin) + "\t" + str(self.widWidMax) + "\n")
        for key in self.widWidKL:
            for pair in self.widWidKL[key]:
                for widKey in pair:
                    log2.write(key + "-" + widKey + "\t" +
                               str(pair[widKey]) + "\n")

        log3.write(str(self.jaccMin) + "\t" + str(self.jaccMax) + "\n")
        for key in self.jacc:
            log3.write(key + "\t" + str(self.jacc[key]) + "\n")

    def _pickRepresentatives(self):
        """Return the highest-scoring tweet id of every cluster, in order.

        Ties go to the tweet that appears first in the cluster. A cluster
        whose scores are all <= 0 still yields a real tweet id rather than
        a -1 placeholder.
        """
        return [max(members, key=lambda wid: self.widScore[wid])
                for members in self.cluster]

    def __clustering(self, wid):
        """Assign ``wid`` to the closest existing cluster or start a new one.

        The tweet is compared with every already-clustered tweet; each score
        is recorded symmetrically in ``self.widWidKL`` and folded into the
        running ``widWidMin`` / ``widWidMax``.
        """
        minScore = 999
        index = -1
        for i, members in enumerate(self.cluster):
            for cwid in members:
                score = self.klInstance.kl(self.tweet[wid], self.tweet[cwid])
                self.widWidKL.setdefault(wid, []).append({cwid: score})
                self.widWidKL.setdefault(cwid, []).append({wid: score})

                # The smallest score marks the most similar tweet.
                if score < minScore:
                    minScore = score
                    index = i

                if score < self.widWidMin:
                    self.widWidMin = score
                if score > self.widWidMax:
                    self.widWidMax = score

        if minScore > self.lamda:
            # Nothing is close enough: open a new cluster.
            self.cluster.append([wid])
        else:
            # Join the cluster holding the most similar tweet.
            self.cluster[index].append(wid)