def clusterAllWithKMeans(lowerLimit, upperLimit, featMan, weightMatrix,
                         samePairsSet, differentPairsSet, outDir):
    """Cluster every key of ``featMan`` with KMeans for several cluster counts.

    For each ``k`` in ``range(lowerLimit, upperLimit, 2)`` the requested
    number of clusters is ``len(data) // k`` (forced to 1 when that is 0).
    When a run yields clusters they are written to
    ``outDir + '_' + str(i) + '.txt'`` (one line per non-empty cluster,
    followed by a ``NO CLUST`` line) and scored against ``samePairsSet``
    with ``getSamePairPrecisionRecallF1Calculator``.

    Args:
        lowerLimit: first ``k`` tried.
        upperLimit: exclusive upper bound for ``k`` (step is 2).
        featMan: feature manager; supplies ``returnKeys()`` and is passed
            to ``toString`` / ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``; only its length is
            inspected here.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused by the live code; kept for interface
            compatibility.
        outDir: prefix of the per-run output file name.

    Returns:
        dict mapping each ``k`` whose run produced clusters to its metric.

    Raises:
        ZeroDivisionError: if the ``k`` range contains 0.
    """
    metrics = {}
    print('%s %s' % ('Weight matrix length', len(weightMatrix)))
    data = featMan.returnKeys()

    for k in range(lowerLimit, upperLimit, 2):
        # Floor division keeps the cluster count an int on Python 2 and 3.
        i = len(data) // k
        if i == 0:
            i = 1

        # NOTE(review): the meaning of the constants 5 and 0.1 is defined
        # by KMeans, which is not visible here.
        kmeans = KMeans(i, data, weightMatrix, 5, 0.1)
        kmeans.cluster()
        clusters = kmeans.getClusters()
        noClus = kmeans.getTermInNoCluster()
        if not clusters:
            continue

        print('%s %s %s %s' % ('Found clusters length', len(clusters),
                               'singaltons', len(noClus)))
        predictedSamePairsSet, _ = getPairLabelsFromClusters(clusters,
                                                             featMan)

        # The file name is keyed by the cluster count ``i``, not by ``k``,
        # so two ``k`` values yielding the same ``i`` overwrite each other.
        fname = outDir + '_' + str(i) + '.txt'
        with open(fname, 'w') as oFile:
            for entry in clusters:
                if len(entry) > 0:
                    oFile.write(toString(entry, featMan) + '\n')
            oFile.write('NO CLUST\t' + toString(noClus, featMan) + '\n')

        print('%s %s' % ('Same pair set', len(predictedSamePairsSet)))
        metrics[k] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithKMeans(lowerLimit, upperLimit, featMan,
                         weightMatrix, samePairsSet,
                         differentPairsSet, catQueryDist,
                         outFile='cat-clusters-with-mean.txt'):
    """Cluster the queries of each category with KMeans and score the result.

    For each ``termCount`` in ``range(lowerLimit, upperLimit)`` every
    category holding more than one query is handled as follows:
    ``k = len(qIdSet) // termCount`` (forced to 1 when 0). When ``k > 1``
    the category is split by ``KMeans``; otherwise the whole category is
    kept as one cluster. Clusters are written to
    ``outFile + '_' + str(termCount) + '.txt'`` and the collected clusters
    are scored against ``samePairsSet``.

    Args:
        lowerLimit: first ``termCount`` tried.
        upperLimit: exclusive upper bound for ``termCount``.
        featMan: feature manager passed to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: passed through to ``KMeans``.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused by the live code; kept for interface
            compatibility.
        catQueryDist: mapping of category -> collection of query ids.
        outFile: prefix of the per-run output file name.

    Returns:
        dict mapping each ``termCount`` to its metric.

    Raises:
        ZeroDivisionError: if the ``termCount`` range contains 0 and some
            category has more than one query.
    """
    metrics = {}
    for termCount in range(lowerLimit, upperLimit):
        allCatClusters = []
        fname = outFile + '_' + str(termCount) + '.txt'

        with open(fname, 'w') as oFile:
            # 1-based progress counter over all categories.
            for i, (cat, qIdSet) in enumerate(catQueryDist.items(), 1):
                if len(qIdSet) > 1:
                    k = len(qIdSet) // termCount
                    if k == 0:
                        k = 1
                    print('%s %s %s' % (cat, len(qIdSet), k))

                    if k > 1:
                        kmeans = KMeans(k, list(qIdSet), weightMatrix,
                                        5, 0.1)
                        kmeans.cluster()
                        clusters = kmeans.getClusters()
                        noClus = kmeans.getTermInNoCluster()
                        for entry in clusters:
                            # Only multi-member clusters are scored, but
                            # every non-empty cluster is written out.
                            if len(entry) > 1:
                                allCatClusters.append(entry)
                            if len(entry) > 0:
                                cStr = toString(entry, featMan)
                                oFile.write(cat + '\t' + cStr + '\n')
                        oFile.write(cat + '\t' + 'NO CLUST\t' +
                                    toString(noClus, featMan) + '\n')
                    else:
                        # Too few queries to split: the category is one
                        # cluster.
                        cStr = toString(qIdSet, featMan)
                        oFile.write(cat + '\t' + cStr + '\n')
                        allCatClusters.append(list(qIdSet))

                if i % 50 == 0:
                    print(i)

        predictedSamePairsSet, _ = getPairLabelsFromClusters(allCatClusters,
                                                             featMan)
        print('%s %s %s %s %s' % ('COUNTS ', termCount, len(allCatClusters),
                                  len(predictedSamePairsSet),
                                  len(catQueryDist)))
        metrics[termCount] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def _categoryPairWeights(categoryDictionary, featMan):
    """Return ``{id1: {id2: score}}`` for query pairs sharing a category."""
    weightMatrix = {}
    cc = 0
    for qlist in categoryDictionary.values():
        ordered = sorted(qlist)
        for i in range(len(ordered) - 1):
            first = ordered[i]
            _, qf1 = featMan.returnFeature(first)
            if not qf1:
                continue
            row = weightMatrix.setdefault(first, {})

            for second in ordered[i + 1:]:
                _, qf2 = featMan.returnFeature(second)
                # Skip pairs already scored (above threshold) under
                # another category.
                if qf2 and second not in row:
                    (qcos, ucos, userCos,
                     ngramCos, entCos, catCos) = qf1.findCosineDistance(qf2)
                    qjac = qf1.findJacardDistance(qf2)
                    edgeScore = (.25 * ((qcos + qjac) / 2.0) +
                                 .15 * ngramCos + .15 * ucos +
                                 .15 * userCos + .15 * entCos + .15 * catCos)
                    # Only edges above the 0.05 threshold are kept.
                    if edgeScore > 0.05:
                        row[second] = edgeScore

                # NOTE(review): the original indentation was lost; the
                # progress counter is assumed to tick once per candidate
                # pair. It only affects progress output - confirm.
                if cc % 10000 == 0:
                    print(cc)
                cc += 1
    return weightMatrix


def printCategoryQueryDictionary(fileName, clusFile, weightFile):
    """Dump per-category query groups and their pairwise weights to files.

    Features are read from ``fileName`` through ``FeatureManager``. Queries
    are grouped by each entry of their ``'cat'`` feature. ``clusFile``
    receives one ``toString`` line per category. ``weightFile`` receives
    the ``str()`` of each category's query set, a blank line, and then one
    ``"id1 id2 score"`` line per stored edge.

    Args:
        fileName: feature file handed to ``FeatureManager.readFeatures``.
        clusFile: output path for the per-category query listing.
        weightFile: output path for the query sets and edge weights.
    """
    featMan = FeatureManager()
    featMan.readFeatures(fileName)

    categoryDictionary = {}
    for query, feat in featMan.iterFeatures():
        for entry in feat.returnFeature('cat'):
            categoryDictionary.setdefault(entry, set()).add(query)

    with open(weightFile, 'w') as outW:
        with open(clusFile, 'w') as outC:
            for qlist in categoryDictionary.values():
                outC.write(toString(qlist, featMan) + '\n')
                outW.write(str(qlist) + '\n')

        weightMatrix = _categoryPairWeights(categoryDictionary, featMan)

        outW.write('\n')
        for entry1, scoreList in weightMatrix.items():
            for entry2, score in scoreList.items():
                outW.write(str(entry1) + ' ' + str(entry2) + ' ' +
                           str(score) + '\n')
def clusterCatWithMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                          samePairsSet, differentPairsSet, catQueryDist,
                          outFile='cat-clusters-with-med.txt'):
    """Cluster the queries of each category with k-medoids and score them.

    For each ``noTerms`` in ``range(lowerLimit, upperLimit)`` every category
    with more than one query is clustered via ``clust.kmedoids`` into
    ``len(qSet) // noTerms`` clusters (forced to 1 when 0). Clusters are
    written to ``outFile + str(noTerms) + '.txt'`` and scored against
    ``samePairsSet``.

    Unlike the previous version, the bare ``outFile`` path is no longer
    opened: that handle was never written to or closed and only left an
    empty file behind.

    Args:
        lowerLimit: first ``noTerms`` tried.
        upperLimit: exclusive upper bound for ``noTerms``.
        featMan: feature manager passed to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: pairwise weights handed to ``getWeightMatrixForKMed``.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused by the live code; kept for interface
            compatibility.
        catQueryDist: mapping of category -> collection of query ids.
        outFile: prefix of the per-run output file name.

    Returns:
        dict mapping each ``noTerms`` to its metric.

    Raises:
        ZeroDivisionError: if the ``noTerms`` range contains 0 and some
            category has more than one query.
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit):
        cluster_list = []

        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            # 0-based progress counter over all categories.
            for i, (cat, qSet) in enumerate(catQueryDist.items()):
                if len(qSet) > 1:
                    k = len(qSet) // noTerms
                    if k == 0:
                        k = 1

                    qList = sorted(qSet, reverse=True)
                    catDist = getWeightMatrixForKMed(qList, weightMatrix,
                                                     'cat_kmediods')
                    clusArray, error, opt = clust.kmedoids(catDist, k,
                                                           5, None)

                    # clusArray[c] is paired with qList[c - 1]; index 0 of
                    # clusArray is skipped.
                    # NOTE(review): clusterCatWithMediodsAndNetwork pairs
                    # clusArray[c] with qList[c] - confirm that the
                    # 'cat_kmediods' matrix really carries a leading extra
                    # row, otherwise this is off by one.
                    clusters = {}
                    for c in range(1, len(clusArray)):
                        clusters.setdefault(clusArray[c],
                                            set()).add(qList[c - 1])

                    for entry in clusters.values():
                        cluster_list.append(list(entry))
                        qStr = toString(entry, featMan)
                        oFile.write(cat + '\t' + qStr + '\n')

                    print('%s %s %s %s %s %s %s %s %s %s %s' % (
                        'Clust category', cat, 'length', len(clusters),
                        'Queries', len(qSet), 'k', k, 'error', error, opt))

                if i % 5 == 0:
                    print(i)

        predictedSamePairsSet, _ = getPairLabelsFromClusters(cluster_list,
                                                             featMan)
        metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithMediodsAndNetwork(threshold,
                                    lowerLimit, upperLimit, featMan,
                                    weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    """Cluster each category with k-medoids and score with recall/precision.

    For each ``noTerms`` in ``range(lowerLimit, upperLimit, 2)`` every
    category with more than one query is clustered via ``clust.kmedoids``
    into ``len(qSet) // noTerms`` clusters (forced to 1 when 0). Clusters
    are written to ``outFile + str(noTerms) + '.txt'`` and scored with
    ``getRecallPrecision``.

    The original comments describe moving outliers to parent categories;
    that step is not implemented here and ``catNetwork`` is unused.

    Args:
        threshold: only used to build the metrics key.
        lowerLimit: first ``noTerms`` tried.
        upperLimit: exclusive upper bound for ``noTerms`` (step is 2).
        featMan: feature manager passed to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: pairwise weights handed to ``getWeightMatrixForKMed``.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: reference "different" pairs used for scoring.
        catQueryDist: mapping of category -> collection of query ids.
        catNetwork: unused; kept for interface compatibility.
        outFile: prefix of the per-run output file name.

    Returns:
        dict mapping ``str(threshold) + '_' + str(noTerms)`` to the metric.

    Raises:
        ZeroDivisionError: if the ``noTerms`` range contains 0 and some
            category has more than one query.
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []

        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            # 0-based progress counter over all categories.
            for i, (cat, qSet) in enumerate(catQueryDist.items()):
                if len(qSet) > 1:
                    k = len(qSet) // noTerms
                    if k == 0:
                        k = 1

                    qList = list(qSet)
                    catDist = getWeightMatrixForKMed(qList, weightMatrix)
                    clusArray, error, opt = clust.kmedoids(catDist, k,
                                                           5, None)

                    # Group query ids by the cluster id assigned at the
                    # same position.
                    clusters = {}
                    for c, clusId in enumerate(clusArray):
                        clusters.setdefault(clusId, set()).add(qList[c])

                    for entry in clusters.values():
                        cluster_list.append(list(entry))
                        qStr = toString(entry, featMan)
                        oFile.write(cat + '\t' + qStr + '\n')

                    print('%s %s %s %s %s' % ('Clust ', cat, len(clusters),
                                              error, opt))

                if i % 50 == 0:
                    print(i)

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)

        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterWithNetwork(self, featMan, weightMatrix, cNetwork, cDist, clambda, cdiff):
    """Cluster each category's queries with ``self.cluster`` and dump them.

    Categories with more than three queries are clustered. Each resulting
    cluster is written as ``cat<TAB>queries`` to
    ``'DPClusters-cat-dist.combined_' + str(clambda) + '.txt'`` when its
    ``toString`` form is non-empty after stripping. Finally a histogram of
    cluster sizes is printed as ``(size, count)`` tuples, most frequent
    first.

    Args:
        featMan: feature manager passed to ``toString``.
        weightMatrix: passed through to ``self.cluster``.
        cNetwork: unused; kept for interface compatibility.
        cDist: mapping of category -> list of queries.
        clambda: passed to ``self.cluster``; also part of the file name.
        cdiff: passed through to ``self.cluster``.
    """
    clusterPointDist = {}
    cati = 1
    fname = 'DPClusters-cat-dist.combined_' + str(clambda) + '.txt'

    with open(fname, 'w') as oFile:
        for cat, queryList in cDist.items():
            if len(queryList) > 3:
                cluster, mean = self.cluster(queryList, weightMatrix,
                                             clambda, cdiff)
                print('%s %s %s %s' % ('Cat', cat, len(cluster),
                                       len(queryList)))

                # ``cluster`` maps a point to its cluster id; invert it.
                toWriteClus = {}
                for d, c in cluster.items():
                    toWriteClus.setdefault(c, []).append(d)

                for entry in toWriteClus.values():
                    le = len(entry)
                    if le > 0:
                        cStr = toString(entry, featMan).strip()
                        if len(cStr) > 0:
                            oFile.write(cat + '\t' + cStr + '\n')
                    # NOTE(review): the original indentation was lost; the
                    # size histogram is assumed to count every cluster,
                    # including ones whose string form is empty - confirm.
                    clusterPointDist[le] = clusterPointDist.get(le, 0) + 1

            # NOTE(review): assumed to tick once per category (indentation
            # lost); only affects progress output.
            cati += 1
            if cati % 5000 == 0:
                print(cati)

    for entry in sorted(clusterPointDist.items(), reverse=True,
                        key=lambda x: x[1]):
        print(entry)
def clusterAllWithKMediods(lowerLimit, upperLimit,
                           featMan, weightMatrix,
                           allTaskDict, samePairsSet,
                           differentPairsSet, outDir):
    """Cluster all keys of ``featMan`` with k-medoids for several ``k``.

    A distance structure is built once with ``getWeightMatrixForKMed``.
    For each ``k`` in ``range(lowerLimit, upperLimit, 3)``,
    ``clust.kmedoids`` is asked for ``k`` clusters (1 when ``k`` is 0).
    Positions of the returned assignment array are grouped by cluster id,
    written to ``outDir + '_' + str(k) + '.txt'`` and scored with
    ``getSetBasedLabelsAndMetric`` against ``allTaskDict``.

    NOTE(review): the original parameter list contained an inline
    ``#weightFile,`` comment and its line breaks were lost. ``allTaskDict``
    and ``samePairsSet`` are kept as parameters because the body uses
    ``allTaskDict`` - confirm against callers.

    Args:
        lowerLimit: first ``k`` tried.
        upperLimit: exclusive upper bound for ``k`` (step is 3).
        featMan: feature manager; supplies ``returnKeys()`` and is passed
            to ``toString`` / ``getSetBasedLabelsAndMetric``.
        weightMatrix: pairwise weights handed to ``getWeightMatrixForKMed``.
        allTaskDict: reference labelling passed to
            ``getSetBasedLabelsAndMetric``.
        samePairsSet: unused by the live code.
        differentPairsSet: unused by the live code.
        outDir: prefix of the per-run output file name.

    Returns:
        dict mapping each ``k`` to its metric.
    """
    data = featMan.returnKeys()
    weightList = getWeightMatrixForKMed(data, weightMatrix, 'kmediods')
    print(len(weightList))

    metrics = {}
    for k in range(lowerLimit, upperLimit, 3):
        print('%s %s' % ('Clustering with terms ', k))
        cluster_list = []

        i = k
        if i == 0:
            i = 1
        clusArray, error, opt = clust.kmedoids(weightList, i, 10, None)

        # Group positions of the assignment array by cluster id.
        # presumably a position doubles as the query id expected by
        # toString - verify against getWeightMatrixForKMed.
        clusters = {}
        for c, clusId in enumerate(clusArray):
            clusters.setdefault(clusId, set()).add(c)

        print('%s %s %s' % ('Error and cluster length ', error,
                            len(clusters)))

        fname = outDir + '_' + str(i) + '.txt'
        with open(fname, 'w') as oFile:
            for entry in clusters.values():
                cluster_list.append(list(entry))
                qStr = toString(entry, featMan)
                oFile.write(qStr + '\n')

        # The pair labels are not used for the metric below, but the call
        # is kept so behaviour (including any errors it raises) is
        # unchanged.
        getPairLabelsFromClusters(cluster_list, featMan)

        metrics[k] = getSetBasedLabelsAndMetric(cluster_list,
                                                allTaskDict, featMan)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics