def clusterAllWithKMeans(lowerLimit, upperLimit, featMan, weightMatrix,
                         samePairsSet, differentPairsSet, outDir):
    """Cluster every key of featMan with K-means for a range of divisors.

    For each k in range(lowerLimit, upperLimit, 2) the requested number of
    clusters is len(data) // k (never less than 1), where data is
    featMan.returnKeys().  When a run yields clusters they are written to
    '<outDir>_<number of clusters>.txt', one non-empty cluster per line,
    followed by a 'NO CLUST' line holding the items that ended up in no
    cluster, and the predicted "same" pairs are scored against
    samePairsSet.

    Two values of k that lead to the same number of clusters write to the
    same file; the later run overwrites the earlier one.

    Args:
        lowerLimit: first divisor tried; a value of 0 raises
            ZeroDivisionError because k is used as a divisor.
        upperLimit: exclusive upper bound of the divisors.
        featMan: object providing returnKeys(); also forwarded to
            toString() and getPairLabelsFromClusters().
        weightMatrix: pairwise weights handed to KMeans.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused; kept so existing callers keep working.
        outDir: prefix of the output file names.

    Returns:
        dict mapping every k whose run produced clusters to the value
        returned by getSamePairPrecisionRecallF1Calculator().
    """
    metrics = {}
    print('Weight matrix length %s' % len(weightMatrix))

    data = featMan.returnKeys()
    for k in range(lowerLimit, upperLimit, 2):
        # Floor division keeps the cluster count an int on Python 2 and 3.
        numClusters = len(data) // k
        if numClusters == 0:
            numClusters = 1

        # The trailing 5 and 0.1 are passed through unchanged; presumably
        # an iteration limit and a convergence threshold -- TODO confirm
        # against the KMeans class.
        kmeans = KMeans(numClusters, data, weightMatrix, 5, 0.1)
        kmeans.cluster()
        clusters = kmeans.getClusters()
        noClus = kmeans.getTermInNoCluster()

        if not clusters:
            continue

        print('Found clusters length %s singaltons %s'
              % (len(clusters), len(noClus)))
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(clusters, featMan)

        fname = outDir + '_' + str(numClusters) + '.txt'
        # 'with' guarantees the file is closed even if toString() raises.
        with open(fname, 'w') as oFile:
            for entry in clusters:
                if len(entry) > 0:
                    oFile.write(toString(entry, featMan) + '\n')
            oFile.write('NO CLUST\t' + toString(noClus, featMan) + '\n')

        print('Same pair set %s' % len(predictedSamePairsSet))
        metrics[k] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithKMeans(lowerLimit, upperLimit, featMan,
                         weightMatrix, samePairsSet,
                         differentPairsSet, catQueryDist,
                         outFile='cat-clusters-with-mean.txt'):
    """Run K-means separately inside every category and score the result.

    For each termCount in range(lowerLimit, upperLimit) every category of
    catQueryDist holding more than one query is processed:

    * k = len(queries) // termCount (at least 1);
    * if k > 1 the category's queries are clustered with KMeans and each
      non-empty cluster is written to the output file; clusters with more
      than one member are kept for scoring;
    * otherwise the whole category is written and kept as one cluster.

    Output goes to '<outFile>_<termCount>.txt', one tab-separated
    "category<TAB>cluster" line per cluster.

    Args:
        lowerLimit: first termCount tried; 0 raises ZeroDivisionError
            because termCount is used as a divisor.
        upperLimit: exclusive upper bound for termCount.
        featMan: forwarded to toString() and getPairLabelsFromClusters().
        weightMatrix: pairwise weights handed to KMeans.
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused; kept so existing callers keep working.
        catQueryDist: dict mapping a category to a collection of query ids.
        outFile: prefix of the output file names.

    Returns:
        dict mapping termCount to the value returned by
        getSamePairPrecisionRecallF1Calculator().
    """
    metrics = {}
    for termCount in range(lowerLimit, upperLimit):
        processed = 1
        allCatClusters = []

        # 'with' closes the file even when clustering or toString() fails.
        with open(outFile + '_' + str(termCount) + '.txt', 'w') as oFile:
            for cat, qIdSet in catQueryDist.items():
                if len(qIdSet) <= 1:
                    continue

                k = len(qIdSet) // termCount
                if k == 0:
                    k = 1
                print('%s %s %s' % (cat, len(qIdSet), k))

                if k > 1:
                    kmeans = KMeans(k, list(qIdSet), weightMatrix, 5, 0.1)
                    kmeans.cluster()
                    clusters = kmeans.getClusters()
                    noClus = kmeans.getTermInNoCluster()
                    for entry in clusters:
                        # Singleton clusters are written out but cannot
                        # contribute a pair, so they are not scored.
                        if len(entry) > 1:
                            allCatClusters.append(entry)
                        if len(entry) > 0:
                            oFile.write(
                                cat + '\t' + toString(entry, featMan) + '\n')
                    oFile.write(cat + '\t' + 'NO CLUST\t' +
                                toString(noClus, featMan) + '\n')
                else:
                    # Too few queries to split: keep the category whole.
                    oFile.write(cat + '\t' + toString(qIdSet, featMan) + '\n')
                    allCatClusters.append(list(qIdSet))

                if processed % 50 == 0:
                    print(processed)
                processed += 1

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(allCatClusters, featMan)
        # 'COUNTS ' keeps its trailing blank so the printed line is unchanged.
        print('%s %s %s %s %s' % ('COUNTS ', termCount, len(allCatClusters),
                                  len(predictedSamePairsSet),
                                  len(catQueryDist)))
        metrics[termCount] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def printCategoryQueryDictionary(fileName, clusFile, weightFile):
    """Group queries by category and dump the groups plus pairwise weights.

    Features are read from fileName with a FeatureManager.  Every query is
    added to the set of each entry of its 'cat' feature.  Then:

    * clusFile receives one line per category: toString() of its queries;
    * weightFile receives one line per category with str() of the query
      set, an empty line, and finally one "id1 id2 score" line for every
      stored pair.

    A pair (a, b) with a < b in sort order is scored once per category in
    which both occur until it has been stored; it is stored only when its
    combined score exceeds 0.05.

    Args:
        fileName: feature file passed to FeatureManager.readFeatures().
        clusFile: path of the category/cluster output file.
        weightFile: path of the weight output file.
    """
    featMan = FeatureManager()
    featMan.readFeatures(fileName)

    categoryDictionary = {}
    for query, feat in featMan.iterFeatures():
        for entry in feat.returnFeature('cat'):
            if entry not in categoryDictionary:
                categoryDictionary[entry] = set()
            categoryDictionary[entry].add(query)

    # Both handles are managed by 'with' so they are closed on any error.
    with open(weightFile, 'w') as outW:
        with open(clusFile, 'w') as outC:
            for entry, qlist in categoryDictionary.items():
                outC.write(toString(qlist, featMan) + '\n')
                outW.write(str(qlist) + '\n')

        # Calculate the weight matrix: weightMatrix[a][b] = score, a < b.
        weightMatrix = {}
        cc = 0
        for entry, qlist in categoryDictionary.items():
            ordered = sorted(qlist)
            for i in range(len(ordered) - 1):
                first = ordered[i]
                _qid1, qf1 = featMan.returnFeature(first)
                if not qf1:
                    continue
                if first not in weightMatrix:
                    weightMatrix[first] = {}
                row = weightMatrix[first]
                for j in range(i + 1, len(ordered)):
                    second = ordered[j]
                    _qid2, qf2 = featMan.returnFeature(second)
                    if not qf2 or second in row:
                        continue
                    edgeScore = _categoryEdgeScore(qf1, qf2)
                    if edgeScore > 0.05:
                        row[second] = edgeScore
            # Progress indicator, counted in categories.
            if cc % 10000 == 0:
                print(cc)
            cc += 1

        outW.write('\n')
        for entry1, scoreList in weightMatrix.items():
            for entry2, score in scoreList.items():
                outW.write(str(entry1) + ' ' + str(entry2) + ' ' +
                           str(score) + '\n')


def _categoryEdgeScore(qf1, qf2):
    """Return the weighted mix of similarity scores between two features.

    The mean of the query cosine and the Jaccard value is weighted 0.25;
    the n-gram, url, user, entity and category cosines are weighted 0.15
    each (the weights add up to 1.0).
    """
    qcos, ucos, userCos, ngramCos, entCos, catCos = \
        qf1.findCosineDistance(qf2)
    qjac = qf1.findJacardDistance(qf2)
    return (.25 * ((qcos + qjac) / 2.0) +
            .15 * ngramCos + .15 * ucos +
            .15 * userCos + .15 * entCos + .15 * catCos)
def clusterCatWithMediods(lowerLimit, upperLimit, featMan, weightMatrix,
                          samePairsSet, differentPairsSet, catQueryDist,
                          outFile='cat-clusters-with-med.txt'):
    """Run k-medoids inside every category and score the resulting clusters.

    For each noTerms in range(lowerLimit, upperLimit) every category of
    catQueryDist with more than one query is clustered into
    k = len(queries) // noTerms (at least 1) groups with clust.kmedoids().
    Each group is written as "category<TAB>queries" to
    '<outFile><noTerms>.txt' and collected for scoring.

    Unlike earlier revisions this function no longer opens the bare
    outFile path: that handle was never written to or closed and only left
    an empty file behind.

    Args:
        lowerLimit: first noTerms tried; 0 raises ZeroDivisionError because
            noTerms is used as a divisor.
        upperLimit: exclusive upper bound for noTerms.
        featMan: forwarded to toString() and getPairLabelsFromClusters().
        weightMatrix: pairwise weights passed to getWeightMatrixForKMed().
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: unused; kept so existing callers keep working.
        catQueryDist: dict mapping a category to a collection of query ids.
        outFile: prefix of the output file names.

    Returns:
        dict mapping noTerms to the value returned by
        getSamePairPrecisionRecallF1Calculator().
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit):
        cluster_list = []
        processed = 0

        # 'with' closes the file even when clustering or toString() fails.
        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            for cat, qSet in catQueryDist.items():
                if len(qSet) <= 1:
                    continue

                k = len(qSet) // noTerms
                if k == 0:
                    k = 1

                qList = sorted(qSet, reverse=True)
                catDist = getWeightMatrixForKMed(qList, weightMatrix,
                                                 'cat_kmediods')
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)

                # NOTE(review): position 0 of clusArray is skipped and
                # position c is paired with qList[c - 1].  This presumably
                # matches a leading placeholder row produced by the
                # 'cat_kmediods' mode of getWeightMatrixForKMed -- confirm;
                # the sibling clusterCatWithMediodsAndNetwork pairs
                # position c with qList[c].
                clusters = {}
                for c in range(1, len(clusArray)):
                    clusId = clusArray[c]
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[c - 1])

                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    oFile.write(cat + '\t' + toString(entry, featMan) + '\n')

                print('Clust category %s length %s Queries %s k %s error %s %s'
                      % (cat, len(clusters), len(qSet), k, error, opt))
                if processed % 5 == 0:
                    print(processed)
                processed += 1

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        metrics[noTerms] = getSamePairPrecisionRecallF1Calculator(
            samePairsSet, predictedSamePairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
def clusterCatWithMediodsAndNetwork(threshold,
                                    lowerLimit, upperLimit, featMan,
                                    weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    """Run k-medoids inside every category and score with recall/precision.

    For each noTerms in range(lowerLimit, upperLimit, 2) every category of
    catQueryDist with more than one query is clustered into
    k = len(queries) // noTerms (at least 1) groups with clust.kmedoids().
    Each group is written as "category<TAB>queries" to
    '<outFile><noTerms>.txt' and collected for scoring.

    The original header comment describes a further step (find the
    outliers of each category and move them to the parent categories);
    that step is not implemented here and catNetwork is not used.

    Args:
        threshold: only used to build the metric key.
        lowerLimit: first noTerms tried; 0 raises ZeroDivisionError because
            noTerms is used as a divisor.
        upperLimit: exclusive upper bound for noTerms.
        featMan: forwarded to toString() and getPairLabelsFromClusters().
        weightMatrix: pairwise weights passed to getWeightMatrixForKMed().
        samePairsSet: reference "same" pairs used for scoring.
        differentPairsSet: reference "different" pairs used for scoring.
        catQueryDist: dict mapping a category to a collection of query ids.
        catNetwork: unused; kept so existing callers keep working.
        outFile: prefix of the output file names.

    Returns:
        dict mapping '<threshold>_<noTerms>' to the value returned by
        getRecallPrecision().
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []
        processed = 0

        # 'with' closes the file even when clustering or toString() fails.
        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            for cat, qSet in catQueryDist.items():
                if len(qSet) <= 1:
                    continue

                k = len(qSet) // noTerms
                if k == 0:
                    k = 1

                qList = list(qSet)
                catDist = getWeightMatrixForKMed(qList, weightMatrix)
                clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)

                # Position c of clusArray holds the cluster id of qList[c].
                clusters = {}
                for position, clusId in enumerate(clusArray):
                    if clusId not in clusters:
                        clusters[clusId] = set()
                    clusters[clusId].add(qList[position])

                for entry in clusters.values():
                    cluster_list.append(list(entry))
                    oFile.write(cat + '\t' + toString(entry, featMan) + '\n')

                # 'Clust ' keeps its trailing blank so the output is unchanged.
                print('%s %s %s %s %s'
                      % ('Clust ', cat, len(clusters), error, opt))
                if processed % 50 == 0:
                    print(processed)
                processed += 1

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet, differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
# Example #6
# 0
  def clusterWithNetwork(self, featMan, weightMatrix, cNetwork, cDist, clambda,
                         cdiff):
    """Cluster the queries of every category and write the clusters out.

    Every category of cDist with more than three queries is clustered with
    self.cluster().  The returned point -> cluster mapping is regrouped by
    cluster and each group whose toString() text is non-empty is written
    as "category<TAB>queries" to
    'DPClusters-cat-dist.combined_<clambda>.txt'.  Finally the histogram of
    cluster sizes is printed as (size, count) tuples, most frequent size
    first.

    Args:
      featMan: forwarded to toString().
      weightMatrix: pairwise weights forwarded to self.cluster().
      cNetwork: unused; kept so existing callers keep working.
      cDist: dict mapping a category to its collection of queries.
      clambda: forwarded to self.cluster(); also part of the file name.
      cdiff: forwarded to self.cluster().
    """
    clusterPointDist = {}  # cluster size -> number of clusters of that size
    cati = 1
    # 'with' closes the file even when clustering or toString() fails.
    with open('DPClusters-cat-dist.combined_' + str(clambda) + '.txt',
              'w') as oFile:
      for cat, queryList in cDist.items():
        if len(queryList) > 3:
          cluster, mean = self.cluster(queryList, weightMatrix, clambda, cdiff)
          print('Cat %s %s %s' % (cat, len(cluster), len(queryList)))

          # Invert point -> cluster id into cluster id -> list of points.
          toWriteClus = {}
          for d, c in cluster.items():
            if c not in toWriteClus:
              toWriteClus[c] = []
            toWriteClus[c].append(d)

          for c, entry in toWriteClus.items():
            le = len(entry)
            if le > 0:
              cStr = toString(entry, featMan).strip()
              if len(cStr) > 0:
                oFile.write(cat + '\t' + cStr + '\n')
            if le not in clusterPointDist:
              clusterPointDist[le] = 0
            clusterPointDist[le] += 1

        # Progress indicator; counts every category, clustered or not.
        cati += 1
        if cati % 5000 == 0:
          print(cati)

    for entry in sorted(clusterPointDist.items(),
                        reverse=True,
                        key=lambda x: x[1]):
      print(entry)
def clusterAllWithKMediods(lowerLimit, upperLimit,
                           featMan, weightMatrix,
                           allTaskDict,
                           samePairsSet,
                           differentPairsSet, outDir):
    """Cluster every key of featMan with k-medoids for a range of k.

    A distance structure is built once with getWeightMatrixForKMed() from
    featMan.returnKeys().  For each k in range(lowerLimit, upperLimit, 3)
    clust.kmedoids() is run with k clusters (1 when k is 0), the clusters
    are written one per line to '<outDir>_<number of clusters>.txt', and
    the clustering is scored with getSetBasedLabelsAndMetric().

    Cluster members are the positions in the array returned by kmedoids();
    they are handed to toString() as they are, i.e. positions are assumed
    to coincide with the query ids -- TODO confirm against
    getWeightMatrixForKMed's 'kmediods' mode.

    Args:
        lowerLimit: first k tried.
        upperLimit: exclusive upper bound for k.
        featMan: provides returnKeys(); also forwarded to toString(),
            getPairLabelsFromClusters() and getSetBasedLabelsAndMetric().
        weightMatrix: pairwise weights passed to getWeightMatrixForKMed().
        allTaskDict: forwarded to getSetBasedLabelsAndMetric().
        samePairsSet: unused; kept so existing callers keep working.
        differentPairsSet: unused; kept so existing callers keep working.
        outDir: prefix of the output file names.

    Returns:
        dict mapping k to the value returned by
        getSetBasedLabelsAndMetric().
    """
    data = featMan.returnKeys()
    weightList = getWeightMatrixForKMed(data, weightMatrix, 'kmediods')
    print(len(weightList))
    metrics = {}

    for k in range(lowerLimit, upperLimit, 3):
        # The literal keeps its trailing blank so the output is unchanged.
        print('%s %s' % ('Clustering with terms ', k))
        cluster_list = []
        numClusters = k
        if numClusters == 0:
            numClusters = 1

        clusArray, error, opt = clust.kmedoids(weightList, numClusters,
                                               10, None)

        # Group positions by cluster id.  Adding an int to a set cannot
        # fail, so the former bare 'except' (which would also have
        # swallowed KeyboardInterrupt) is gone.
        clusters = {}
        for position, clusId in enumerate(clusArray):
            if clusId not in clusters:
                clusters[clusId] = set()
            clusters[clusId].add(position)
        print('%s %s %s'
              % ('Error and cluster length ', error, len(clusters)))

        fname = outDir + '_' + str(numClusters) + '.txt'
        # 'with' guarantees the file is closed even if toString() raises.
        with open(fname, 'w') as oFile:
            for entry in clusters.values():
                cluster_list.append(list(entry))
                oFile.write(toString(entry, featMan) + '\n')

        # NOTE(review): the predicted pair sets are not used by the metric
        # below; the call is kept because it cannot be verified from here
        # that getPairLabelsFromClusters() has no side effects.
        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        metrics[k] = getSetBasedLabelsAndMetric(cluster_list,
                                                allTaskDict, featMan)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))

    return metrics