def clusterCatWithMediodsAndNetwork(threshold,
                                    lowerLimit, upperLimit, featMan,
                                    weightMatrix, samePairsSet,
                                    differentPairsSet, catQueryDist,
                                    catNetwork,
                                    outFile='cat-clusters-with-med.txt'):
    """Cluster the queries of each category with k-medoids and score the result.

    For every ``noTerms`` in ``range(lowerLimit, upperLimit, 2)``:

    * each category in ``catQueryDist`` that holds more than one query is
      split into ``len(queries) // noTerms`` clusters (at least 1) using
      ``clust.kmedoids`` on the matrix returned by
      ``getWeightMatrixForKMed``;
    * every resulting cluster is written as ``<cat>\\t<toString(cluster)>``
      to the file ``outFile + str(noTerms) + '.txt'``;
    * all clusters are turned into pair labels by
      ``getPairLabelsFromClusters`` and scored by ``getRecallPrecision``.

    Categories with zero or one query are skipped and contribute no cluster.
    The "find outliers / move them to parents" step mentioned in the original
    comments is not implemented here.

    Args:
        threshold: Only used to build the metrics key.
        lowerLimit: First ``noTerms`` value tried.
        upperLimit: Exclusive upper bound for ``noTerms`` (step is 2).
        featMan: Passed through to ``toString`` and
            ``getPairLabelsFromClusters``.
        weightMatrix: Passed through to ``getWeightMatrixForKMed``.
        samePairsSet: Passed to ``getRecallPrecision`` (presumably the gold
            "same task" pairs; verify against that helper).
        differentPairsSet: Passed to ``getRecallPrecision`` (presumably the
            gold "different task" pairs; verify against that helper).
        catQueryDist: Mapping of category name (a string) to a sized
            collection of queries.
        catNetwork: Unused by this function; kept for interface
            compatibility.
        outFile: Prefix of the per-``noTerms`` output file name.

    Returns:
        dict mapping ``'<threshold>_<noTerms>'`` to the value returned by
        ``getRecallPrecision`` for that ``noTerms``. Empty when the range
        ``lowerLimit..upperLimit`` is empty.

    Raises:
        ZeroDivisionError: if ``noTerms`` is 0 and some category has more
            than one query.
    """
    metrics = {}
    for noTerms in range(lowerLimit, upperLimit, 2):
        cluster_list = []
        # ``with`` guarantees the output file is closed even when clustering,
        # formatting or writing raises part-way through a run.
        with open(outFile + str(noTerms) + '.txt', 'w') as oFile:
            # NOTE(review): the progress counter advances for every category,
            # including skipped ones -- confirm against the intended nesting.
            for i, (cat, qSet) in enumerate(catQueryDist.items()):
                if len(qSet) > 1:
                    # Explicit floor division keeps k an integer on both
                    # Python 2 and Python 3; fall back to a single cluster
                    # when the category has fewer than noTerms queries.
                    k = len(qSet) // noTerms or 1
                    qList = list(qSet)
                    catDist = getWeightMatrixForKMed(qList, weightMatrix)
                    # presumably a Pycluster-style call (npass=5,
                    # initialid=None) -- TODO confirm against ``clust``.
                    clusArray, error, opt = clust.kmedoids(catDist, k, 5, None)

                    # clusArray[c] is the cluster id assigned to qList[c].
                    clusters = {}
                    for c, clusId in enumerate(clusArray):
                        clusters.setdefault(clusId, set()).add(qList[c])

                    for entry in clusters.values():
                        cluster_list.append(list(entry))
                        qStr = toString(entry, featMan)
                        oFile.write(cat + '\t' + qStr + '\n')
                    # Single-argument print emits the same text as the former
                    # print statement and also parses on Python 3.
                    print('Clust  %s %s %s %s'
                          % (cat, len(clusters), error, opt))
                if i % 50 == 0:
                    print(i)

        predictedSamePairsSet, predictedDifferentPairsSet = \
            getPairLabelsFromClusters(cluster_list, featMan)
        key = str(threshold) + '_' + str(noTerms)
        metrics[key] = getRecallPrecision(samePairsSet,
                                          differentPairsSet,
                                          predictedSamePairsSet,
                                          predictedDifferentPairsSet)

    for tcount, met in metrics.items():
        print('%s %s' % (tcount, met))
    return metrics
else: print 'Query feature error ', session[i] sessCount += 1 labels = qcc.getTaskComponents() fname = args.outDir + '_'+args.algo+'_' + str(threshold) + '.txt' outFile = open(fname, 'w') for entry in labels: string = '' for qid in entry: string += featMan.returnQuery(qid) + '\t' outFile.write(string.strip() + '\n') outFile.close() predicted_same_pairs, predicted_different_pairs=\ getPairLabelsFromClusters(labels,featMan) metrics[threshold] = getRecallPrecision(samePairsSet, differentPairsSet, predicted_same_pairs, predicted_different_pairs) for tcount, met in metrics.items(): print tcount, met mergeMetrics(total_metrics_dict, metrics) computeAverageAndVarianceOfMetrics(args.algo, total_metrics_dict) #qcos, ucos, userCos, sessionCos, ngramCos, entCos, \ #catCos,typeCos = qf1.findCosineDistance(qf2) #qjac = qf1.findJacardDistance(qf2) ##qedit = qf1.findEditDistance(qf2) ##normalized distance ##dist = (j - i)#*1.0/len(session) ##oFile.write(str(qid1)+'\t'+str(qid2)+'\t'+\ ##str(round(qcos,2))+'\t'+str(round(qjac,2))+'\t'+\ ##str(round(ngramCos,2))+'\t'+str(round(userCos,2))+'\t' + \ ##str(round(entCos,2))+'\t'+ str(round(catCos,2))+\