def getEvaluationMetrics(self, documentClusters, timeDifference):
    """Summarize one clustering iteration as a metrics dict.

    Records corpus size (`self.length`), cluster count, elapsed time, the raw
    clusters, and three quality scores (NMI, purity, F1) computed against
    expert class labels obtained via `self._getExpertClasses`.
    """
    metrics = {
        'no_of_documents': self.length,
        'no_of_clusters': len(documentClusters),
        'iteration_time': timeDifference,
        'clusters': documentClusters,
    }
    # Map every cluster to its expert class labels once, then score it
    # under each evaluation measure.
    labeledClusters = [self._getExpertClasses(cluster) for cluster in documentClusters]
    for key, measure in (('nmi', EvaluationMetrics.nmi),
                         ('purity', EvaluationMetrics.purity),
                         ('f1', EvaluationMetrics.f1)):
        metrics[key] = EvaluationMetrics.getValueForClusters(labeledClusters, measure)
    return metrics
def getEvaluationMetrics(self, documentClusters, timeDifference, iteration_parameters):
    """Summarize one clustering iteration as a metrics dict.

    Stores the caller-supplied `iteration_parameters`, the cluster count,
    elapsed time, and the raw clusters, then adds NMI, purity, and F1 scores
    computed against expert class labels from `self._getExpertClasses`.
    """
    metrics = {
        'iteration_parameters': iteration_parameters,
        'no_of_clusters': len(documentClusters),
        'iteration_time': timeDifference,
        'clusters': documentClusters,
    }
    # Resolve expert class labels once per cluster, then score with each measure.
    labeledClusters = [self._getExpertClasses(cluster) for cluster in documentClusters]
    metrics['nmi'] = EvaluationMetrics.getValueForClusters(labeledClusters, EvaluationMetrics.nmi)
    metrics['purity'] = EvaluationMetrics.getValueForClusters(labeledClusters, EvaluationMetrics.purity)
    metrics['f1'] = EvaluationMetrics.getValueForClusters(labeledClusters, EvaluationMetrics.f1)
    return metrics
def getEvaluationMetrics(self, documentClusters, timeDifference, iteration_parameters):
    """Build the per-iteration evaluation record for a set of document clusters.

    The returned dict carries the iteration parameters, cluster count, timing,
    the clusters themselves, and NMI / purity / F1 quality scores.
    """
    # Expert class labels per cluster, computed once and reused for every score.
    expertLabelSets = [self._getExpertClasses(cluster) for cluster in documentClusters]

    def score(measure):
        return EvaluationMetrics.getValueForClusters(expertLabelSets, measure)

    iterationData = {
        "iteration_parameters": iteration_parameters,
        "no_of_clusters": len(documentClusters),
        "iteration_time": timeDifference,
        "clusters": documentClusters,
    }
    iterationData["nmi"] = score(EvaluationMetrics.nmi)
    iterationData["purity"] = score(EvaluationMetrics.purity)
    iterationData["f1"] = score(EvaluationMetrics.f1)
    return iterationData
def getEvaluationMetrics(noOfTweets, documentClusters, timeDifference):
    """Build the evaluation record for one clustering pass over tweets.

    Captures tweet count, cluster count, elapsed time, and the clusters, then
    scores the clustering (NMI, purity, F1) against expert classes resolved
    by `Evaluation.getExpertClasses`.
    """
    expertLabelSets = [Evaluation.getExpertClasses(cluster) for cluster in documentClusters]
    iterationData = {
        'no_of_tweets': noOfTweets,
        'no_of_clusters': len(documentClusters),
        'iteration_time': timeDifference,
        'clusters': documentClusters,
    }
    # Score the same label sets under each of the three quality measures.
    for name, measure in (('nmi', EvaluationMetrics.nmi),
                          ('purity', EvaluationMetrics.purity),
                          ('f1', EvaluationMetrics.f1)):
        iterationData[name] = EvaluationMetrics.getValueForClusters(expertLabelSets, measure)
    return iterationData
def getCrowdQuality(self, evaluationMetric, expertsToClassMap):
    """Score the current clusters under `evaluationMetric`.

    Each cluster is reduced to the expert classes of its member users
    (lower-cased lookup in `expertsToClassMap`); users without an expert
    class are silently skipped.
    """
    def classLabels(cluster):
        labels = []
        for user in cluster.documentsInCluster:
            key = user.lower()
            if key in expertsToClassMap:
                labels.append(expertsToClassMap[key])
        return labels

    labelSets = [classLabels(cluster) for cluster in self.clusters.itervalues()]
    return EvaluationMetrics.getValueForClusters(labelSets, evaluationMetric)
def getCrowdQuality(self, evaluationMetric, expertsToClassMap):
    """Evaluate the quality of `self.clusters` with the given metric.

    A cluster's label set is the expert class of every member user that
    appears (lower-cased) in `expertsToClassMap`; unknown users are dropped.
    """
    clusterLabelSets = []
    for cluster in self.clusters.itervalues():
        clusterLabelSets.append([
            expertsToClassMap[user.lower()]
            for user in cluster.documentsInCluster
            if user.lower() in expertsToClassMap
        ])
    return EvaluationMetrics.getValueForClusters(clusterLabelSets, evaluationMetric)
def streamingLSHClusteringDemo():
    """Demo: cluster a document stream with streaming LSH and report purity.

    Reads lines from '../data/streaming.dat', feeds each document into the
    streaming clusterer, and scores the resulting clusters against each
    document's original cluster id (the label carried on the input line).
    """
    clustering = StreamingLSHClustering(**{
        'dimensions': 53,
        'signature_length': 13,
        'number_of_permutations': 5,
        'threshold_for_document_to_be_in_cluster': 0.2,
    })
    # Remember every document's gold cluster id so clusters can be scored later.
    docsToOriginalClusterMap = {}
    for docId, line in enumerate(FileIO.iterateLinesFromFile('../data/streaming.dat')):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        clustering.getClusterAndUpdateExistingClusters(document)
    # One label list per discovered cluster: the gold ids of its members.
    clusterLabels = [
        [docsToOriginalClusterMap[doc.docId] for doc in cluster.iterateDocumentsInCluster()]
        for cluster in clustering.clusters.itervalues()
    ]
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
def offlineLSHClusteringDemo():
    """Demo: train an offline LSH model on labeled documents, then score a test set.

    Training documents from '../data/train_offline.dat' are grouped by their
    gold cluster id; each cluster is represented by the mean vector of its
    members, signed with permuted random unit vectors and indexed in the
    signature tries. Test documents from '../data/test_offline.dat' are then
    assigned to the candidate cluster with the highest cosine similarity, and
    purity of (predicted, gold) labels is returned.
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        # Bag-of-words vector; the first token on the line is the gold cluster id.
        tokens = line.split()
        vector = Vector()
        for word in tokens[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            dimension = wordToDimensionMap[word]
            if dimension in vector:
                vector[dimension] += 1
            else:
                vector[dimension] = 1
        return Document(docId, vector, clusterId=tokens[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [SignaturePermutationWithTrie(signatureLength)
                             for _ in range(numberOfPermutations)]
    permutatedUnitVectors = [unitVector.getPermutedVector(r) for r in vectorPermutations]

    # --- Build the LSH model. ---
    # Read training documents.
    traningDocumentsMap = {}
    for docId, l in enumerate(FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        traningDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Group training documents by gold cluster id.
    clusterToDocumentsMap = defaultdict(list)
    for document in traningDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    # One mean-vector "centroid" document per training cluster.
    clusterMap = {}
    for clusterId, docs in clusterToDocumentsMap.iteritems():
        clusterMap[clusterId] = Document(
            docId=clusterId, vector=Vector.getMeanVector(docs), clusterId=clusterId)
    # Sign every centroid and index it in each signature permutation trie.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # --- Test the model. ---
    # Read and sign test documents.
    testDocumentsMap = {}
    for docId, l in enumerate(FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for testDocument in testDocumentsMap.values():
        # Union of near-neighbor candidates across all signature permutations.
        candidateClusterIds = set()
        for permutation in signaturePermutations:
            candidateClusterIds = candidateClusterIds.union(
                permutation.getNearestDocuments(testDocument))
        # Pick the candidate centroid with the highest cosine similarity.
        bestClusterId, _ = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(testDocument))
             for clusterId in candidateClusterIds),
            key=itemgetter(1))
        predicted.append(bestClusterId)
        labels.append(testDocument.clusterId)
    return EvaluationMetrics.purity(predicted, labels)