def getEvaluationMetrics(self, documentClusters, timeDifference):
    iterationData = {
        'no_of_documents': self.length,
        'no_of_clusters': len(documentClusters),
        'iteration_time': timeDifference,
        'clusters': documentClusters,
    }
    clustersForEvaluation = [self._getExpertClasses(cluster) for cluster in documentClusters]
    iterationData['nmi'] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.nmi)
    iterationData['purity'] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.purity)
    iterationData['f1'] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.f1)
    return iterationData
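All of these snippets hand EvaluationMetrics.getValueForClusters a list of per-cluster ground-truth label lists plus a metric function. The library's implementation isn't shown here; a plausible reading of the contract, inferred from the (predicted, labels) call shape in the final example below, is this sketch:

# Sketch only: a guess at the getValueForClusters contract, not the
# actual EvaluationMetrics implementation.
def getValueForClusters(clustersForEvaluation, metric):
    # Flatten the clusters-of-labels structure into parallel
    # predicted/true label lists, using each cluster's index as
    # its predicted label.
    predicted, labels = [], []
    for clusterId, expertClasses in enumerate(clustersForEvaluation):
        predicted += [clusterId] * len(expertClasses)
        labels += expertClasses
    return metric(predicted, labels)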
def getEvaluationMetrics(self, documentClusters, timeDifference, iteration_parameters):
    iterationData = {
        "iteration_parameters": iteration_parameters,
        "no_of_clusters": len(documentClusters),
        "iteration_time": timeDifference,
        "clusters": documentClusters,
    }
    clustersForEvaluation = [self._getExpertClasses(cluster) for cluster in documentClusters]
    iterationData["nmi"] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.nmi)
    iterationData["purity"] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.purity)
    iterationData["f1"] = EvaluationMetrics.getValueForClusters(clustersForEvaluation, EvaluationMetrics.f1)
    return iterationData
Example #4
def getEvaluationMetrics(noOfTweets, documentClusters, timeDifference):
     iterationData = {
         'no_of_tweets': noOfTweets,
         'no_of_clusters': len(documentClusters),
         'iteration_time': timeDifference,
         'clusters': documentClusters
     }
     clustersForEvaluation = [
         Evaluation.getExpertClasses(cluster)
         for cluster in documentClusters
     ]
     iterationData['nmi'] = EvaluationMetrics.getValueForClusters(
         clustersForEvaluation, EvaluationMetrics.nmi)
     iterationData['purity'] = EvaluationMetrics.getValueForClusters(
         clustersForEvaluation, EvaluationMetrics.purity)
     iterationData['f1'] = EvaluationMetrics.getValueForClusters(
         clustersForEvaluation, EvaluationMetrics.f1)
     return iterationData
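For concreteness, here is a minimal stand-in for a purity metric in the (predicted, labels) shape that EvaluationMetrics.purity is called with in the final example; an illustration only, not the library's code:

from collections import Counter, defaultdict

# Purity: fraction of documents whose cluster's majority class
# matches their own ground-truth class.
def purity(predicted, labels):
    clusterToLabels = defaultdict(list)
    for p, t in zip(predicted, labels):
        clusterToLabels[p].append(t)
    majority = sum(Counter(v).most_common(1)[0][1]
                   for v in clusterToLabels.values())
    return majority / len(labels)

print(purity([0, 0, 0, 1, 1], ['a', 'a', 'b', 'b', 'b']))  # 0.8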
Example #5
    def getCrowdQuality(self, evaluationMetric, expertsToClassMap):
        def getExpertClasses(cluster):
            return [
                expertsToClassMap[user.lower()]
                for user in cluster.documentsInCluster
                if user.lower() in expertsToClassMap
            ]

        return EvaluationMetrics.getValueForClusters(
            [getExpertClasses(cluster) for cluster in self.clusters.values()], evaluationMetric
        )
Example #6
    def getCrowdQuality(self, evaluationMetric, expertsToClassMap):
        def getExpertClasses(cluster):
            return [
                expertsToClassMap[user.lower()]
                for user in cluster.documentsInCluster
                if user.lower() in expertsToClassMap
            ]

        return EvaluationMetrics.getValueForClusters([
            getExpertClasses(cluster)
            for cluster in self.clusters.values()
        ], evaluationMetric)
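Both getCrowdQuality variants rely on the same lookup pattern: user names are lower-cased, and users missing from the expert map are silently skipped. A runnable illustration (the map contents here are made up):

expertsToClassMap = {'alice': 'politics', 'bob': 'sports'}
documentsInCluster = ['Alice', 'BOB', 'carol']  # 'carol' has no expert class
classes = [expertsToClassMap[user.lower()]
           for user in documentsInCluster
           if user.lower() in expertsToClassMap]
print(classes)  # ['politics', 'sports']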
Example #7
def streamingLSHClusteringDemo():
    clustering_settings = {
        'dimensions': 53,
        'signature_length': 13,
        'number_of_permutations': 5,
        'threshold_for_document_to_be_in_cluster': 0.2,
    }
    clustering = StreamingLSHClustering(**clustering_settings)
    docId = 0
    docsToOriginalClusterMap = {}
    # createDocumentFromLine is assumed to be in scope (a compatible
    # definition appears in the next example).
    for line in FileIO.iterateLinesFromFile('../data/streaming.dat'):
        document = createDocumentFromLine(docId, line)
        docsToOriginalClusterMap[docId] = document.clusterId
        docId += 1
        clustering.getClusterAndUpdateExistingClusters(document)
    # Collect the original (ground-truth) cluster id of every document
    # in each discovered cluster, then score the clustering by purity.
    clusterLabels = []
    for cluster in clustering.clusters.values():
        clusterLabels.append([docsToOriginalClusterMap[doc.docId]
                              for doc in cluster.iterateDocumentsInCluster()])
    return EvaluationMetrics.getValueForClusters(clusterLabels, EvaluationMetrics.purity)
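The nmi metric used in the earlier snippets fits the same (predicted, labels) shape. Assuming scikit-learn is available, a stand-in (again, not the library's own implementation) could be:

from sklearn.metrics import normalized_mutual_info_score

def nmi(predicted, labels):
    # Normalized mutual information between the predicted clustering
    # and the ground-truth classes.
    return normalized_mutual_info_score(labels, predicted)

print(nmi([0, 0, 1, 1], ['a', 'a', 'b', 'b']))  # 1.0 for a perfect match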
Example #8
# Stdlib imports used by this example.
from collections import defaultdict
from functools import reduce
from operator import itemgetter

def offlineLSHClusteringDemo():
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]

    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build LSH Model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Construct cluster vectors.
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for k, v in clusterToDocumentsMap.items():
        clusterMap[k] = Document(docId=k,
                                 vector=Vector.getMeanVector(v),
                                 clusterId=k)

    # Create signatures for all the cluster vectors and index them in
    # every signature permutation.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)
    # Create signatures for the test documents.
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union the candidate clusters proposed by every signature
        # permutation for this test document.
        possibleNearestClusters = reduce(
            lambda x, y: x.union(y),
            (permutation.getNearestDocuments(t)
             for permutation in signaturePermutations), set())
        # Pick the candidate cluster with the highest cosine similarity.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)
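The prediction step above boils down to two operations: union the candidate clusters proposed by every signature permutation, then keep the candidate with the highest cosine similarity. The same logic with plain dicts and made-up scores:

from functools import reduce
from operator import itemgetter

# Candidate sets from three hypothetical signature permutations.
candidateSets = [{'c1', 'c2'}, {'c2'}, {'c2', 'c3'}]
possibleNearestClusters = reduce(lambda x, y: x.union(y), candidateSets, set())

similarity = {'c1': 0.31, 'c2': 0.87, 'c3': 0.12}  # stand-in for cosineSimilarity()
predictedClass = max(((clusterId, similarity[clusterId])
                      for clusterId in possibleNearestClusters),
                     key=itemgetter(1))
print(predictedClass)  # ('c2', 0.87)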