def testComputeOverlapsWithDiagonal(self):
  data = scipy.sparse.csr_matrix([
    [1, 1, 0, 1],
    [0, 1, 1, 0],
    [1, 1, 1, 1]
  ])
  dists = HierarchicalClustering._computeOverlaps(data, selfOverlaps=True)

  self.assertEqual(dists.shape, (6,))
  self.assertEqual(dists.tolist(), [3, 1, 3, 2, 2, 4])
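# The test above relies on _computeOverlaps returning the row-major upper
# triangle (including the diagonal when selfOverlaps=True) of the pairwise
# overlap counts. A minimal standalone sketch of that behavior, inferred from
# the expected values in the test; this is not the library implementation:
import numpy
import scipy.sparse

def overlapsSketch(data, selfOverlaps=False):
  """Return condensed pairwise overlaps of the rows of a sparse binary matrix."""
  gram = data.dot(data.T).toarray()
  k = 0 if selfOverlaps else 1
  return gram[numpy.triu_indices(gram.shape[0], k=k)]

# For the 3x4 matrix above this yields [3, 1, 3, 2, 2, 4], matching the test.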
def testGetPrototypes(self):
  data = scipy.sparse.csr_matrix([
    [1, 1, 0, 1],
    [1, 0, 1, 1],
    [0, 1, 1, 0],
    [1, 1, 1, 1]
  ])
  overlaps = HierarchicalClustering._computeOverlaps(data)

  prototypes = HierarchicalClustering._getPrototypes([0, 1, 2, 3], overlaps)
  self.assertEqual(set(prototypes.tolist()), set([3]))

  prototypes = HierarchicalClustering._getPrototypes([1, 2, 3], overlaps, 2)
  self.assertEqual(set(prototypes.tolist()), set([3, 1]))

  prototypes = HierarchicalClustering._getPrototypes([0, 2, 3], overlaps, 2)
  self.assertEqual(set(prototypes.tolist()), set([3, 0]))

  prototypes = HierarchicalClustering._getPrototypes([0, 1, 2], overlaps, 2)
  self.assertEqual(set(prototypes.tolist()), set([0, 1]))
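# A minimal sketch of the prototype selection the test above exercises: within
# a candidate set, rank members by their summed overlap with the other members
# and keep the topNumber highest. This hypothetical helper takes a dense
# overlap (gram) matrix with a zero diagonal for simplicity, whereas
# _getPrototypes operates on condensed overlaps; the ranking rule is an
# assumption inferred from the expected values above, not the library code.
import numpy

def getPrototypesSketch(indices, denseOverlaps, topNumber=1):
  indices = numpy.array(indices)
  # Restrict the overlap matrix to the candidate set and sum each row
  sub = denseOverlaps[numpy.ix_(indices, indices)]
  totals = sub.sum(axis=1)
  # Keep the topNumber members with the largest total overlap
  order = numpy.argsort(totals)[::-1]
  return indices[order[:topNumber]]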
def testCondensedIndex(self):
  flat = range(6)

  # first try only indexing upper triangular region
  indicesA = [0, 0, 0, 1, 1, 2]
  indicesB = [1, 2, 3, 2, 3, 3]
  res = HierarchicalClustering._condensedIndex(indicesA, indicesB, 4)
  self.assertEqual(res.tolist(), flat)

  # ensure we get same result by transposing some indices for the lower
  # triangular region
  indicesA = [0, 2, 3, 1, 3, 2]
  indicesB = [1, 0, 0, 2, 1, 3]
  res = HierarchicalClustering._condensedIndex(indicesA, indicesB, 4)
  self.assertEqual(res.tolist(), flat)

  # finally check that we get an assertion error if we try accessing
  # an element from the diagonal
  with self.assertRaises(AssertionError):
    indicesA = [0, 2, 0, 1, 3, 2]
    indicesB = [1, 2, 3, 2, 1, 3]
    _ = HierarchicalClustering._condensedIndex(indicesA, indicesB, 4)
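# The condensed index convention tested above matches the standard condensed
# distance-matrix layout (upper triangle without the diagonal, flattened
# row-major, as in scipy.spatial.distance.squareform). A hypothetical sketch
# of that mapping, not the library code:
import numpy

def condensedIndexSketch(indicesA, indicesB, n):
  # Map each (i, j) pair into the upper triangle regardless of order
  rows = numpy.minimum(indicesA, indicesB)
  cols = numpy.maximum(indicesA, indicesB)
  # Diagonal elements have no condensed index
  assert (rows != cols).all()
  return n * rows - rows * (rows + 1) // 2 + (cols - rows - 1)

# With n=4 and the pairs used in the test above this returns [0, 1, 2, 3, 4, 5].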
def testExtractVectorsFromKNN(self):
  vectors = numpy.random.rand(10, 25) < 0.1

  # Populate KNN
  knn = KNNClassifier()
  for i in xrange(vectors.shape[0]):
    knn.learn(vectors[i], 0)

  # Extract vectors from KNN
  sparseDataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)

  self.assertEqual(
    sorted(sparseDataMatrix.todense().tolist()),
    sorted(vectors.tolist())
  )
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn):
  # Project the sparse training vectors stored in the KNN onto their first
  # two principal components
  sparseDataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)
  covarianceMatrix = numpy.cov(sparseDataMatrix.toarray(), rowvar=0)
  u, s, v = numpy.linalg.svd(covarianceMatrix)
  projectionMatrix = numpy.dot(u[:, :2], numpy.diag(s[:2]))
  projectedData = sparseDataMatrix.dot(projectionMatrix)

  # Count how many documents fall into each bucket (category)
  categoryCounts = []
  for document in documentCategoryMap:
    for category in documentCategoryMap[document]:
      if category >= len(categoryCounts):
        categoryCounts.extend([0] * (category - len(categoryCounts) + 1))
      categoryCounts[category] += 1
  categoryCounts = numpy.array(categoryCounts)

  # Color each document by whichever of its buckets has the highest overall count
  colorSequenceBucket = []
  for docId in documentCategoryMap:
    buckets = documentCategoryMap[docId]
    counts = categoryCounts[buckets]
    maxBucketIndex = numpy.argmax(counts)
    maxBucket = buckets[maxBucketIndex]
    colorSequenceBucket.append(maxBucket)

  plt.figure()
  plt.subplot(121, aspect="equal")
  plt.title("Bucket labels (%s)" % (args.modelName,))
  plt.xlabel("PC 2")
  plt.ylabel("PC 1")
  plt.scatter(projectedData[:, 1], projectedData[:, 0], c=colorSequenceBucket)

  # Color each document by the cluster it was assigned to
  colorSequenceClusters = numpy.zeros(len(colorSequenceBucket))
  clusterId = 0
  for dataIndices in protos:
    colorSequenceClusters[[d for d in dataIndices if d != -1]] = clusterId
    clusterId += 1

  plt.subplot(122, aspect="equal")
  plt.title("Clusters (%s)" % (args.modelName,))
  plt.xlabel("PC 2")
  plt.ylabel("PC 1")
  plt.scatter(projectedData[:, 1], projectedData[:, 0], c=colorSequenceClusters)
  plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

  # Plot the singular value spectrum
  plt.figure()
  plt.plot(s[:250])
  plt.xlabel("Singular value #")
  plt.ylabel("Singular value")
  plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))
def runExperiment(args):
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # Remove duplicates from training data
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)

  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)

  numDocs = model.getClassifier()._numPatterns
  print "Model trained with %d documents" % (numDocs,)

  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)

  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return

  # Summary statistics
  # bucketCounts[i, j] is the number of occurrences of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId])
    print "==============="

    prototypeNum = 0
    for index in protos[clusterId]:
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print "  ", labelRefs[bucketId]
        elif display:
          print "  <None>"

        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)