예제 #1
0
def subComparison(folder1, folder2, stringDiffParam, bagWordParam, apiCallParam, dancingBunnyParam):
	"""Find which folder2 entries are the nearest match of some folder1 file.

	Every file in ``folder1`` is compared against every file in ``folder2``
	with ``fileComparison``. For each ``folder1`` file, the index (position
	in the ``os.listdir(folder2)`` listing) of the closest ``folder2`` file
	is recorded; each index is recorded only once.

	Args:
		folder1: Directory whose files are being matched.
		folder2: Directory whose files act as match candidates.
		stringDiffParam: Forwarded unchanged to ``fileComparison``.
		bagWordParam: Forwarded unchanged to ``fileComparison``.
		apiCallParam: Forwarded unchanged to ``fileComparison``.
		dancingBunnyParam: Forwarded unchanged to ``fileComparison``.

	Returns:
		List of distinct candidate indices, in the order they were first
		seen. ``-1`` is recorded for a ``folder1`` file when no candidate
		had a distance below infinity (e.g. ``folder2`` is empty).
	"""
	# List the candidates once so an index always refers to the same file
	# for every folder1 entry (and the directory is not re-read each time).
	candidates = os.listdir(folder2)
	foundClust = []
	for file1 in os.listdir(folder1):
		path1 = os.path.join(folder1, file1)
		# The best match must be tracked per folder1 file; keeping one
		# running minimum across all files would let an early close match
		# hide the true nearest candidate of every later file.
		bestDist = float('inf')
		clust = -1
		for currClust, file2 in enumerate(candidates):
			dist = fileComparison(path1, os.path.join(folder2, file2),
				stringDiffParam, bagWordParam, apiCallParam, dancingBunnyParam)
			# Strict comparison: on ties the earliest candidate wins.
			if dist < bestDist:
				bestDist = dist
				clust = currClust
		if clust not in foundClust:
			foundClust.append(clust)
	# Single-argument parenthesised form prints identically on Python 2 and 3.
	print("Number of clusters: " + str(len(foundClust)))
	return foundClust
예제 #2
0
def cluster():
    """Cluster ``filesToCluster`` with k-medoids and affinity propagation.

    Builds a symmetric pairwise distance matrix by calling
    ``fileComparison`` on every unordered pair of files, runs ``kmedoids``
    on it, then runs ``affinityPropagation`` on the negated matrix. After
    each algorithm the resulting labels and clusters are handed to
    ``moveFiles`` together with an algorithm-specific target folder name.

    Reads the module-level names ``rangeFiles``, ``numFiles``, ``folder``,
    ``filesToCluster`` and the four ``*Param`` values; takes no arguments
    and returns nothing. Progress is reported on standard output.
    """
    size = len(rangeFiles)
    # Allocate as float: a matrix built from integer zeros gets an integer
    # dtype, and assigning float distances into it silently drops their
    # fractional part.
    m = numpy.zeros((size, size), dtype=float)

    for i in rangeFiles:
        # Only the upper triangle is computed; the value is mirrored below.
        # The diagonal stays at zero.
        for j in range(i + 1, numFiles):
            distance = float(
                fileComparison(
                    folder + filesToCluster[i],
                    folder + filesToCluster[j],
                    stringDiffParam,
                    bagWordParam,
                    apiCallParam,
                    dancingBunnyParam,
                )
            )
            m[i, j] = distance
            m[j, i] = distance

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("Distances matrix calculated")
    print("Median of matrix: " + str(numpy.median(m)))

    labels, clusters = kmedoids(m)
    print("Labels:")
    print(labels)
    print("Clusters:")
    print(clusters)
    print("K-medoids run, number of clusters: " + str(len(clusters)))

    moveFiles(filesToCluster, labels, clusters, "KMedoids/")

    # Negate since affinity propagation works on negative distance values
    m = -m
    # The rebinding above drops the last reference to the positive matrix;
    # force a collection before the next clustering run.
    gc.collect()
    labels, clusters = affinityPropagation(m)
    print("Affinity propagation run, number of clusters: " + str(len(clusters)))

    moveFiles(filesToCluster, labels, clusters, "AffinityPropagation/")