def subComparison(folder1, folder2, stringDiffParam, bagWordParam, apiCallParam, dancingBunnyParam):
    """Collect the indices of the folder2 files that are closest to folder1 files.

    Every file in ``folder1`` is compared with every file in ``folder2`` via
    ``fileComparison``. For each ``folder1`` file, the position (within the
    ``os.listdir(folder2)`` listing) of the ``folder2`` file with the smallest
    distance is recorded. The printed message refers to these positions as
    clusters.

    Args:
        folder1: Directory whose files are matched against ``folder2``.
        folder2: Directory holding the candidate files; a candidate is
            identified by its index in the directory listing.
        stringDiffParam: Forwarded unchanged to ``fileComparison``.
        bagWordParam: Forwarded unchanged to ``fileComparison``.
        apiCallParam: Forwarded unchanged to ``fileComparison``.
        dancingBunnyParam: Forwarded unchanged to ``fileComparison``.

    Returns:
        List of the distinct candidate indices that were the nearest match for
        at least one ``folder1`` file, in order of first appearance.
    """
    filesToAssign = os.listdir(folder1)
    # List folder2 once: the listing is loop-invariant, and a single snapshot
    # guarantees that an index refers to the same file for every file1.
    candidates = os.listdir(folder2)

    foundClust = []
    for file1 in filesToAssign:
        path1 = os.path.join(folder1, file1)
        # Reset the running minimum for every file1. Keeping one minimum
        # across all files would only let a later file pick a candidate when
        # it beats the best distance seen so far for *any* earlier file.
        minDist = float('inf')
        clust = None
        for currClust, file2 in enumerate(candidates):
            dist = fileComparison(
                path1,
                os.path.join(folder2, file2),
                stringDiffParam,
                bagWordParam,
                apiCallParam,
                dancingBunnyParam,
            )
            if dist < minDist:
                minDist = dist
                clust = currClust
        # clust stays None when folder2 is empty or no finite distance was
        # found; do not record a placeholder index in that case.
        if clust is not None and clust not in foundClust:
            foundClust.append(clust)

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Number of clusters: " + str(len(foundClust)))
    return foundClust
def cluster():
    """Build the pairwise distance matrix and run both clustering algorithms.

    Reads the module-level names ``rangeFiles``, ``numFiles``, ``folder``,
    ``filesToCluster`` and the four comparison parameters. Steps:

    1. Fill a symmetric matrix with ``fileComparison`` distances for every
       pair of files in ``filesToCluster``.
    2. Pass the matrix to ``kmedoids`` and hand the resulting labels and
       clusters to ``moveFiles`` with the target name ``"KMedoids/"``.
    3. Negate the matrix, pass it to ``affinityPropagation`` and hand the
       result to ``moveFiles`` with the target name ``"AffinityPropagation/"``.

    Progress and results are printed to standard output. Returns ``None``.
    """
    size = len(rangeFiles)
    # Allocate as float explicitly. A matrix built from Python int zeros gets
    # an integer dtype, and NumPy then silently truncates every fractional
    # distance assigned into it.
    m = numpy.zeros((size, size), dtype=float)

    for i in rangeFiles:
        pathI = folder + filesToCluster[i]
        # Only the upper triangle is computed; the distance is mirrored
        # because the matrix is treated as symmetric. The diagonal stays 0.
        for j in range(i + 1, numFiles):
            distance = float(
                fileComparison(
                    pathI,
                    folder + filesToCluster[j],
                    stringDiffParam,
                    bagWordParam,
                    apiCallParam,
                    dancingBunnyParam,
                )
            )
            m[i, j] = distance
            m[j, i] = distance

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("Distances matrix calculated")
    print("Median of matrix: " + str(numpy.median(m)))

    labels, clusters = kmedoids(m)
    print("Labels:")
    print(labels)
    print("Clusters:")
    print(clusters)
    print("K-medoids run, number of clusters: " + str(len(clusters)))
    moveFiles(filesToCluster, labels, clusters, "KMedoids/")

    # Negate since affinity propagation works on negative distance values
    m = -m
    # The un-negated matrix is no longer referenced here; collect before the
    # second clustering run.
    gc.collect()

    labels, clusters = affinityPropagation(m)
    print("Affinity propagation run, number of clusters: " + str(len(clusters)))
    moveFiles(filesToCluster, labels, clusters, "AffinityPropagation/")