def __init__(self, k=3, iterations=100, tol=0.001):
    """Set up a k-means model.

    Args:
        k: Number of clusters (and therefore centroids).
        iterations: Upper bound on the number of refinement passes.
        tol: Convergence threshold compared against the centroid movement.
    """
    self.k = k
    self.iterations = iterations
    # Honour the caller's tolerance; it used to be hard-coded to 0.001.
    self.tol = tol
    self.centroids = {}
    self.classifications = {}
    # DistanceFinder is declared elsewhere in the project.
    self.distanceFinder = DistanceFinder()
class KMeans:
    """K-means clustering over an indexable collection of feature vectors.

    Centroids are stored in ``self.centroids`` keyed by ``0..k-1`` and the
    members of each cluster in ``self.classifications`` under the same keys.
    Distances are delegated to ``DistanceFinder.euclidean`` (declared
    elsewhere in the project).
    """

    def __init__(self, k=3, iterations=100, tol=0.001):
        """Set up a k-means model.

        Args:
            k: Number of clusters.
            iterations: Upper bound on the number of refinement passes.
            tol: Convergence threshold, in percent of centroid movement.
        """
        self.k = k
        self.iterations = iterations
        # Honour the caller's tolerance; it used to be hard-coded to 0.001.
        self.tol = tol
        self.centroids = {}
        self.classifications = {}
        self.distanceFinder = DistanceFinder()

    def __initializeCentroids(self, data):
        """Seed each centroid with a randomly chosen element of ``data``."""
        for i in range(self.k):
            self.centroids[i] = random.choice(data)

    def __initializeClassification(self):
        """Reset the per-cluster member lists to empty."""
        self.classifications = {i: [] for i in range(self.k)}

    def __updateCentroids(self):
        """Move every centroid to the mean of its current members.

        A cluster that received no members keeps its previous centroid;
        averaging an empty list would otherwise turn it into NaN.
        """
        for i, members in self.classifications.items():
            if members:
                self.centroids[i] = np.average(members, axis=0)

    def __isOptimized(self, prevCentroids):
        """Return True when no centroid moved by more than ``self.tol`` percent.

        The per-component change is taken in absolute value so that
        movements in opposite directions cannot cancel each other out.
        Components whose previous value is 0 have no relative change, so
        their absolute change is used instead.
        """
        for key, currCentroid in self.centroids.items():
            curr = np.asarray(currCentroid, dtype=float)
            prev = np.asarray(prevCentroids[key], dtype=float)
            diff = np.abs(curr - prev)
            denom = np.abs(prev)
            with np.errstate(divide="ignore", invalid="ignore"):
                change = np.where(denom > 0, diff / denom, diff) * 100.0
            if np.sum(change) > self.tol:
                return False
        return True

    def fit(self, data):
        """Cluster ``data``, stopping early once the centroids stabilise.

        Args:
            data: Indexable, iterable collection of feature vectors.
        """
        self.__initializeCentroids(data)
        for _ in range(self.iterations):
            self.__initializeClassification()
            for feature in data:
                # predict() returns the position of the nearest centroid,
                # which equals its key because keys are inserted as 0..k-1.
                self.classifications[self.predict(feature)].append(feature)
            prevCentroids = dict(self.centroids)
            self.__updateCentroids()
            if self.__isOptimized(prevCentroids):
                break

    def predict(self, feature):
        """Return the index of the centroid closest to ``feature``."""
        distances = [
            self.distanceFinder.euclidean(feature, centroid)
            for centroid in self.centroids.values()
        ]
        return distances.index(min(distances))
def __init__(self, df):
    """Build the initial distance table for ``df`` and reset the result file.

    The pairwise distances come from ``DistanceFinder.euclideanAll`` and are
    kept both as a dict (``self.distanceMatrix``) and as a DataFrame
    (``self.dfDistance``).
    """
    finder = DistanceFinder()
    self.distanceFinder = finder
    self.distanceMatrix = self.__convertToDict(finder.euclideanAll(df))

    # A simple mock test from pratica-09
    # self.distanceMatrix = {
    #     0: [0, 2, 6, 10, 9],
    #     1: [2, 0, 5, 9, 8],
    #     2: [6, 5, 0, 4, 5],
    #     3: [10, 9, 4, 0, 3],
    #     4: [9, 8, 5, 3, 0]
    # }

    self.dfDistance = pd.DataFrame(self.distanceMatrix)

    # Opening in "w" mode truncates the result file left by a previous run.
    with open("result.txt", "w"):
        pass
def main():
    """Compute a pairwise distance matrix for a CSV file and save it as CSV.

    Command line: ``<script> <euclidean|manhattan> <csv_path>``.

    The arguments are validated before any file is touched: a missing
    argument or an unknown method prints a message and returns instead of
    raising ``IndexError`` or doing the CSV work for nothing. The result is
    written to ``./dataset/<method>.csv``.
    """
    if len(sys.argv) < 3:
        print("Usage: <script> <euclidean|manhattan> <csv_path>")
        return

    method, filePath = sys.argv[1], sys.argv[2]
    if method not in ("euclidean", "manhattan"):
        print("Please enter with a valid input, 'euclidean' or 'manhattan'")
        return

    # CSVManager and DistanceFinder are declared elsewhere in the project.
    csvManager = CSVManager()
    df = csvManager.read(filePath)

    cols_with_missing = [col for col in df.columns if df[col].isnull().any()]
    if cols_with_missing:
        print('Formatting missing values...')
        df = csvManager.replaceNan(df, cols_with_missing)
    else:
        print('Dont have missing values on this dataset')

    formattedCSV = csvManager.deleteClassColumns(df)

    distanceFinder = DistanceFinder()
    if method == "euclidean":
        computeAll = distanceFinder.euclideanAll
    else:
        computeAll = distanceFinder.manhattanAll

    print(f"Computing {method} distance...")
    csvMatrix = csvManager.convertMatrixToCSV(computeAll(formattedCSV))
    csvManager.writeCSV(csvMatrix, f"./dataset/{method}.csv")
    # Also corrects the former "Eclidean" misspelling in the success message.
    print(f"{method.capitalize()} distance created with success!")
class SimplifiedSilhouette:
    """Simplified silhouette score of a fitted k-means model.

    For every row of ``df`` the distance to each centroid is computed once.
    ``a`` is the distance to the nearest centroid, ``b`` the distance to the
    second nearest one, and the row's score is ``(b - a) / max(a, b)``.
    """

    # Upper bound used for ``b`` when there is no other centroid.
    __NO_OTHER_CLUSTER = 9999999999

    def __init__(self, df, kmeans):
        """Store the inputs and pre-compute the point-to-centroid distances.

        Args:
            df: DataFrame whose rows are the clustered points.
            kmeans: Fitted model exposing a ``centroids`` dict.
        """
        self.kmeans = kmeans
        self.df = df
        self.clusters = self.kmeans.centroids
        # DistanceFinder is declared elsewhere in the project.
        self.distanceFinder = DistanceFinder()
        self.__distanceMatrix = self.__getDistanceMatrix()

    def __getDistanceMatrix(self):
        """Return one list of centroid distances per row of ``self.df``."""
        centroids = list(self.clusters.values())
        return [
            [self.distanceFinder.euclidean(point, centroid)
             for centroid in centroids]
            for point in self.df.to_numpy()
        ]

    def __assignedCluster(self, row):
        """Return the index of the nearest centroid for ``row``.

        This is the same arg-min that ``KMeans.predict`` performs, read from
        the cached distances instead of recomputing them for every call.
        """
        distances = self.__distanceMatrix[row]
        return distances.index(min(distances))

    def __get_a(self, row):
        """Distance from ``row`` to the centroid of its own cluster."""
        return self.__distanceMatrix[row][self.__assignedCluster(row)]

    def __get_b(self, row):
        """Distance from ``row`` to the closest centroid of another cluster."""
        own = self.__assignedCluster(row)
        others = [
            dist
            for cluster, dist in enumerate(self.__distanceMatrix[row])
            if cluster != own
        ]
        return min([self.__NO_OTHER_CLUSTER] + others)

    def calculate(self):
        """Return the mean simplified silhouette over all rows.

        A row with ``a == b == 0`` contributes 0 instead of raising
        ``ZeroDivisionError``; an empty data set yields ``0.0``.
        """
        rows = len(self.__distanceMatrix)
        if rows == 0:
            return 0.0

        total = 0
        for i in range(rows):
            a = self.__get_a(i)
            b = self.__get_b(i)
            denom = max(a, b)
            if denom > 0:
                total += (b - a) / denom
        return total / rows
class SingleLink:
    """Agglomerative single-link clustering over a pairwise distance table.

    ``self.dfDistance`` is a square DataFrame whose row and column labels
    identify the current clusters. A merged cluster is labelled by joining
    the labels of its two parts with ``", "``. Every intermediate hierarchy
    level is printed and appended to ``result.txt``.
    """

    def __init__(self, df):
        """Compute the initial distance table for ``df`` and reset the log.

        Distances come from ``DistanceFinder.euclideanAll`` (declared
        elsewhere in the project).
        """
        self.distanceFinder = DistanceFinder()
        distances = self.distanceFinder.euclideanAll(df)
        self.distanceMatrix = self.__convertToDict(distances)

        # A simple mock test from pratica-09
        # self.distanceMatrix = {
        #     0: [0, 2, 6, 10, 9],
        #     1: [2, 0, 5, 9, 8],
        #     2: [6, 5, 0, 4, 5],
        #     3: [10, 9, 4, 0, 3],
        #     4: [9, 8, 5, 3, 0]
        # }

        self.dfDistance = pd.DataFrame(self.distanceMatrix)

        # Opening in "w" mode truncates the result file of a previous run.
        with open("result.txt", "w"):
            pass

    def __combine(self, i, j):
        """Return the label of the cluster obtained by joining ``i`` and ``j``."""
        return str(i) + ', ' + str(j)

    def SingleLink(self):
        """Merge the two closest clusters until only two remain.

        Returns:
            The final 2x2 distance DataFrame (``self.dfDistance``).
        """
        while len(self.dfDistance) > 2:
            self.__printDf()
            i, j = self.__posMinValue()  # Closest pair of clusters to join
            self.dfDistance = self.__mergeClusters(i, j)

        self.__printDf()
        return self.dfDistance

    def __mergeClusters(self, i, j):
        """Return a new distance table in which ``i`` and ``j`` are one cluster.

        The merged cluster comes first, followed by the untouched clusters
        in their previous order. Its distance to any other cluster is the
        smaller of that cluster's distances to ``i`` and to ``j``
        (single-link rule). Values are read with label-based ``.at`` lookups
        against the previous table, so already-merged (string-labelled)
        clusters are handled the same way as original points.
        """
        old = self.dfDistance
        merged = self.__combine(i, j)
        remaining = [c for c in old.columns if c not in (i, j)]

        toMerged = [min(old.at[r, i], old.at[r, j]) for r in remaining]

        data = {merged: [0] + toMerged}
        for colPos, col in enumerate(remaining):
            column = [toMerged[colPos]]
            for rowPos, row in enumerate(remaining):
                # The distance of a cluster to itself is always 0.
                column.append(0 if rowPos == colPos else old.at[row, col])
            data[col] = column

        return pd.DataFrame(data, index=[merged] + remaining)

    def __posMinValue(self):
        """Return the ``(column, row)`` labels of the smallest off-diagonal value.

        Ties keep the first pair met while scanning column by column.

        Raises:
            ValueError: If no finite off-diagonal distance exists.
        """
        minV = math.inf
        res = None
        for i in self.dfDistance.columns:
            for j, value in self.dfDistance[i].items():
                if i != j and value < minV:
                    minV = value
                    res = (i, j)
        if res is None:
            raise ValueError("distance matrix has no pair of clusters to merge")
        return res

    def __convertToDict(self, matrix):
        """Map each row of ``matrix`` to its positional index."""
        return dict(enumerate(matrix))

    def __printDf(self):
        """Print the current clusters and append them to ``result.txt``."""
        tmp = "".join(
            "{ " + str(cluster) + " }, " for cluster in self.dfDistance.columns
        )
        print("Hierarchy: ", tmp, end="\n\n")
        with open("result.txt", "a") as file:
            file.write(tmp + "\n")
def __init__(self, df, kmeans):
    """Keep the data set and model, then cache point-to-centroid distances.

    Args:
        df: Data set whose rows were clustered.
        kmeans: Model object exposing a ``centroids`` attribute.
    """
    self.df = df
    self.kmeans = kmeans
    self.clusters = kmeans.centroids
    # The finder must exist before the distance matrix is built below.
    self.distanceFinder = DistanceFinder()
    matrix = self.__getDistanceMatrix()
    self.__distanceMatrix = matrix