Пример #1
0
 def __init__(self, k=3, iterations=100, tol=0.001):
     """Store the clustering hyper-parameters and create empty state.

     Args:
         k: Number of clusters.
         iterations: Upper bound on the number of fitting iterations.
         tol: Convergence tolerance. It used to be overwritten with the
             hard-coded value 0.001; the argument is now honoured.
     """
     self.k = k
     self.iterations = iterations
     self.tol = tol
     # Both dicts start empty and are filled elsewhere in the class.
     self.centroids = {}
     self.classifications = {}
     self.distanceFinder = DistanceFinder()
Пример #2
0
class KMeans:
    """K-means clustering driven by the project's ``DistanceFinder``.

    Attributes:
        k: Number of clusters.
        iterations: Maximum number of assignment/update rounds in ``fit``.
        tol: Convergence tolerance, compared against the summed absolute
            percentage change of each centroid between two rounds.
        centroids: Maps cluster index (0..k-1) to its current centroid.
        classifications: Maps cluster index to the samples assigned to it
            during the latest round of ``fit``.
    """

    def __init__(self, k=3, iterations=100, tol=0.001):
        """Store the hyper-parameters and create empty clustering state.

        Args:
            k: Number of clusters.
            iterations: Upper bound on the number of fitting rounds.
            tol: Convergence tolerance (percentage change); the argument is
                honoured instead of being overwritten with 0.001.
        """
        self.k = k
        self.iterations = iterations
        self.tol = tol
        self.centroids = {}
        self.classifications = {}
        # Assumes DistanceFinder.euclidean(a, b) returns a scalar distance,
        # as its use with min() in predict() suggests.
        self.distanceFinder = DistanceFinder()

    def __initializeCentroids(self, data):
        """Seed the centroids with samples drawn at random from ``data``.

        Distinct sample positions are used whenever ``data`` holds at least
        ``k`` samples, so two centroids never start on the very same row.
        With fewer samples than clusters the draw falls back to sampling
        with replacement.
        """
        if self.k <= len(data):
            positions = random.sample(range(len(data)), self.k)
            seeds = [data[pos] for pos in positions]
        else:
            seeds = [random.choice(data) for _ in range(self.k)]

        # Assign in place so that external references to the dict stay valid.
        for i, seed in enumerate(seeds):
            self.centroids[i] = seed

    def __initializeClassification(self):
        """Reset the per-cluster sample lists to empty lists."""
        self.classifications = {i: [] for i in range(self.k)}

    def __updateCentroids(self):
        """Move every centroid to the mean of the samples assigned to it.

        A cluster that received no samples keeps its previous centroid;
        averaging an empty list would otherwise yield NaN.
        """
        for i, members in self.classifications.items():
            if members:
                self.centroids[i] = np.average(members, axis=0)

    def __isOptimized(self, prevCentroids):
        """Return True when no centroid moved by more than ``tol`` percent.

        The movement of a centroid is the sum of the absolute relative
        changes (in percent) of its components. Absolute values prevent
        negative or mutually cancelling changes from being mistaken for
        convergence.

        Args:
            prevCentroids: Centroids of the previous round, keyed like
                ``self.centroids``.
        """
        for key, currCentroid in self.centroids.items():
            curr = np.asarray(currCentroid, dtype=float)
            prev = np.asarray(prevCentroids[key], dtype=float)
            with np.errstate(divide="ignore", invalid="ignore"):
                change = np.abs((curr - prev) / prev * 100.0)
            # 0/0 (an unchanged zero component) gives NaN and counts as no
            # movement; x/0 gives inf and correctly counts as movement.
            if np.nansum(change) > self.tol:
                return False
        return True

    def fit(self, data):
        """Cluster ``data`` into ``k`` groups.

        Alternates between assigning every sample to its nearest centroid
        and recomputing the centroids, for at most ``iterations`` rounds or
        until the centroids stop moving (see ``tol``).

        Args:
            data: Indexable, sized collection of samples (e.g. a 2-D numpy
                array or a list of equally long numeric sequences).
        """
        self.__initializeCentroids(data)

        for _ in range(self.iterations):
            self.__initializeClassification()

            # Same nearest-centroid rule as predict(), so training and
            # prediction cannot disagree.
            for feature in data:
                self.classifications[self.predict(feature)].append(feature)

            prevCentroids = dict(self.centroids)

            self.__updateCentroids()

            if self.__isOptimized(prevCentroids):
                break

    def predict(self, feature):
        """Return the index of the centroid closest to ``feature``.

        Ties are resolved in favour of the lowest cluster index.
        """
        distances = [
            self.distanceFinder.euclidean(feature, centroid)
            for centroid in self.centroids.values()
        ]
        return distances.index(min(distances))
Пример #3
0
    def __init__(self, df):
        """Prepare the pairwise distance table for ``df`` and reset the log.

        The Euclidean distances produced by ``DistanceFinder.euclideanAll``
        are converted to a dict, kept in ``self.distanceMatrix`` and loaded
        into the DataFrame ``self.dfDistance``. The file ``result.txt`` is
        truncated.
        """
        finder = DistanceFinder()
        self.distanceFinder = finder
        self.distanceMatrix = self.__convertToDict(finder.euclideanAll(df))

        # Small mock matrix from pratica-09, handy for manual testing:
        #     0: [0, 2, 6, 10, 9]
        #     1: [2, 0, 5, 9, 8]
        #     2: [6, 5, 0, 4, 5]
        #     3: [10, 9, 4, 0, 3]
        #     4: [9, 8, 5, 3, 0]

        self.dfDistance = pd.DataFrame(self.distanceMatrix)

        # Opening in "w" mode empties the result file for this run.
        with open("result.txt", "w"):
            pass
Пример #4
0
def main():
    """Compute a distance matrix for a CSV dataset and write it to disk.

    Reads two command-line arguments from ``sys.argv``:
        1. the method, ``"euclidean"`` or ``"manhattan"``;
        2. the path of the input CSV file.

    The dataset is loaded through ``CSVManager``, columns containing missing
    values are passed to ``CSVManager.replaceNan``, and the frame is passed
    through ``CSVManager.deleteClassColumns`` before the distances are
    computed. The result is written to ``./dataset/<method>.csv``.

    When arguments are missing a usage message is printed and the function
    returns, instead of failing with an ``IndexError``.
    """
    if len(sys.argv) < 3:
        print("Usage: <script> <euclidean|manhattan> <csv file path>")
        return

    method, filePath = sys.argv[1], sys.argv[2]

    csvManager = CSVManager()
    df = csvManager.read(filePath)

    cols_with_missing = [col for col in df.columns if df[col].isnull().any()]

    if cols_with_missing:
        print('Formatting missing values...')
        df = csvManager.replaceNan(df, cols_with_missing)
    else:
        print('Dont have missing values on this dataset')

    formattedCSV = csvManager.deleteClassColumns(df)

    distanceFinder = DistanceFinder()

    if method == "euclidean":
        print("Computing euclidean distance...")

        euclideanMatrix = distanceFinder.euclideanAll(formattedCSV)
        csvMatrix = csvManager.convertMatrixToCSV(euclideanMatrix)
        csvManager.writeCSV(csvMatrix, "./dataset/euclidean.csv")

        # Message typo ("Eclidean") corrected.
        print("Euclidean distance created with success!")
    elif method == "manhattan":
        print("Computing manhattan distance...")

        manhattanMatrix = distanceFinder.manhattanAll(formattedCSV)
        csvMatrix = csvManager.convertMatrixToCSV(manhattanMatrix)
        csvManager.writeCSV(csvMatrix, "./dataset/manhattan.csv")

        print("Manhattan distance created with success!")
    else:
        print("Please enter with a valid input, 'euclidean' or 'manhattan'")
Пример #5
0
class SimplifiedSilhouette:
    """Simplified silhouette score of a fitted k-means model.

    For every row of the data, ``a`` is the distance to the centroid of the
    cluster the model predicts for that row, and ``b`` is the smallest
    distance to any other centroid. The row score is
    ``(b - a) / max(a, b)`` and the final score is the mean over all rows.
    """

    def __init__(self, df, kmeans):
        """Keep the inputs and precompute the row-to-centroid distances.

        Args:
            df: pandas DataFrame with one sample per row.
            kmeans: Model exposing ``centroids`` (a dict of centroids) and
                ``predict(row)`` returning a cluster index.
        """
        self.kmeans = kmeans
        self.df = df
        self.clusters = self.kmeans.centroids
        self.distanceFinder = DistanceFinder()
        self.__distanceMatrix = self.__getDistanceMatrix()

    def __getDistanceMatrix(self):
        """Return one list per data row holding its distance to each centroid."""
        centroids = list(self.clusters.values())
        # Iterate the underlying value array rather than the iloc indexer.
        return [
            [self.distanceFinder.euclidean(point, centroid)
             for centroid in centroids]
            for point in self.df.values
        ]

    def __get_a(self, row):
        """Distance from row ``row`` to the centroid of its predicted cluster."""
        cluster = self.kmeans.predict(self.df.iloc[row])
        return self.__distanceMatrix[row][cluster]

    def __get_b(self, row):
        """Smallest distance from row ``row`` to a centroid of another cluster.

        Returns the sentinel 9999999999 when there is no other cluster.
        """
        cluster_id = self.kmeans.predict(self.df.iloc[row])
        minDist = 9999999999

        for cluster, dist in enumerate(self.__distanceMatrix[row]):
            if cluster != cluster_id:
                minDist = min(minDist, dist)

        return minDist

    def calculate(self):
        """Return the mean simplified silhouette over all rows.

        A row whose ``a`` and ``b`` are both zero contributes 0 instead of
        raising ``ZeroDivisionError``. With no rows at all the score is 0.0.
        """
        count = len(self.__distanceMatrix)
        if count == 0:
            return 0.0

        total = 0
        for i in range(count):
            a = self.__get_a(i)
            b = self.__get_b(i)
            denominator = max(a, b)
            if denominator != 0:
                total += (b - a) / denominator

        return total / count
Пример #6
0
class SingleLink:
    """Agglomerative single-linkage clustering on a distance table.

    ``self.dfDistance`` is a square DataFrame whose row and column labels
    are the current clusters: integers for original samples and strings
    such as ``"3, 4"`` for merged clusters. Every step merges the two
    closest clusters and appends the current hierarchy level to
    ``result.txt``.
    """

    def __init__(self, df):
        """Build the distance table for ``df`` and empty ``result.txt``.

        Args:
            df: Dataset handed to ``DistanceFinder.euclideanAll``, which is
                assumed to return the pairwise distance matrix as a sequence
                of rows (TODO confirm against DistanceFinder).
        """
        self.distanceFinder = DistanceFinder()
        distances = self.distanceFinder.euclideanAll(df)
        self.distanceMatrix = self.__convertToDict(distances)

        # Small mock matrix from pratica-09, handy for manual testing:
        #     0: [0, 2, 6, 10, 9]
        #     1: [2, 0, 5, 9, 8]
        #     2: [6, 5, 0, 4, 5]
        #     3: [10, 9, 4, 0, 3]
        #     4: [9, 8, 5, 3, 0]

        self.dfDistance = pd.DataFrame(self.distanceMatrix)
        # Opening in "w" mode clears the result file.
        with open("result.txt", "w"):
            pass

    def __combine(self, i, j):
        """Return the label of the cluster obtained by merging ``i`` and ``j``."""
        return str(i) + ', ' + str(j)

    def SingleLink(self):
        """Merge the closest clusters until only two remain.

        Returns:
            The final distance DataFrame between the remaining clusters.
        """
        while len(self.dfDistance) > 2:
            self.__printDf()
            i, j = self.__posMinValue()  # Closest pair of clusters
            self.dfDistance = self.__mergeClusters(i, j)

        self.__printDf()
        return self.dfDistance

    def __mergeClusters(self, i, j):
        """Return a new distance table in which ``i`` and ``j`` are one cluster.

        The merged cluster comes first, followed by the untouched clusters
        in their previous order. Its distance to any other cluster is the
        minimum of the distances from ``i`` and from ``j`` (single linkage).

        The merged cluster is recognised by its label. The earlier approach
        of evaluating the label text failed when both ``i`` and ``j`` were
        already merged (string) clusters, leaving their distances at 0.
        """
        old = self.dfDistance
        merged = self.__combine(i, j)
        labels = [merged] + [c for c in old.columns if c not in (i, j)]

        def distance(column, row):
            if column == row:
                return 0
            if column == merged:
                return min(old.at[row, i], old.at[row, j])
            if row == merged:
                return min(old.at[column, i], old.at[column, j])
            return old.at[row, column]

        # Build the full table up front instead of writing cell by cell via
        # chained indexing, which pandas does not guarantee to write through.
        table = {
            column: [distance(column, row) for row in labels]
            for column in labels
        }
        return pd.DataFrame(table, index=labels)

    def __renameRowsDf(self, newDistancesMatrix):
        """Relabel positional rows 0..n-1 with the column labels, in order."""
        aux = dict(enumerate(newDistancesMatrix.columns))
        return newDistancesMatrix.rename(index=aux)

    def __posMinValue(self):
        """Return ``(column, row)`` of the smallest off-diagonal distance.

        The first minimum found in column-major order wins on ties.

        Raises:
            ValueError: If no pair of distinct clusters has a finite
                distance.
        """
        minV = math.inf
        res = None
        for i in self.dfDistance.columns:
            for j in self.dfDistance.index:
                if i != j and self.dfDistance.at[j, i] < minV:
                    minV = self.dfDistance.at[j, i]
                    res = (i, j)

        if res is None:
            raise ValueError("no pair of distinct clusters to merge")
        return res

    def __convertToDict(self, matrix):
        """Map each row position of ``matrix`` to that row."""
        return dict(enumerate(matrix))

    def __printDf(self):
        """Print the current clusters and append them to ``result.txt``."""
        tmp = "".join(
            "{ " + str(cluster) + " }, " for cluster in self.dfDistance.columns
        )
        print("Hierarchy: ", tmp, end="\n\n")
        with open("result.txt", "a") as file:
            file.write(tmp + "\n")
Пример #7
0
 def __init__(self, df, kmeans):
     """Keep the dataset and model, then precompute the distance matrix.

     Args:
         df: Dataset to evaluate.
         kmeans: Model object whose ``centroids`` attribute is stored as
             ``self.clusters``.
     """
     self.df = df
     self.kmeans = kmeans
     self.distanceFinder = DistanceFinder()
     self.clusters = kmeans.centroids
     # Kept last: the helper presumably reads the attributes set above.
     self.__distanceMatrix = self.__getDistanceMatrix()