def dbscan(self, dataSet):
    """Cluster *dataSet* with DBSCAN using epsilon ``self.e`` and ``self.minPts``.

    Points with at least ``self.minPts`` neighbours within distance ``self.e``
    seed a cluster, which is then expanded through density-reachable points.
    Points that never qualify are collected as noise.  Results are reported via
    ``compute_purity`` / ``compute_NMI`` and written to ``DBSCAN_<dataname>.csv``.

    Side effects: sets ``isAssignedToCluster = True`` on core points, prints a
    summary, and writes the output CSV.
    """

    def _is_assigned_to_cluster(point, clusters):
        # True if a point with the same (x, y) coordinates is already a member
        # of any existing cluster.  Defined once here — the original version
        # re-defined this helper on every iteration of the expansion loop.
        for cluster in clusters:
            for pt in cluster:
                if pt.x == point.x and pt.y == point.y:
                    return True
        return False

    clusters = []
    visited = set()
    noise = set()
    # Iterate over data points
    for i in range(len(dataSet)):
        point = dataSet[i]
        if point in visited:
            continue
        visited.add(point)
        N = []
        minPtsNeighbours = 0
        # check which point satisfies minPts condition
        for j in range(len(dataSet)):
            if i == j:
                continue
            pt = dataSet[j]
            dist = getEuclideanDist(point.x, point.y, pt.x, pt.y)
            if dist <= self.e:
                minPtsNeighbours += 1
                N.append(pt)
        if minPtsNeighbours >= self.minPts:
            # point is a core point: start a cluster and grow it through
            # every density-reachable neighbour (N grows during the loop).
            cluster = set()
            cluster.add(point)
            point.isAssignedToCluster = True
            j = 0
            while j < len(N):
                point1 = N[j]
                minPtsNeighbours1 = 0
                N1 = []
                if point1 not in visited:
                    visited.add(point1)
                    for l in range(len(dataSet)):
                        pt = dataSet[l]
                        dist = getEuclideanDist(point1.x, point1.y, pt.x, pt.y)
                        # NOTE(review): unlike the seed scan above, this scan
                        # does not skip point1 itself (dist == 0 <= e), so
                        # point1 counts as its own neighbour here — confirm
                        # this asymmetry is intended.
                        if dist <= self.e:
                            minPtsNeighbours1 += 1
                            N1.append(pt)
                    if minPtsNeighbours1 >= self.minPts:
                        # point1 is itself a core point: merge its neighbours
                        # into the expansion frontier (de-duplicated).
                        self.removeDuplicates(N, N1)
                # Add point1 to this cluster only if it is not yet a member of
                # any other cluster.
                if not _is_assigned_to_cluster(point1, clusters):
                    cluster.add(point1)
                j += 1
            # add cluster to the list of clusters
            clusters.append(cluster)
        else:
            # presumably provisional: a noise point may still be absorbed into
            # a cluster later via the expansion loop above.
            noise.add(point)
    # List clusters
    print("Number of clusters formed :" + str(len(clusters)))
    print("Noise points :" + str(len(noise)))
    # Calculate purity
    compute_purity(clusters, len(self.dataSet))
    compute_NMI(clusters, self.noOfLabels)
    DataPoints.writeToFile(noise, clusters, "DBSCAN_" + self.dataname + ".csv")
def GMM(self):
    """Fit a Gaussian mixture with ``self.K`` components via EM.

    Points are first assigned round-robin to K clusters to seed the per-cluster
    mean, standard deviation and covariance; E/M steps then iterate until the
    relative change in log-likelihood drops below 1e-6.  Each point is finally
    assigned to the component with the highest responsibility ``self.W[i][j]``.

    Side effects: (re)sets ``self.mean``, ``self.stdDev``, ``self.coVariance``
    and ``self.w``; prints a summary; writes ``GMM_<dataname>.csv``.
    """
    clusters = []
    # Per-cluster parameters over 2-D points:
    # mean/stdDev are [K][2]; coVariance is [K][2][2].
    self.mean = [[0.0 for y in range(2)] for x in range(self.K)]
    self.stdDev = [[0.0 for y in range(2)] for x in range(self.K)]
    self.coVariance = [[[0.0 for z in range(2)] for y in range(2)]
                       for x in range(self.K)]
    for _ in range(self.K):
        clusters.append(set())
    # Initially randomly assign points to clusters (round-robin)
    for i, point in enumerate(self.dataSet):
        clusters[i % self.K].add(point)
    # Initially assign equal prior weight for each cluster
    for m in range(self.K):
        self.w[m] = 1.0 / self.K
    # Get initial mean, std, covariance matrix from the seed partition
    DataPoints.getMean(clusters, self.mean)
    DataPoints.getStdDeviation(clusters, self.mean, self.stdDev)
    DataPoints.getCovariance(clusters, self.mean, self.stdDev, self.coVariance)
    length = 0
    while True:
        mle_old = self.Likelihood()
        self.Estep()
        self.Mstep()
        length += 1
        mle_new = self.Likelihood()
        # convergence condition: relative log-likelihood change below 1e-6.
        # NOTE(review): divides by abs(mle_old) — raises ZeroDivisionError if
        # the likelihood is ever exactly 0; confirm that cannot happen here.
        if abs(mle_new - mle_old) / abs(mle_old) < 0.000001:
            break
    print("Number of Iterations = " + str(length))
    print("\nAfter Calculations")
    print("Final mean = ")
    self.printArray(self.mean)
    print("\nFinal covariance = ")
    self.print3D(self.coVariance)
    # Assign points to cluster depending on max prob.
    for j in range(self.K):
        clusters[j] = set()
    for i, point in enumerate(self.dataSet):
        index = -1
        prob = 0.0
        for j in range(self.K):
            if self.W[i][j] > prob:
                index = j
                prob = self.W[i][j]
        # NOTE(review): if every responsibility is 0.0, index stays -1 and the
        # point lands in the last cluster — verify responsibilities are > 0.
        clusters[index].add(point)
    # Calculate purity and NMI
    compute_purity(clusters, len(self.dataSet))
    compute_NMI(clusters, self.K)
    # write clusters to file for plotting; `with` guarantees the file is
    # closed even if a write raises (original leaked the handle on error).
    with open("GMM_" + self.dataname + ".csv", "w") as f:
        for w in range(self.K):
            print("Cluster " + str(w) + " size :" + str(len(clusters[w])))
            for point in clusters[w]:
                f.write(str(point.x) + "," + str(point.y) + "," + str(w) + "\n")