def populate(self):
    """Label every point that was not part of the k-means sample.

    The labels found for the sampled points (``self.sampleIdx``) are first
    copied into the full label array ``self.idx``. Each remaining point is
    then assigned greedily: it goes to the nearest centroid among the
    clusters that still have fewer than ``self.minVolume`` members, or, once
    no cluster has a free spot, to the nearest centroid overall.

    Side effects:
        Writes ``self.idx``. Prints ``"Warning"`` for a point for which no
        centroid could be selected (e.g. there are no clusters at all).
    """
    n_clusters = self.n_cluster

    # Map the sample clustering result back to positions in the full data.
    for i in range(len(self.sampleIdx)):
        self.idx[self.m_sample_order_idx[i]] = self.sampleIdx[i]

    # Points not used by k-means, visited in ascending index order so the
    # greedy filling below does not depend on set iteration order.
    # Duplicate entries in the sample index list are tolerated.
    sampled = set(self.m_sample_order_idx)
    unsampled = [i for i in range(self.dataSize) if i not in sampled]

    # Number of points each cluster still needs to reach minVolume.
    clusterList = super(balanced_kmeans, self).getClusters()
    availableSpot = [
        max(self.minVolume - len(clusterList[i]), 0)
        for i in range(n_clusters)
    ]
    centroids = super(balanced_kmeans, self).getCentroids()

    def nearest(instance, candidates):
        # Index of the closest centroid among `candidates`, or -1 if none
        # qualifies; ties keep the lowest index.
        best, best_dist = -1, float("inf")
        for k in candidates:
            dist = Calculator.calDistance(instance, centroids[k])
            if dist < best_dist:
                best, best_dist = k, dist
        return best

    for sampleIdx in unsampled:
        instance = self.m_order[sampleIdx]

        # Prefer clusters that are still below the minimum volume.
        open_clusters = [k for k in range(n_clusters) if availableSpot[k] > 0]
        clusterIdx = nearest(instance, open_clusters)
        if clusterIdx > -1:
            availableSpot[clusterIdx] -= 1
        else:
            # Every cluster is full enough: fall back to the plain nearest one.
            clusterIdx = nearest(instance, range(n_clusters))

        if clusterIdx > -1:
            self.idx[sampleIdx] = clusterIdx
        else:
            print("Warning")
def findNearestCentroid(self, instance, centroids):
    """Return the index of the centroid closest to ``instance``.

    Distances are measured with ``Calculator.calDistance``; on ties the
    lowest index wins.

    :param instance: feature vector of a single point
    :param centroids: sequence of centroid vectors
    :return: position in ``centroids`` of the nearest centroid
    :raises ValueError: if ``centroids`` is empty, or no distance compares
        below infinity (e.g. every distance is NaN)
    """
    if len(centroids) == 0:
        raise ValueError("centroids must not be empty")

    index = None
    minDist = float("inf")  # no finite cap, so far-away points still match
    for i, centroid in enumerate(centroids):
        distance = Calculator.calDistance(instance, centroid)
        if distance < minDist:
            minDist = distance
            index = i

    if index is None:
        raise ValueError("no centroid has a comparable distance to instance")
    return index
def execute(self):
    """Run the full balanced clustering pipeline and return its objective.

    Steps: the base-class ``execute()``, then ``populate()`` to label the
    remaining points, then ``refine()`` to move points between clusters.

    :return: value of ``Calculator.calObjective`` for the final labels in
        ``self.idx``
    """
    super(balanced_kmeans, self).execute()
    self.populate()
    self.refine()
    return Calculator.calObjective(self.quantityTopic, self.quantityInvoice,
                                   self.w_location, self.d_location,
                                   self.c_location, self.idx)
def refine(self):
    '''
    Refinement: move points from their current cluster to a nearer one
    while guaranteeing the balance constraint, in order to lower the
    total distance.

    Only clusters holding more than ``self.minVolume`` points give points
    away, and a cluster stops giving as soon as its count drops to
    ``self.minVolume``. After each pass the clusters and centroids are
    rebuilt; the loop ends when the norm of the centroid change is at most
    ``epsilon`` (0.1).

    input (read from self):
        m_order: provides the features of the points
        idx / getClusters(): the points stored in each cluster
        getCentroids(): provides the representatives of the clusters
    output (written to self):
        idx: new cluster label of every point after refining
        centroids: centroids used in the last pass

    Prints the objective value after every pass.
    '''
    epsilon = 0.1
    n_clusters = self.n_cluster
    m_orders = self.m_order
    minVolume = self.minVolume
    centroids = super(balanced_kmeans, self).getCentroids()
    clusterList = self.getClusters()
    while True:
        numCluster = [len(clusterList[i]) for i in range(n_clusters)]
        for clusterIdx in range(n_clusters):
            if numCluster[clusterIdx] <= minVolume:
                continue
            for j in range(numCluster[clusterIdx]):
                # numCluster may have grown from moves made earlier in this
                # pass while clusterList is still the snapshot taken before
                # the pass, so stop at the end of the snapshot.
                if j >= len(clusterList[clusterIdx]):
                    break
                instanceIdx = clusterList[clusterIdx][j]
                bestIndex = self.findNearestCentroid(
                    m_orders[instanceIdx], centroids)
                if clusterIdx != bestIndex:
                    self.idx[instanceIdx] = bestIndex
                    numCluster[bestIndex] += 1
                    numCluster[clusterIdx] -= 1
                    # Never shrink a cluster below the minimum volume.
                    if numCluster[clusterIdx] <= minVolume:
                        break
        # Rebuild the membership lists from the updated labels.
        clusterList = self.getClusters()
        newCentroids = self.updateCentroids_Refine(m_orders, clusterList)
        print(
            'Objective value after refining:',
            Calculator.calObjective(self.quantityTopic,
                                    self.quantityInvoice, self.w_location,
                                    self.d_location, self.c_location,
                                    self.idx))
        if np.linalg.norm(centroids - newCentroids) <= epsilon:
            break
        centroids = newCentroids
    self.centroids = centroids
def assignCluster(self):
    '''
    Assign each sampled instance to its nearest centroid.

    For every sample the distance to each of the ``self.n_cluster``
    centroids is computed with ``Calculator.calDistance``; the index of the
    closest one is stored in ``self.sampleIdx`` (ties keep the lowest
    index).

    :param None
    :return distortionValue: sum of the squared distances between every
        sample and the centroid it was assigned to
    '''
    distortion = np.zeros(self.sampleSize)
    for i in range(self.sampleSize):
        sample = self.m_sample_orders[i]
        best_k = None
        # Infinite sentinel: a sample is labeled no matter how far away the
        # centroids are.
        min_distance = float("inf")
        for k in range(self.n_cluster):
            d = Calculator.calDistance(sample, self.centroids[k])
            if d < min_distance:
                min_distance = d
                best_k = k
        if best_k is not None:
            distortion[i] = min_distance**2
            self.sampleIdx[i] = best_k
    distortionValue = sum(distortion)
    return distortionValue
def getObj(self):
    """Return the total distance of the sampled points to their centroids.

    For each of the first ``self.sampleSize`` samples, the distance (via
    ``Calculator.calDistance``) between the sample and the centroid selected
    by ``self.sampleIdx`` is added up.
    """
    return sum(
        Calculator.calDistance(self.m_sample_orders[pos],
                               self.centroids[self.sampleIdx[pos]])
        for pos in range(self.sampleSize)
    )
def total_obj(self):
    """Return the total distance of all points to their centroids.

    For each of the ``self.dataSize`` points, the distance (via
    ``Calculator.calDistance``) between the point in ``self.m_order`` and
    the centroid selected by ``self.idx`` is added up.
    """
    return sum(
        Calculator.calDistance(self.m_order[pos],
                               self.centroids[self.idx[pos]])
        for pos in range(self.dataSize)
    )
dataSet = dataSet(fileName) m_order = dataSet.get_order() topicList = dataSet.get_topicList() d_location = dataSet.get_dLocation() w_location = dataSet.get_wLocation() c_location = dataSet.get_cLocation() nearWarehouse = dataSet.get_sortedDistanceIndex() skuKeeping = dataSet.get_skuKeeping() invoiceList = dataSet.get_quantity() _alpha = [i*0.01 for i in range(101)] for minVolume in [500,600,700,800]: _obj = [] for alpha in _alpha: Calculator.setAlpha(alpha) km = balanced_kmeans(m_order = m_order, quantityTopic = topicList, quantityInvoice = invoiceList, w_location = w_location, d_location = d_location, c_location = c_location, nearWarehouse = nearWarehouse, n_clusters = 20,minVolume = minVolume) objValue = km.execute() _obj.append(objValue) print('m =%d | α: %f Cost:%f' % (minVolume,alpha,objValue)) plt.plot(_alpha,_obj, label='m = %d' % minVolume) plt.xlim((0, 1)) plt.xlabel('α') plt.ylabel('Cost') plt.legend() plt.show() ''' clusterList = km.getClusters() centroidList = km.getCentroids() idx = km.getIdx()