import math
import sys
import timeit
import multiprocessing

import numpy as np
import numexpr as ne
from sklearn.metrics.pairwise import euclidean_distances as ecdist  # assumed alias

import bound  # project module: shared arrays and bound_update (defined elsewhere in the repo)
import utils  # project module: shared-array init for the fair-clustering updates
# Helpers such as km_init, MS_par, compute_km_energy, km_discrete_energy,
# Laplacian_term, fairness_term_V_j, get_S_discrete, KernelBound_k,
# kmeans_update, kmedian_update, normalize_2, ... are defined elsewhere
# in the repo.


def KM(X, tmp, s):
    """ Mode update: m_l = argmax_{x_p in X_l} sum_q k(x_p, x_q) """
    tmp_size = tmp.size
    size_limit = 25000  # Decrease in case of memory error
    if tmp_size > size_limit:
        # Batched approximation: kernel degrees are accumulated within each
        # batch only, to keep the pairwise-distance matrix in memory.
        batch_size = 1024
        Deg = []
        num_batch = int(math.ceil(1.0 * tmp_size / batch_size))
        for batch_idx in range(num_batch):
            start = batch_idx * batch_size
            end = min((batch_idx + 1) * batch_size, tmp_size)
            pairwise_dists = ecdist(X[tmp[start:end]], squared=True)
            W = np.exp(-pairwise_dists / (2 * (s**2)))
            np.fill_diagonal(W, 0)
            Deg_batch = np.sum(W, axis=1).tolist()
            # extend (not append) keeps Deg a flat list of per-point degrees;
            # appending whole lists would make the argmax below pick a batch
            # rather than a point
            Deg.extend(Deg_batch)
        m = max(Deg)
        ind = Deg.index(m)
        mode_index = tmp[ind]
        c1 = X[[tmp[ind]], :]
    else:
        pairwise_dists = ecdist(X[tmp, :], squared=True)
        W = np.exp(-pairwise_dists / (2 * (s**2)))
        np.fill_diagonal(W, 0)
        Deg = np.sum(W, axis=1)
        ind = np.argmax(Deg)
        mode_index = tmp[ind]
        c1 = X[[tmp[ind]], :]
    return c1, mode_index
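
# A minimal, self-contained check of the kernel-degree mode rule used by
# KM's unbatched branch, assuming only numpy; `_demo_kernel_mode` and its
# toy inputs are illustrative, not part of the original pipeline.
def _demo_kernel_mode():
    points = np.vstack([np.random.randn(50, 2),
                        np.random.randn(5, 2) + 6])
    sigma = 1.0
    # Squared Euclidean distances between all pairs of points.
    sq = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
    Wk = np.exp(-sq / (2 * sigma ** 2))
    np.fill_diagonal(Wk, 0)          # exclude self-similarity
    deg = Wk.sum(axis=1)             # kernel degree of each point
    print(points[np.argmax(deg)])    # the mode lands in the dense blob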


def estimate_sigma(X, W, knn, N):
    """ Estimate the kernel width sigma from the kNN affinity matrix W:
    sigma^2 = sum_{pq} W_pq ||x_p - x_q||^2 / (knn * N) """
    if N > 70000:
        # Accumulate the weighted sum of squared distances block by block
        # to avoid forming the full N x N distance matrix.
        batch_size = 4560
        num_batch = int(math.ceil(1.0 * X.shape[0] / batch_size))
        sigma_square = 0
        for batch_A in range(num_batch):
            start1 = batch_A * batch_size
            end1 = min((batch_A + 1) * batch_size, N)
            for batch_B in range(num_batch):
                start2 = batch_B * batch_size
                end2 = min((batch_B + 1) * batch_size, N)
                print("start1 = %d | start2 = %d" % (start1, start2))
                pairwise_dists = ecdist(X[start1:end1], X[start2:end2],
                                        squared=True)
                W_temp = W[start1:end1, :][:, start2:end2]
                sigma_square = sigma_square + (
                    W_temp.multiply(pairwise_dists)).sum()
                print(sigma_square)
        sigma_square = sigma_square / (knn * N)
        sigma = np.sqrt(sigma_square)
    else:
        pairwise_dists = ecdist(X, squared=True)
        sigma_square = W.multiply(pairwise_dists).sum()
        sigma_square = sigma_square / (knn * N)
        sigma = np.sqrt(sigma_square)
    return sigma
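
# Hypothetical usage of estimate_sigma: W is assumed to be a sparse kNN
# adjacency matrix (scipy.sparse, so that W.multiply exists), built here
# with sklearn's kneighbors_graph; the toy data are illustrative.
def _demo_estimate_sigma():
    from sklearn.neighbors import kneighbors_graph
    X = np.random.randn(500, 16).astype(np.float32)
    knn = 10
    W = kneighbors_graph(X, n_neighbors=knn, mode='connectivity',
                         include_self=False)
    sigma = estimate_sigma(X, W, knn, X.shape[0])
    print('estimated sigma = %.4f' % sigma)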


def KM_par(slices):
    """ Mode update m_l = argmax_{x_p in X_l} sum_q k(x_p, x_q),
    run in parallel for each cluster """
    s, k = slices
    print('Inside parallel with ' + repr(k) + ' and sigma ' + repr(s))
    l, C_out = bound.get_shared_arrays('l_s', 'C_out')
    # X = np.memmap('X_MNIST_gan.dat', dtype='float32', mode='c', shape=(70000, 256))
    X = bound.SHARED_VARS['X_s']
    tmp = np.asarray(np.where(l == k))
    tmp_size = tmp.size
    if tmp_size != 1:
        tmp = tmp.squeeze()
    else:
        tmp = tmp[0]
    # Using Gaussian filtering
    # s = 0.5
    size_limit = 25000
    if tmp_size > size_limit:
        batch_size = 1024
        Deg = []
        num_batch = int(math.ceil(1.0 * tmp_size / batch_size))
        for batch_idx in range(num_batch):
            start = batch_idx * batch_size
            end = min((batch_idx + 1) * batch_size, tmp_size)
            pairwise_dists = ecdist(X[tmp[start:end]], squared=True)
            W = np.exp(-pairwise_dists / (2 * (s**2)))
            np.fill_diagonal(W, 0)
            Deg_batch = np.sum(W, axis=1).tolist()
            Deg.extend(Deg_batch)  # keep a flat list of per-point degrees
        m = max(Deg)
        ind = Deg.index(m)
        mode_index = tmp[ind]
        C_out[[k], :] = X[[tmp[ind]], :]
    else:
        pairwise_dists = ecdist(X[tmp, :], squared=True)
        W = np.exp(-pairwise_dists / (2 * (s**2)))
        np.fill_diagonal(W, 0)
        Deg = np.sum(W, axis=1)
        ind = np.argmax(Deg)
        mode_index = tmp[ind]
        C_out[[k], :] = X[[tmp[ind]], :]
    return mode_index
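
# Illustrative sketch of the shared-array pattern that bound.init /
# bound.get_shared_arrays presumably wrap, so KM_par workers all read the
# same memory; the names SHARED/_share/_worker are hypothetical, and a
# fork start method (e.g. Linux) is assumed so children inherit SHARED.
SHARED = {}

def _share(name, arr):
    # Back a numpy array with a RawArray so forked workers share memory.
    raw = multiprocessing.RawArray('d', arr.size)
    shared = np.frombuffer(raw, dtype=np.float64).reshape(arr.shape)
    shared[...] = arr
    SHARED[name] = shared

def _worker(k):
    X = SHARED['X']  # same underlying buffer in every forked process
    return float(X[k].sum())

def _demo_shared_arrays():
    _share('X', np.arange(12, dtype=np.float64).reshape(4, 3))
    with multiprocessing.Pool(2) as pool:
        print(pool.map(_worker, range(4)))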


def compute_energy_fair_clustering(X, C, l, S, u_V, V_list, bound_lambda,
                                   A=None, method_cl='kmeans'):
    """ Compute the fair-clustering energy """
    print('compute energy')
    J = len(u_V)
    N, K = S.shape
    clustering_E_discrete = []
    if method_cl == 'kmeans':
        e_dist = ecdist(X, C, squared=True)
        clustering_E = ne.evaluate('S*e_dist').sum()
        clustering_E_discrete = [km_discrete_energy(e_dist, l, k)
                                 for k in range(K)]
        clustering_E_discrete = sum(clustering_E_discrete)
    elif method_cl == 'ncut':
        clustering_E = NormalizedCutEnergy(A, S, l)
        clustering_E_discrete = NormalizedCutEnergy_discrete(A, l)
    elif method_cl == 'kmedian':
        e_dist = ecdist(X, C)
        clustering_E = ne.evaluate('S*e_dist').sum()
        clustering_E_discrete = [km_discrete_energy(e_dist, l, k)
                                 for k in range(K)]
        clustering_E_discrete = sum(clustering_E_discrete)

    # Fairness term
    fairness_E = [fairness_term_V_j(u_V[j], S, V_list[j]) for j in range(J)]
    fairness_E = (bound_lambda * sum(fairness_E)).sum()

    E = clustering_E + fairness_E
    print('fair clustering energy = {}'.format(E))
    print('clustering energy = {}'.format(clustering_E_discrete))
    return E, clustering_E, fairness_E, clustering_E_discrete
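
# Illustrative sketch of a demographic-fairness penalty of the kind
# fairness_term_V_j (defined elsewhere in the repo) computes; the exact
# cross-entropy form below is an assumption, not the repo's definition.
def _demo_fairness_penalty(S, V_list, u_V, eps=1e-20):
    # S: (N, K) soft assignments; V_list[j]: boolean mask of group j;
    # u_V[j]: target proportion of group j within every cluster.
    penalty = 0.0
    cluster_mass = S.sum(axis=0) + eps  # total assignment mass per cluster
    for j, mask in enumerate(V_list):
        share_jk = S[mask].sum(axis=0) / cluster_mass  # group share per cluster
        penalty += -(u_V[j] * np.log(share_jk + eps)).sum()
    return penalty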


def compute_energy_lapkmode_cont(X, C, Z, W, sigma, bound_lambda,
                                 method='Means'):
    """ Compute the Laplacian K-modes energy (continuous relaxation) """
    e_dist = ecdist(X, C, squared=True)
    K = C.shape[0]
    if method == 'Means':
        clustering_E = (Z * e_dist).sum()
    elif method in ['KM', 'MS', 'BO']:
        g_dist = np.exp(-e_dist / (2 * sigma**2))
        clustering_E = (-(Z * g_dist)).sum()

    E_lap = [Laplacian_term(W, Z[:, k]) for k in range(K)]
    E_lap = (bound_lambda * sum(E_lap)).sum()

    E = clustering_E - E_lap
    return E
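
# Laplacian_term is defined elsewhere in the repo; judging by the explicit
# loop in the second compute_energy_lapkmode_cont below, it amounts to the
# quadratic form z_k^T W z_k per cluster. A minimal sketch of that term
# with a toy sparse affinity (scipy.sparse assumed):
def _demo_laplacian_term():
    import scipy.sparse as sp
    N, K = 6, 2
    Wt = sp.random(N, N, density=0.5, format='csr')
    Wt = Wt + Wt.T                          # symmetric affinity
    Z = np.random.rand(N, K)
    Z = Z / Z.sum(axis=1, keepdims=True)    # soft assignments, rows sum to 1
    lap = sum(Z[:, k] @ Wt.dot(Z[:, k]) for k in range(K))
    print('sum_k z_k^T W z_k =', lap)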


def compute_energy_lapkmode(X, C, l, W, sigma, bound_lambda):
    """ Compute the Laplacian K-modes energy in discrete form """
    e_dist = ecdist(X, C, squared=True)
    g_dist = np.exp(-e_dist / (2 * sigma**2))
    pairwise = 0
    Index_list = np.arange(X.shape[0])
    for k in range(C.shape[0]):
        tmp = np.asarray(np.where(l == k))
        if tmp.size != 1:
            tmp = tmp.squeeze()
        else:
            tmp = tmp[0]
        # print('length of tmp ', len(tmp))
        # pairwise = pairwise - W[tmp, :][:, tmp].sum()  # with Potts values -1/0
        nonmembers = np.in1d(Index_list, tmp, invert=True)
        # With Potts values 0/1
        pairwise = pairwise + W[tmp, :][:, nonmembers].sum()
    E_kmode = compute_km_energy(l, g_dist.T)
    print(E_kmode)
    print(pairwise)
    E = (bound_lambda) * pairwise + E_kmode
    return E
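
# Worked check of the 0/1 Potts pairwise term above, assuming only numpy:
# the accumulated quantity is the total affinity on between-cluster edges.
def _demo_potts_pairwise():
    W_toy = np.array([[0, 1, 0, 0],
                      [1, 0, 1, 0],
                      [0, 1, 0, 1],
                      [0, 0, 1, 0]], dtype=float)
    l_toy = np.array([0, 0, 1, 1])
    pairwise = 0.0
    for k in range(2):
        members = np.where(l_toy == k)[0]
        nonmembers = np.in1d(np.arange(4), members, invert=True)
        pairwise += W_toy[members, :][:, nonmembers].sum()
    print(pairwise)  # 2.0: the single cut edge (1,2), seen from both sides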


def km_le(X, M):
    """ Discretize the assignments based on the centers """
    e_dist = ecdist(X, M)
    l = e_dist.argmin(axis=1)
    return l


def MS(X, s, tmp, c0, tol, maxit):
    """ Mean-shift iterations until convergence """
    # print('inside meanshift iterations.')
    for i in range(maxit):
        Y = ecdist(c0, X[tmp, :], squared=True)
        W = np.exp((-Y) / (2 * s**2))
        c1 = np.dot(W, X[tmp, :]) / np.sum(W)
        if np.amax(np.absolute(c1 - c0)) < tol * np.amax(np.absolute(c0)):
            break
        else:
            c0 = c1.copy()
    return c1
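
# Toy check of the MS mean-shift update, assuming only numpy: iterating
# the Gaussian-weighted mean from a rough start converges to a point near
# the dominant blob's center; the toy data below are illustrative.
def _demo_mean_shift():
    rng = np.random.default_rng(0)
    X_toy = np.vstack([rng.normal(0, 0.3, (200, 2)),
                       rng.normal(5, 0.3, (20, 2))])
    tmp = np.arange(X_toy.shape[0])   # treat all points as one cluster
    c = MS(X_toy, 1.0, tmp, X_toy[[0], :], 1e-5, int(1e3))
    print(c)  # close to (0, 0), the mode of the dominant blob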


def km_le(X, M, assign, sigma):
    """ Discretize the assignments based on the centers; assign='gp'
    uses the Gaussian-kernel probabilities instead of plain distances.

    NOTE: a two-argument km_le is defined above; in the repo the two
    variants live in separate modules. """
    e_dist = ecdist(X, M)
    if assign == 'gp':
        g_dist = np.exp(-e_dist**2 / (2 * sigma**2))
        l = g_dist.argmax(axis=1)
        energy = compute_km_energy(l, g_dist.T)
        print('Energy of Kmode = ' + repr(energy))
    else:
        l = e_dist.argmin(axis=1)
    return l


def restore_nonempty_cluster(X, K, oldl, oldC, oldS, ts):
    """ Roll back to the previous solution, or re-seed, when a cluster
    becomes empty """
    ts_limit = 2
    C_init = 'kmeans'
    if ts > ts_limit:
        print('not having some labels')
        trivial_status = True
        l = oldl.copy()
        C = oldC.copy()
        S = oldS.copy()
    else:
        print('try with new seeds')
        C, l = km_init(X, K, C_init)
        sqdist = ecdist(X, C, squared=True)
        S = normalize_2(np.exp((-sqdist)))
        trivial_status = False
    return l, C, S, trivial_status


def compute_energy_lapkmode_cont(X, C, Q, W, sigma, bound_lambda):
    """ Compute the Laplacian K-modes energy (continuous relaxation)
    from soft assignments Q.

    NOTE: a variant with the same name but a `method` argument is defined
    above; in the repo the two live in separate modules. """
    e_dist = ecdist(X, C, squared=True)
    g_dist = np.exp(-e_dist / (2 * sigma**2))
    pairwise = 0
    for k in range(C.shape[0]):
        Z_k = Q[:, k]
        pairwise = pairwise + np.dot(np.transpose(Z_k), W.dot(Z_k))
    E_kmode = (-(Q * g_dist)).sum()
    print(E_kmode)
    # E = E_kmode - (bound_lambda) * pairwise
    E_lap = (bound_lambda) * pairwise
    print(E_lap)
    E = E_kmode - E_lap
    return E


def SLK(X, sigma, K, W, bound_=False, method='MS', C_init="kmeans_plus",
        **opt):
    """ Proposed SLK method with mode updates in parallel """
    start_time = timeit.default_timer()
    print('Inside sigma = ' + repr(sigma))
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    D = C.shape[1]
    mode_index = []
    tol = 1e-3
    krange = list(range(K))
    srange = [sigma] * K
    trivial_status = False
    z = []
    bound_E = []
    bound.init(X_s=X)
    bound.init(C_out=bound.new_shared_array([K, D], C.dtype))
    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        bound.init(C_s=bound.n2m(C))
        bound.init(l_s=bound.n2m(l))
        pool = multiprocessing.Pool(processes=min(K, 5))
        if method == 'MS':
            print('Inside meanshift update ...')
            pool.map(MS_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
        elif method == 'KM':
            mode_index = pool.map(KM_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
        elif method == 'SLK-BO' and i == 0:
            print('Inside SLK-BO')
            mode_index = pool.map(KM_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
        elif method not in ['MS', 'KM', 'SLK-BO']:
            print(' Error: Give appropriate method from MS/KM/SLK-BO')
            sys.exit(1)
        pool.close()
        pool.join()
        pool.terminate()

        if bound_:
            bound_lambda = opt['bound_lambda']
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: rely on numpy's own BLAS/LAPACK/MKL parallelism
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            batch = X.shape[0] > 100000
            if method == 'SLK-BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    -unary, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    -unary, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            if len(np.unique(l)) != K:
                print('not having some labels')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break
        else:
            l = km_le(X, C, 'gp', sigma)

        # Convergence based on mode change
        if np.linalg.norm(C - oldC, 'fro') < tol * np.linalg.norm(oldC, 'fro'):
            print('......Job done......')
            break
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status
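
# Hypothetical driver for the SLK variant above; the toy data are
# illustrative, and bound_lambda / bound_iterations are forwarded through
# **opt as the function expects.
def _demo_slk():
    from sklearn.neighbors import kneighbors_graph
    X = np.random.randn(2000, 32).astype(np.float32)
    K, knn = 10, 15
    W = kneighbors_graph(X, n_neighbors=knn, mode='connectivity',
                         include_self=False)
    sigma = estimate_sigma(X, W, knn, X.shape[0])
    C, l, elapsed, modes, z, bound_E, trivial = SLK(
        X, sigma, K, W, bound_=True, method='KM',
        bound_lambda=0.1, bound_iterations=200)
    print('labels found:', np.unique(l))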


def SLK_iterative(X, sigma, K, W, bound_=False, method='KM',
                  C_init="kmeans_plus", **opt):
    """ Proposed SLK method with iterative mode updates """
    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    trivial_status = False
    N, D = X.shape
    z = []
    bound_E = []
    mode_index = []
    oldE = 1e100
    # The energy below needs bound_lambda even when bound_ is False;
    # default to 0.0 in that case, mirroring the SLK variant below.
    bound_lambda = opt['bound_lambda'] if bound_ else 0.0
    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        if method == 'MS':
            print('Inside meanshift update ...')
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ] = MS(X, sigma, tmp, C[[k], ], 1e-5, int(1e3))
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'KM':
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'Means':
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ] = X[tmp, :].mean(axis=0)
            mode_index = None
            a_p = ecdist(X, C, squared=True)
        elif method == 'BO' and i == 0:
            print('Inside SLK-BO')
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method not in ['MS', 'KM', 'BO', 'Means']:
            print(' Error: Give appropriate method from MS/KM/BO/Means')
            sys.exit(1)

        if bound_:
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: rely on numpy's own BLAS/LAPACK/MKL parallelism
            # Pass batch explicitly, matching bound_update's signature in
            # the parallel SLK above.
            batch = X.shape[0] > 100000
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            if len(np.unique(l)) != K:
                print('not having some labels')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break
        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes energy
        # currentE = compute_energy_lapkmode(X, C, l, W, sigma, bound_lambda)  # discrete
        currentE = compute_energy_lapkmode_cont(X, C, z, W, sigma,
                                                bound_lambda,
                                                method=method)  # continuous
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence based on mode change:
        # if np.linalg.norm(C - oldC, 'fro') < tol * np.linalg.norm(oldC, 'fro'):
        #     print('......Job done......')
        #     break

        # Convergence based on the Laplacian K-modes energy
        if i > 1 and abs(currentE - oldE) <= 1e-5 * abs(oldE):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status


def fair_clustering(X, K, u_V, V_list, lmbda, fairness=False, method='kmeans',
                    C_init="kmeans_plus", l_init=None, A=None):
    """ Proposed fair clustering method """
    N, D = X.shape
    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init, l_init=l_init)
    assert len(np.unique(l)) == K
    ts = 0
    S = []
    E_org = []
    E_cluster = []
    E_fair = []
    E_cluster_discrete = []
    fairness_error = 0.0
    oldE = 1e100
    maxiter = 100
    X_s = utils.init(X_s=X)
    pool = multiprocessing.Pool(processes=20)
    if A is not None:
        d = A.sum(axis=1)
    for i in range(maxiter):
        oldC = C.copy()
        oldl = l.copy()
        oldS = S.copy()
        if i == 0:
            if method == 'kmeans':
                sqdist = ecdist(X, C, squared=True)
                a_p = sqdist.copy()
            if method == 'kmedian':
                sqdist = ecdist(X, C)
                a_p = sqdist.copy()
            if method == 'ncut':
                S = get_S_discrete(l, N, K)
                sqdist_list = [KernelBound_k(A, d, S[:, k], N)
                               for k in range(K)]
                sqdist = np.asarray(np.vstack(sqdist_list).T)
                a_p = sqdist.copy()
        elif method == 'kmeans':
            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmeans_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C, squared=True)
            a_p = sqdist.copy()
        elif method == 'kmedian':
            print('Inside k-median update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmedian_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C)
            a_p = sqdist.copy()
        elif method == 'ncut':
            print('Inside ncut update')
            S = get_S_discrete(l, N, K)
            sqdist_list = [KernelBound_k(A, d, S[:, k], N) for k in range(K)]
            sqdist = np.asarray(np.vstack(sqdist_list).T)
            a_p = sqdist.copy()

        if fairness and lmbda != 0.0:
            l_check = a_p.argmin(axis=1)
            # Check for empty clusters
            if len(np.unique(l_check)) != K:
                l, C, S, trivial_status = restore_nonempty_cluster(
                    X, K, oldl, oldC, oldS, ts)
                ts = ts + 1
                if trivial_status:
                    break
            bound_iterations = 5000
            l, S, bound_E = bound_update(a_p, u_V, V_list, lmbda,
                                         bound_iterations)
            fairness_error = get_fair_accuracy_proportional(
                u_V, V_list, l, N, K)
            print('fairness_error = {:0.4f}'.format(fairness_error))
        else:
            if method == 'ncut':
                l = a_p.argmin(axis=1)
                S = get_S_discrete(l, N, K)
            else:
                # Recompute the labels from the updated centers first, so
                # that S matches l in the energy below (the original code
                # built S from the stale labels).
                l = km_le(X, C)
                S = get_S_discrete(l, N, K)

        currentE, clusterE, fairE, clusterE_discrete = (
            compute_energy_fair_clustering(X, C, l, S, u_V, V_list, lmbda,
                                           A=A, method_cl=method))
        E_org.append(currentE)
        E_cluster.append(clusterE)
        E_fair.append(fairE)
        E_cluster_discrete.append(clusterE_discrete)

        if (len(np.unique(l)) != K) or math.isnan(fairness_error):
            l, C, S, trivial_status = restore_nonempty_cluster(
                X, K, oldl, oldC, oldS, ts)
            ts = ts + 1
            if trivial_status:
                break

        if i > 1 and abs(currentE - oldE) <= 1e-4 * abs(oldE):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()

    pool.close()
    pool.join()
    pool.terminate()
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    E = {'fair_cluster_E': E_org,
         'fair_E': E_fair,
         'cluster_E': E_cluster,
         'cluster_E_discrete': E_cluster_discrete}
    return C, l, elapsed, S, E
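
# Hypothetical usage of fair_clustering: one plausible encoding is a
# boolean mask per demographic group in V_list and the groups' overall
# proportions in u_V; the repo's actual V_list format may differ.
def _demo_fair_clustering():
    N, K = 1000, 5
    X = np.random.randn(N, 8)
    group = np.random.randint(0, 2, N)        # e.g. a binary attribute
    V_list = [group == j for j in range(2)]   # one mask per group
    u_V = [mask.mean() for mask in V_list]    # target proportions
    C, l, elapsed, S, E = fair_clustering(X, K, u_V, V_list, lmbda=10.0,
                                          fairness=True, method='kmeans')
    print('final fair-clustering energy:', E['fair_cluster_E'][-1])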


def SLK(X, sigma, K, W, bound_=False, method='MS', C_init="kmeans_plus",
        bound_lambda=1.0, **opt):
    """ Proposed SLK method with mode updates in parallel; convergence is
    checked on the Laplacian K-modes energy.

    NOTE: a variant with the same name that converges on mode change is
    defined above; in the repo the two live in separate modules. """
    if not bound_:
        bound_lambda = 0.0
    start_time = timeit.default_timer()
    print('Inside sigma = ' + repr(sigma))
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    N, D = X.shape
    mode_index = []
    krange = list(range(K))
    srange = [sigma] * K
    trivial_status = False
    z = []
    bound_E = []
    bound.init(X_s=X)
    bound.init(C_out=bound.new_shared_array([K, D], C.dtype))
    oldE = 1e100
    # pdb.set_trace()
    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        bound.init(C_s=bound.n2m(C))
        bound.init(l_s=bound.n2m(l))
        pool = multiprocessing.Pool(processes=min(K, 5))
        if method == 'MS':
            print('Inside meanshift update ...')
            pool.map(MS_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'KM':
            mode_index = pool.map(KM_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'BO':
            print('Inside SLK-BO')
            if i == 0:
                mode_index = pool.map(KM_par, zip(srange, krange))
                _, C = bound.get_shared_arrays('l_s', 'C_out')
                sqdist = ecdist(X, C, squared=True)
                unary = np.exp((-sqdist) / (2 * sigma**2))
                a_p = -unary
        elif method == 'Means':
            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            # pdb.set_trace()
            if bound_ and i > 0:
                C_list = [kmeans_update_soft(z[:, k]) for k in range(K)]
            else:
                C_list = pool.map(kmeans_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            a_p = ecdist(X, C, squared=True)
        elif method not in ['MS', 'KM', 'BO', 'Means']:
            print(' Error: Give appropriate method from MS/KM/BO/Means')
            sys.exit(1)
        pool.close()
        pool.join()
        pool.terminate()

        if bound_:
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: rely on numpy's own BLAS/LAPACK/MKL parallelism
            batch = X.shape[0] > 100000
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            if len(np.unique(l)) != K:
                print('not having some labels')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break
        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes energy
        # currentE = compute_energy_lapkmode(X, C, l, W, sigma, bound_lambda)  # discrete
        currentE = compute_energy_lapkmode_cont(X, C, z, W, sigma,
                                                bound_lambda,
                                                method=method)  # continuous
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence based on mode change:
        # if np.linalg.norm(C - oldC, 'fro') < 1e-4 * np.linalg.norm(oldC, 'fro'):
        #     print('......Job done......')
        #     break

        # Convergence based on the Laplacian K-modes energy
        if i > 1 and abs(currentE - oldE) <= 1e-5 * abs(oldE):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status