def _cluster_indices(l, k):
    """Return the indices of points currently labeled ``k`` as a 1-D array.

    ``np.where`` yields a tuple, so ``np.asarray`` gives shape (1, n);
    ``squeeze`` flattens it, except for the single-index case where squeeze
    would produce a 0-d array — there we index the row instead.
    """
    tmp = np.asarray(np.where(l == k))
    if tmp.size != 1:
        tmp = tmp.squeeze()
    else:
        tmp = tmp[0]
    return tmp


def SLK_iterative(X, sigma, K, W, bound_=False, method='KM', C_init="kmeans_plus", **opt):
    """Proposed SLK method with iterative (sequential) mode updates.

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Data matrix.
    sigma : float
        Kernel bandwidth for the Gaussian unary terms.
    K : int
        Number of clusters.
    W : affinity matrix used by the Laplacian term (passed to bound update
        and energy computation).
    bound_ : bool
        If True, run the bound (relaxed) label update; requires
        ``opt['bound_lambda']`` and ``opt['bound_iterations']``.
    method : {'MS', 'KM', 'BO', 'Means'}
        Mode/center update rule (mean-shift, K-modes, SLK-BO, or k-means).
    C_init : str
        Initialization scheme handed to ``km_init``.

    Returns
    -------
    (C, l, elapsed, mode_index, z, bound_E, trivial_status)
    """
    # Fail fast on an unknown method instead of silently looping.
    # (Fixed: list previously read ['MS', 'BO', 'MS', 'Means'] — 'MS'
    # duplicated and 'KM' missing.)
    if method not in ['MS', 'KM', 'BO', 'Means']:
        print(' Error: Give appropriate method from MS/KM/BO/Means')
        sys.exit(1)

    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    trivial_status = False
    N, D = X.shape
    z = []
    bound_E = []
    mode_index = []
    oldE = 1e100
    # Fixed: previously bound_lambda was only assigned inside the
    # ``bound_`` branch, so bound_=False crashed with NameError when the
    # energy was computed. Default to 0.0, matching SLK()'s convention.
    bound_lambda = opt['bound_lambda'] if bound_ else 0.0

    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index

        if method == 'MS':
            print('Inside meanshift update . ... ........................')
            for k in range(C.shape[0]):
                tmp = _cluster_indices(l, k)
                C[[k], ] = MS(X, sigma, tmp, C[[k], ], 1e-5, int(1e3))
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'KM':
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = _cluster_indices(l, k)
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'Means':
            for k in range(C.shape[0]):
                tmp = _cluster_indices(l, k)
                C[[k], ] = X[tmp, :].mean(axis=0)
            mode_index = None
            a_p = ecdist(X, C, squared=True)
        elif method == 'BO' and i == 0:
            # SLK-BO only computes modes once; later iterations reuse a_p.
            print('Inside SLK-BO')
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = _cluster_indices(l, k)
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        if bound_ == True:
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: use auto numpy parallelization on BLAS/LAPACK/MKL
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, manual_parallel)
            if (len(np.unique(l)) != K):
                # A cluster emptied out: roll back to the previous state and stop.
                print('not having some labels')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break
        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes energy (continuous form).
        currentE = compute_energy_lapkmode_cont(
            X, C, z, W, sigma, bound_lambda, method=method)
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence on relative energy change.
        if (i > 1 and (abs(currentE - oldE) <= 1e-5 * abs(oldE))):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()

    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status
def SLK(X, sigma, K, W, bound_=False, method='MS', C_init="kmeans_plus", bound_lambda=1.0, **opt):
    """Proposed SLK method with mode updates in parallel.

    Same contract as ``SLK_iterative`` but the per-cluster mode updates run
    in a ``multiprocessing.Pool`` via shared arrays (``bound.init`` /
    ``bound.get_shared_arrays``).

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Data matrix.
    sigma : float
        Kernel bandwidth.
    K : int
        Number of clusters.
    W : affinity matrix for the Laplacian term.
    bound_ : bool
        If True, use the bound label update; requires
        ``opt['bound_iterations']``.
    method : {'MS', 'KM', 'BO', 'Means'}
        Mode/center update rule.
    bound_lambda : float
        Laplacian trade-off weight; forced to 0.0 when ``bound_`` is False.

    Returns
    -------
    (C, l, elapsed, mode_index, z, bound_E, trivial_status)
    """
    # Fail fast on an unknown method BEFORE any pool is spawned.
    # (Fixed: validation previously ran inside the loop after pool creation,
    # leaking worker processes on sys.exit, and the list read
    # ['MS', 'BO', 'MS', 'Means'] — 'MS' duplicated, 'KM' missing.)
    if method not in ['MS', 'KM', 'BO', 'Means']:
        print(' Error: Give appropriate method from MS/KM/BO/Means')
        sys.exit(1)

    if bound_ == False:
        bound_lambda = 0.0
    start_time = timeit.default_timer()
    print('Inside sigma = ' + repr(sigma))
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    N, D = X.shape
    mode_index = []
    krange = list(range(K))
    srange = [sigma] * K
    trivial_status = False
    z = []
    bound_E = []
    # Publish X and an output buffer for the centers to the workers.
    bound.init(X_s=X)
    bound.init(C_out=bound.new_shared_array([K, D], C.dtype))
    oldE = 1e100

    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        # Refresh shared views of the current centers and labels.
        bound.init(C_s=bound.n2m(C))
        bound.init(l_s=bound.n2m(l))
        # Cap the pool at 5 workers.
        if K < 5:
            pool = multiprocessing.Pool(processes=K)
        else:
            pool = multiprocessing.Pool(processes=5)

        if method == 'MS':
            print('Inside meanshift update . ... ........................')
            pool.map(MS_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'KM':
            mode_index = pool.map(KM_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'BO':
            print('Inside SLK-BO')
            # SLK-BO computes modes only on the first iteration; a_p is
            # reused afterwards.
            if i == 0:
                mode_index = pool.map(KM_par, zip(srange, krange))
                _, C = bound.get_shared_arrays('l_s', 'C_out')
                sqdist = ecdist(X, C, squared=True)
                unary = np.exp((-sqdist) / (2 * sigma**2))
                a_p = -unary
        elif method == 'Means':
            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            if bound_ == True and i > 0:
                # Soft (weighted) mean from the relaxed assignments z.
                C_list = [kmeans_update_soft(z[:, k]) for k in range(K)]
            else:
                C_list = pool.map(kmeans_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            a_p = ecdist(X, C, squared=True)

        pool.close()
        pool.join()
        pool.terminate()

        if bound_ == True:
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: use auto numpy parallelization on BLAS/LAPACK/MKL
            batch = False
            if X.shape[0] > 100000:
                batch = True
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch, manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch, manual_parallel)
            if (len(np.unique(l)) != K):
                # A cluster emptied out: roll back and stop.
                print('not having some labels')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break
        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes energy (continuous form).
        currentE = compute_energy_lapkmode_cont(
            X, C, z, W, sigma, bound_lambda, method=method)
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence on relative energy change.
        if (i > 1 and (abs(currentE - oldE) <= 1e-5 * abs(oldE))):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()

    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status
def fair_clustering(X, K, u_V, V_list, lmbda, fairness=False, method='kmeans', C_init="kmeans_plus", l_init=None, A=None):
    """Proposed fairness clustering method.

    Alternates a center/assignment update for the chosen base clusterer
    ('kmeans', 'kmedian' or 'ncut') with a fairness-aware bound update,
    tracking the energy terms per iteration.

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Data matrix.
    K : int
        Number of clusters.
    u_V, V_list : demographic group proportions and membership masks.
    lmbda : float
        Fairness trade-off weight.
    fairness : bool
        Enable the fairness bound update (only when lmbda != 0).
    method : {'kmeans', 'kmedian', 'ncut'}
        Base clustering objective.
    C_init, l_init : initialization for centers / labels.
    A : affinity matrix; required for 'ncut' (row sums used as degrees).

    Returns
    -------
    (C, l, elapsed, S, E) where E is a dict of per-iteration energy traces.
    """
    N, D = X.shape
    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init, l_init=l_init)
    assert len(np.unique(l)) == K

    ts = 0  # counts restarts after empty-cluster restores
    S = []
    E_org = []
    E_cluster = []
    E_fair = []
    E_cluster_discrete = []
    fairness_error = 0.0
    oldE = 1e100
    maxiter = 100
    X_s = utils.init(X_s=X)  # publish X to worker processes (shared array)
    pool = multiprocessing.Pool(processes=20)
    if A is not None:
        d = A.sum(axis=1)  # node degrees for the ncut kernel bound

    for i in range(maxiter):
        oldC = C.copy()
        oldl = l.copy()
        oldS = S.copy()

        if i == 0:
            # First iteration: score the initial centers/labels as-is.
            if method == 'kmeans':
                sqdist = ecdist(X, C, squared=True)
                a_p = sqdist.copy()
            if method == 'kmedian':
                sqdist = ecdist(X, C)
                a_p = sqdist.copy()
            if method == 'ncut':
                S = get_S_discrete(l, N, K)
                sqdist_list = [KernelBound_k(A, d, S[:, k], N) for k in range(K)]
                sqdist = np.asarray(np.vstack(sqdist_list).T)
                a_p = sqdist.copy()
        elif method == 'kmeans':
            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmeans_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C, squared=True)
            a_p = sqdist.copy()
        elif method == 'kmedian':
            print('Inside k-median update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmedian_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C)
            a_p = sqdist.copy()
        elif method == 'ncut':
            print('Inside ncut update')
            S = get_S_discrete(l, N, K)
            sqdist_list = [KernelBound_k(A, d, S[:, k], N) for k in range(K)]
            sqdist = np.asarray(np.vstack(sqdist_list).T)
            a_p = sqdist.copy()

        if fairness == True and lmbda != 0.0:
            l_check = a_p.argmin(axis=1)
            # Check for empty cluster
            if (len(np.unique(l_check)) != K):
                l, C, S, trivial_status = restore_nonempty_cluster(
                    X, K, oldl, oldC, oldS, ts)
                ts = ts + 1
                if trivial_status:
                    break
            bound_iterations = 5000
            l, S, bound_E = bound_update(a_p, u_V, V_list, lmbda, bound_iterations)
            fairness_error = get_fair_accuracy_proportional(u_V, V_list, l, N, K)
            print('fairness_error = {:0.4f}'.format(fairness_error))
        else:
            if method == 'ncut':
                l = a_p.argmin(axis=1)
                S = get_S_discrete(l, N, K)
            else:
                # NOTE(review): S is built from the labels BEFORE km_le
                # reassigns them — looks deliberate, but worth confirming.
                S = get_S_discrete(l, N, K)
                l = km_le(X, C)

        currentE, clusterE, fairE, clusterE_discrete = compute_energy_fair_clustering(
            X, C, l, S, u_V, V_list, lmbda, A=A, method_cl=method)
        E_org.append(currentE)
        E_cluster.append(clusterE)
        E_fair.append(fairE)
        E_cluster_discrete.append(clusterE_discrete)

        # Restore from the previous state if labels collapsed or the
        # fairness error went NaN.
        if (len(np.unique(l)) != K) or math.isnan(fairness_error):
            l, C, S, trivial_status = restore_nonempty_cluster(
                X, K, oldl, oldC, oldS, ts)
            ts = ts + 1
            if trivial_status:
                break

        # Convergence on relative energy change.
        if (i > 1 and (abs(currentE - oldE) <= 1e-4 * abs(oldE))):
            print('......Job done......')
            break
        else:
            oldE = currentE.copy()

    pool.close()
    pool.join()
    pool.terminate()
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    E = {
        'fair_cluster_E': E_org,
        'fair_E': E_fair,
        'cluster_E': E_cluster,
        'cluster_E_discrete': E_cluster_discrete
    }
    return C, l, elapsed, S, E
# Script fragment: builds demographic group statistics, evaluates the
# fairness/energy of an existing labeling, then loads saved results.
# NOTE(review): relies on names defined elsewhere in the original script
# (X, demograph, l, K, data_dir, dataset, lmbdas, ...).
N = X.shape[0]
# One boolean membership mask per demographic value, plus group sizes
# and proportions u_V.
V_list = [np.array(demograph == j) for j in np.unique(demograph)]
V_sum = [x.sum() for x in V_list]
J = len(V_sum)
u_V = [x / N for x in V_sum]
# N,D = X_org.shape
# J = len(u_V)
#
# S = []
#
# C = []
#
# balance and Fairness error
balance, _ = get_fair_accuracy(u_V, V_list, l, N, K)
fairness_error = get_fair_accuracy_proportional(u_V, V_list, l, N, K)
#
#
#
method_cl = 'ncut'
S = get_S_discrete(l, N, K)
filename = osp.join(data_dir, dataset + '_affinity_ncut_final.mat')
# filename = osp.join(data_dir,dataset+'_affinity_ncut.mat')
A = sio.loadmat(filename)['A']
# bound_lambda = 1
currentE, clusterE, fairE, clusterE_discrete = compute_energy_fair_clustering(
    X, [], l, S, u_V, V_list, 0, A=A, method_cl=method_cl)

# ## Ours
cluster_option = 'kmeans'
data_dir = 'data'
dataset = 'Adult'
output_path = 'outputs'
savefile = osp.join(
    data_dir,
    'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(cluster_option, dataset))
# plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, E_cluster_set)
data = np.load(savefile)