Example #1
def SLK_iterative(X,
                  sigma,
                  K,
                  W,
                  bound_=False,
                  method='KM',
                  C_init="kmeans_plus",
                  **opt):
    """ 
    Proposed SLK method with iterative mode updates
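    Parameters (as used in the body):
        X       : (N, D) data matrix.
        sigma   : Gaussian kernel bandwidth of the unary term.
        K       : number of clusters.
        W       : pairwise affinity matrix for the Laplacian term.
        bound_  : if True, run the bound update; opt must then contain
                  'bound_lambda' and 'bound_iterations'.
        method  : one of 'MS', 'KM', 'BO', 'Means'.
        C_init  : initialization scheme passed to km_init.

    Returns:
        C, l, elapsed, mode_index, z, bound_E, trivial_status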
    
    """

    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    trivial_status = False
    N, D = X.shape
    z = []
    bound_E = []
    mode_index = []
    oldE = 1e100
    # bound_lambda is also read by the energy computation below, even when the
    # bound update is disabled (mirrors the SLK convention of 0.0 in that case)
    bound_lambda = opt['bound_lambda'] if bound_ else 0.0
    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        if method == 'MS':
            print('Inside meanshift update . ... ........................')
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ] = MS(X, sigma, tmp, C[[k], ], 1e-5, int(1e3))

            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary
        elif method == 'KM':
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        elif method == 'Means':
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ] = X[tmp, :].mean(axis=0)
            mode_index = None
            a_p = ecdist(X, C, squared=True)

        elif method == 'BO' and i == 0:
            print('Inside SLK-BO')
            mode_tmp = []
            for k in range(C.shape[0]):
                tmp = np.asarray(np.where(l == k))
                if tmp.size != 1:
                    tmp = tmp.squeeze()
                else:
                    tmp = tmp[0]
                C[[k], ], m = KM(X, tmp, sigma)
                mode_tmp.append(m)
            mode_index = mode_tmp[:]
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        elif method not in ['MS', 'KM', 'BO', 'Means']:
            print('Error: method must be one of MS/KM/BO/Means')
            sys.exit(1)

        if bound_ == True:
            bound_lambda = opt['bound_lambda']
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: rely on numpy's automatic BLAS/LAPACK/MKL parallelism
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, manual_parallel)

            if (len(np.unique(l)) != K):
                print('Some clusters became empty; reverting to the previous iteration')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break

        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes Energy

        # currentE = compute_energy_lapkmode(X,C,l,W,sigma,bound_lambda)  # Discrete
        currentE = compute_energy_lapkmode_cont(X,
                                                C,
                                                z,
                                                W,
                                                sigma,
                                                bound_lambda,
                                                method=method)  # continuous
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence based on mode change
        # if np.linalg.norm(C-oldC,'fro') < tol*np.linalg.norm(oldC,'fro'):
        #   print('......Job  done......')
        #   break

        # Convergence based on Laplacian K-modes Energy
        if (i > 1 and (abs(currentE - oldE) <= 1e-5 * abs(oldE))):
            print('......Job  done......')
            break

        else:
            oldE = currentE.copy()

    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status
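
A minimal driver sketch for SLK_iterative (not part of the source). It assumes the function and its helpers are importable from the module that defines them; the synthetic data, the kNN affinity construction for W, and all parameter values below are illustrative assumptions only.

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph
# from SLK import SLK_iterative   # assumed import path, adjust to the actual module

X, _ = make_blobs(n_samples=500, centers=3, n_features=2, random_state=0)
K, sigma = 3, 1.0                                   # clusters / kernel bandwidth (assumed values)
W = kneighbors_graph(X, n_neighbors=10, include_self=False)
W = 0.5 * (W + W.T)                                 # symmetrize the kNN affinity

C, l, elapsed, mode_index, z, bound_E, trivial = SLK_iterative(
    X, sigma, K, W,
    bound_=True, method='KM',
    bound_lambda=0.1, bound_iterations=200)         # collected into **opt
print('labels found:', np.unique(l), 'time: {:.2f}s'.format(elapsed))
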
Example #2
def SLK(X,
        sigma,
        K,
        W,
        bound_=False,
        method='MS',
        C_init="kmeans_plus",
        bound_lambda=1.0,
        **opt):
    """
    Proposed SLK method with mode updates in parallel
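    Parameters (as used in the body):
        X            : (N, D) data matrix.
        sigma        : Gaussian kernel bandwidth of the unary term.
        K            : number of clusters; per-cluster mode updates run in a
                       multiprocessing pool.
        W            : pairwise affinity matrix for the Laplacian term.
        bound_       : if True, run the bound update; opt must then contain
                       'bound_iterations'.
        method       : one of 'MS', 'KM', 'BO', 'Means'.
        C_init       : initialization scheme passed to km_init.
        bound_lambda : Laplacian trade-off weight (forced to 0.0 when
                       bound_ is False).

    Returns:
        C, l, elapsed, mode_index, z, bound_E, trivial_status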

    """
    if bound_ == False:
        bound_lambda = 0.0

    start_time = timeit.default_timer()
    print('Inside sigma = ' + repr(sigma))
    C, l = km_init(X, K, C_init)
    assert len(np.unique(l)) == K
    N, D = X.shape
    mode_index = []
    krange = list(range(K))
    srange = [sigma] * K
    trivial_status = False
    z = []
    bound_E = []
    bound.init(X_s=X)
    bound.init(C_out=bound.new_shared_array([K, D], C.dtype))
    oldE = 1e100
    # pdb.set_trace()
    for i in range(100):
        oldC = C.copy()
        oldl = l.copy()
        oldmode_index = mode_index
        bound.init(C_s=bound.n2m(C))
        bound.init(l_s=bound.n2m(l))

        if K < 5:
            pool = multiprocessing.Pool(processes=K)
        else:
            pool = multiprocessing.Pool(processes=5)

        if method == 'MS':
            print('Inside meanshift update . ... ........................')
            pool.map(MS_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        elif method == 'KM':
            mode_index = pool.map(KM_par, zip(srange, krange))
            _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        elif method == 'BO':
            print('Inside SLK-BO')
            if i == 0:
                mode_index = pool.map(KM_par, zip(srange, krange))
                _, C = bound.get_shared_arrays('l_s', 'C_out')
            sqdist = ecdist(X, C, squared=True)
            unary = np.exp((-sqdist) / (2 * sigma**2))
            a_p = -unary

        elif method == 'Means':
            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            # pdb.set_trace()
            if bound_ == True and i > 0:

                C_list = [kmeans_update_soft(z[:, k]) for k in range(K)]
            else:
                C_list = pool.map(kmeans_update, tmp_list)

            C = np.asarray(np.vstack(C_list))
            a_p = ecdist(X, C, squared=True)

        elif method not in ['MS', 'KM', 'BO', 'Means']:
            print('Error: method must be one of MS/KM/BO/Means')
            sys.exit(1)

        pool.close()
        pool.join()
        pool.terminate()

        if bound_ == True:
            bound_iterations = opt['bound_iterations']
            manual_parallel = False  # False: rely on numpy's automatic BLAS/LAPACK/MKL parallelism
            batch = False
            if X.shape[0] > 100000:
                batch = True
            if method == 'BO':
                l, C, mode_index, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)
            else:
                l, _, _, z, bound_E = bound.bound_update(
                    a_p, X, W, bound_lambda, bound_iterations, batch,
                    manual_parallel)

            if (len(np.unique(l)) != K):
                print('Some clusters became empty; reverting to the previous iteration')
                trivial_status = True
                l = oldl.copy()
                C = oldC.copy()
                mode_index = oldmode_index
                break

        else:
            if method in ['BO', 'MS']:
                l = km_le(X, C, 'gp', sigma)
            else:
                l = km_le(X, C, None, None)
            z = bound.get_S_discrete(l, N, K)

        # Laplacian K-modes Energy

        # currentE = compute_energy_lapkmode(X,C,l,W,sigma,bound_lambda)  # Discrete
        currentE = compute_energy_lapkmode_cont(X,
                                                C,
                                                z,
                                                W,
                                                sigma,
                                                bound_lambda,
                                                method=method)  # continuous
        print('Laplacian K-mode Energy is = {:.5f}'.format(currentE))

        # Convergence based on mode change
        # if np.linalg.norm(C-oldC,'fro') < 1e-4*np.linalg.norm(oldC,'fro'):
        #   print('......Job  done......')
        #   break

        # Convergence based on Laplacian K-modes Energy
        if (i > 1 and (abs(currentE - oldE) <= 1e-5 * abs(oldE))):
            print('......Job  done......')
            break

        else:
            oldE = currentE.copy()

    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    return C, l, elapsed, mode_index, z, bound_E, trivial_status
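
Because SLK runs its per-cluster mode updates in a multiprocessing.Pool and registers shared arrays via bound.init, a driver script should be guarded by if __name__ == '__main__' so worker processes do not re-execute it on spawn-based platforms. A hedged sketch, assuming the same import path and kNN-affinity format as in the previous example:

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph
# from SLK import SLK   # assumed import path, adjust to the actual module

def build_affinity(X, knn=10):
    # symmetric kNN connectivity graph used as the Laplacian affinity W (assumption)
    W = kneighbors_graph(X, n_neighbors=knn, include_self=False)
    return 0.5 * (W + W.T)

if __name__ == '__main__':
    X, _ = make_blobs(n_samples=2000, centers=5, random_state=1)
    W = build_affinity(X)
    C, l, elapsed, modes, z, bound_E, trivial = SLK(
        X, sigma=1.0, K=5, W=W,
        bound_=True, method='MS',
        bound_lambda=0.5, bound_iterations=200)     # bound_iterations goes through **opt
    print('converged in {:.2f}s, trivial_status={}'.format(elapsed, trivial))
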
Example #3
def fair_clustering(X,
                    K,
                    u_V,
                    V_list,
                    lmbda,
                    fairness=False,
                    method='kmeans',
                    C_init="kmeans_plus",
                    l_init=None,
                    A=None):
    """ 
    
    Proposed fairness clustering method
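    Parameters (as used in the body):
        X        : (N, D) data matrix.
        K        : number of clusters.
        u_V      : list of demographic group proportions.
        V_list   : list of boolean indicator vectors, one per group.
        lmbda    : fairness trade-off weight.
        fairness : if True (and lmbda != 0), run the fairness bound update.
        method   : one of 'kmeans', 'kmedian', 'ncut'.
        C_init   : initialization scheme passed to km_init.
        l_init   : optional initial labeling passed to km_init.
        A        : affinity matrix, required when method == 'ncut'.

    Returns:
        C, l, elapsed, S, E   (E is a dict of per-iteration energy traces)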
    
    """
    N, D = X.shape
    start_time = timeit.default_timer()
    C, l = km_init(X, K, C_init, l_init=l_init)
    assert len(np.unique(l)) == K
    ts = 0
    S = []
    E_org = []
    E_cluster = []
    E_fair = []
    E_cluster_discrete = []
    fairness_error = 0.0
    oldE = 1e100

    maxiter = 100
    X_s = utils.init(X_s=X)
    pool = multiprocessing.Pool(processes=20)
    if A is not None:
        d = A.sum(axis=1)

    for i in range(maxiter):
        oldC = C.copy()
        oldl = l.copy()
        oldS = S.copy()

        if i == 0:
            if method == 'kmeans':
                sqdist = ecdist(X, C, squared=True)
                a_p = sqdist.copy()

            if method == 'kmedian':
                sqdist = ecdist(X, C)
                a_p = sqdist.copy()

            if method == 'ncut':
                S = get_S_discrete(l, N, K)
                sqdist_list = [
                    KernelBound_k(A, d, S[:, k], N) for k in range(K)
                ]
                sqdist = np.asarray(np.vstack(sqdist_list).T)
                a_p = sqdist.copy()

        elif method == 'kmeans':

            print('Inside k-means update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmeans_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C, squared=True)
            a_p = sqdist.copy()

        elif method == 'kmedian':

            print('Inside k-median update')
            tmp_list = [np.where(l == k)[0] for k in range(K)]
            C_list = pool.map(kmedian_update, tmp_list)
            C = np.asarray(np.vstack(C_list))
            sqdist = ecdist(X, C)
            a_p = sqdist.copy()

        elif method == 'ncut':
            print('Inside ncut update')
            S = get_S_discrete(l, N, K)
            sqdist_list = [KernelBound_k(A, d, S[:, k], N) for k in range(K)]
            sqdist = np.asarray(np.vstack(sqdist_list).T)
            a_p = sqdist.copy()

        if fairness == True and lmbda != 0.0:

            l_check = a_p.argmin(axis=1)

            # Check for empty cluster
            if (len(np.unique(l_check)) != K):
                l, C, S, trivial_status = restore_nonempty_cluster(
                    X, K, oldl, oldC, oldS, ts)
                ts = ts + 1
                if trivial_status:
                    break

            bound_iterations = 5000

            l, S, bound_E = bound_update(a_p, u_V, V_list, lmbda,
                                         bound_iterations)
            fairness_error = get_fair_accuracy_proportional(
                u_V, V_list, l, N, K)
            print('fairness_error = {:0.4f}'.format(fairness_error))

        else:

            if method == 'ncut':
                l = a_p.argmin(axis=1)
                S = get_S_discrete(l, N, K)

            else:
                S = get_S_discrete(l, N, K)
                l = km_le(X, C)

        currentE, clusterE, fairE, clusterE_discrete = compute_energy_fair_clustering(
            X, C, l, S, u_V, V_list, lmbda, A=A, method_cl=method)
        E_org.append(currentE)
        E_cluster.append(clusterE)
        E_fair.append(fairE)
        E_cluster_discrete.append(clusterE_discrete)

        if (len(np.unique(l)) != K) or math.isnan(fairness_error):
            l, C, S, trivial_status = restore_nonempty_cluster(
                X, K, oldl, oldC, oldS, ts)
            ts = ts + 1
            if trivial_status:
                break

        if (i > 1 and (abs(currentE - oldE) <= 1e-4 * abs(oldE))):
            print('......Job  done......')
            break

        else:
            oldE = currentE.copy()

    pool.close()
    pool.join()
    pool.terminate()
    elapsed = timeit.default_timer() - start_time
    print(elapsed)
    E = {
        'fair_cluster_E': E_org,
        'fair_E': E_fair,
        'cluster_E': E_cluster,
        'cluster_E_discrete': E_cluster_discrete
    }
    return C, l, elapsed, S, E
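
An illustrative call sketch (not from the source). V_list and u_V are built exactly as in the snippet that follows: one boolean indicator vector per demographic group and its proportion of the data. The synthetic data, group assignment, and lambda value are assumptions; the __main__ guard is advisable because fair_clustering opens a multiprocessing pool internally.

import numpy as np
from sklearn.datasets import make_blobs
# from fair_clustering import fair_clustering   # assumed import path, adjust to the actual module

if __name__ == '__main__':
    X, _ = make_blobs(n_samples=1000, centers=4, random_state=0)
    N = X.shape[0]
    demograph = np.random.randint(0, 2, size=N)            # two synthetic groups
    V_list = [np.array(demograph == j) for j in np.unique(demograph)]
    u_V = [v.sum() / N for v in V_list]                    # group proportions
    C, l, elapsed, S, E = fair_clustering(
        X, 4, u_V, V_list, lmbda=10.0,
        fairness=True, method='kmeans')
    print('fair-clustering energy trace:', E['fair_cluster_E'])
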
N = X.shape[0]
V_list = [np.array(demograph == j) for j in np.unique(demograph)]
V_sum = [x.sum() for x in V_list]
J = len(V_sum)
u_V = [x / N for x in V_sum]

# Balance and fairness error of the current labeling
balance, _ = get_fair_accuracy(u_V, V_list, l, N, K)
fairness_error = get_fair_accuracy_proportional(u_V, V_list, l, N, K)
method_cl = 'ncut'
S = get_S_discrete(l, N, K)
filename = osp.join(data_dir, dataset + '_affinity_ncut_final.mat')
# filename = osp.join(data_dir, dataset + '_affinity_ncut.mat')
A = sio.loadmat(filename)['A']
currentE, clusterE, fairE, clusterE_discrete = compute_energy_fair_clustering(
    X, [], l, S, u_V, V_list, 0, A=A, method_cl=method_cl)

## Ours
cluster_option = 'kmeans'
data_dir = 'data'
dataset = 'Adult'
output_path = 'outputs'
savefile = osp.join(data_dir,'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(cluster_option,dataset))
# plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas, fairness_error_set, E_cluster_set)
data = np.load(savefile)
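
The commented-out plot_fairness_vs_clusterE call above suggests the .npz stores the fairness errors and discrete cluster energies collected over a sweep of lambda values, but the exact array names are not shown here, so listing the keys first is the safe way to inspect the file (a minimal sketch):

print(data.files)                                  # array names saved by np.savez
for name in data.files:
    print(name, np.asarray(data[name]).shape)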