def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          total_points=200,
                          num_experiments=100,
                          d=None):
    """Score five clustering methods on two Gaussians differing only in mean.

    For every ambient dimension ``D`` in ``dimensions`` the experiment is
    repeated ``num_experiments`` times. Both clusters use the identity
    covariance; the first is centred at the origin and the second is shifted
    by 0.7 along its first ``d`` coordinates. The two cluster sizes are drawn
    from a multinomial over ``total_points`` with equal probabilities.

    A falsy ``d`` (``None`` or 0) is replaced by ``dimensions[0]``.

    Returns an array of shape ``(num_experiments * len(dimensions), 6)``.
    Column 0 holds ``D``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.
    """
    n_clusters = 2
    mean_shift = 0.7
    d = d or dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    seeded = dict(init="k-means++", run_times=5)

    def euclidean(x, y):
        return np.linalg.norm(x - y)

    for block, D in enumerate(dimensions):
        for trial in range(num_experiments):
            # Sample the two-cluster data set and build its Gram matrix.
            shifted_mean = np.concatenate(
                (mean_shift * np.ones(d), np.zeros(D - d)))
            means = [np.zeros(D), shifted_mean]
            covariances = [np.eye(D), np.eye(D)]
            size_a, size_b = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal(means, covariances,
                                            [size_a, size_b])
            G = eclust.kernel_matrix(X, euclidean)

            # One row per (dimension, repetition), filled left to right.
            row = table[block * num_experiments + trial]
            row[0] = D
            row[1] = run_clustering.energy_hartigan(n_clusters, X, G, z,
                                                    **seeded)
            row[2] = run_clustering.energy_lloyd(n_clusters, X, G, z,
                                                 **seeded)
            row[3] = run_clustering.spectral(n_clusters, X, G, z, run_times=5)
            row[4] = run_clustering.kmeans(n_clusters, X, z, **seeded)
            row[5] = run_clustering.gmm(n_clusters, X, z,
                                        init="kmeans", run_times=5)

    return table
def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    """Benchmark clustering methods on a synthetic 2-D data set.

    Each experiment samples one data set, builds four kernel matrices
    (``rho_standard``, ``rho_half``, and ``rho_exp`` / ``rho_rbf`` with
    ``sigma = 1``) and records eight scores, in this order: energy Hartigan
    on each of the four kernels, spectral clustering on the RBF kernel,
    k-means, ``run_clustering.gmm``, and ``energy.metric.accuracy`` of
    ``energy.gmm.gmm``.

    Parameters
    ----------
    num_experiments : int
        Number of independent data sets to sample.
    run_times : int
        Forwarded as ``run_times`` to every ``run_clustering`` call.
    kind : str
        ``'cigars'`` samples two Gaussians with means ``[0, 0]`` and
        ``[6.5, 0]``, covariance ``diag(1, 20)`` and 200 points each
        (k-means++ initialisation). ``'2circles'`` and ``'3circles'`` call
        ``data.circles`` for two or three groups of 400 points each
        (random initialisation).

    Returns
    -------
    None
        For each of the recorded scores one line is printed with its mean
        and its standard error (``scipy.stats.sem``) over the experiments.
        Nothing is printed when ``num_experiments`` is 0.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three supported names.
    """
    table = []
    for _ in range(num_experiments):
        if kind == 'cigars':
            m1 = [0, 0]
            m2 = [6.5, 0]
            s1 = np.array([[1, 0], [0, 20]])
            s2 = np.array([[1, 0], [0, 20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            k = 3
            init = 'random'
        else:
            raise ValueError("Don't know which example to sample.")

        sigma = 1
        # All four kernels are built before any clustering run, so the order
        # of calls matches a straight-line version of this experiment.
        kernels = [
            eclust.kernel_matrix(X, rho_standard),
            eclust.kernel_matrix(X, rho_half),
            eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma)),
            eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma)),
        ]
        G_rbf = kernels[-1]

        this_experiment = [
            run_clustering.energy_hartigan(k, X, G, z, init=init,
                                           run_times=run_times)
            for G in kernels
        ]
        this_experiment.append(
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times))
        this_experiment.append(
            run_clustering.kmeans(k, X, z, init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times))
        this_experiment.append(
            energy.metric.accuracy(z, energy.gmm.gmm(k, X)))

        table.append(this_experiment)

    table = np.array(table)
    # With zero experiments the array is empty and 1-D: nothing to report.
    if table.ndim == 2:
        for col in range(table.shape[1]):
            scores = table[:, col]
            # Single-argument print gives the same output on Python 2 and 3.
            print("%s %s" % (scores.mean(), scipy.stats.sem(scores)))
def mnist(num_experiments=10, digits=[0,1,2], num_points=100, run_times=5):
    """Benchmark clustering methods on subsets of MNIST digits.

    The pickled data set is read from ``experiments_data/mnist.pkl.gz``.
    A sample drawn from its first (training) split is used only to set the
    kernel bandwidth ``sigma``: the square root of the mean squared pairwise
    distance over all ordered pairs, self-pairs included. Every experiment
    then draws a fresh sample from the second (validation) split and records
    seven scores: energy Hartigan on the ``rho_standard``, ``rho_half``,
    ``rho_exp`` and ``rho_rbf`` kernels, spectral clustering on the RBF
    kernel, k-means and GMM.

    Parameters
    ----------
    num_experiments : int
        Number of validation samples to cluster.
    digits : list
        Digit classes to sample; the number of clusters is ``len(digits)``.
        The default list is never modified by this function.
    num_points : int
        Forwarded to ``sample_digits``.
    run_times : int
        Forwarded as ``run_times`` to every ``run_clustering`` call.

    Returns
    -------
    None
        ``sigma`` and a blank line are printed first, then one line per
        method with the mean and standard error (``scipy.stats.sem``) of its
        scores. No per-method lines are printed when ``num_experiments``
        is 0.
    """
    k = len(digits)
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    # The ``with`` block closes the file even if loading fails.
    with gzip.open('experiments_data/mnist.pkl.gz', 'rb') as f:
        train_set, valid_set, _ = cPickle.load(f)
    train_images, train_labels = train_set
    images, labels = valid_set

    # Use the training split to compute sigma.
    X_train, z_train = sample_digits(digits, train_images, train_labels,
                                     num_points)
    n, _ = X_train.shape
    sigma = np.sqrt(sum([np.linalg.norm(X_train[i] - X_train[j])**2
                         for i in range(n) for j in range(n)]) / (n**2))
    # Single-argument print gives the same output on Python 2 and 3.
    print(sigma)
    print("")

    table = []
    init = 'k-means++'
    for _ in range(num_experiments):
        # Cluster a fresh sample from the validation split.
        X, z = sample_digits(digits, images, labels, num_points)

        # All four kernels are built before any clustering run.
        kernels = [
            eclust.kernel_matrix(X, rho_standard),
            eclust.kernel_matrix(X, rho_half),
            eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma)),
            eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma)),
        ]
        G_rbf = kernels[-1]

        this_experiment = [
            run_clustering.energy_hartigan(k, X, G, z, init=init,
                                           run_times=run_times)
            for G in kernels
        ]
        this_experiment.append(
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times))
        this_experiment.append(
            run_clustering.kmeans(k, X, z, init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times))

        table.append(this_experiment)

    table = np.array(table)
    # With zero experiments the array is empty and 1-D: nothing to report.
    if table.ndim == 2:
        for col in range(table.shape[1]):
            scores = table[:, col]
            print("%s %s" % (scores.mean(), scipy.stats.sem(scores)))
def gauss_dimensions_cov(
        dimensions=range(2, 100, 20), total_points=200, num_experiments=100,
        d=10):
    """High dimensions but with nontrivial covariance.

    For every ambient dimension ``D`` in ``dimensions`` the experiment is
    repeated ``num_experiments`` times. The first cluster is a standard
    Gaussian. The second has mean 1 on its first ``d`` coordinates and a
    diagonal covariance whose first ``d`` entries come from a fixed table of
    ten variances, with 1 everywhere else. Cluster sizes are drawn from a
    multinomial over ``total_points`` with equal probabilities.

    A falsy ``d`` (``None`` or 0) is replaced by ``dimensions[0]``. Only the
    first ``d`` tabulated variances are used, so the covariance always
    matches the mean in size.

    Returns an array of shape ``(num_experiments * len(dimensions), 6)``.
    Column 0 holds ``D``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.

    Raises
    ------
    ValueError
        If an experiment is attempted with ``d`` larger than ``D`` or larger
        than the number of tabulated variances (10). Note that the default
        ``dimensions`` start at 2 while the default ``d`` is 10, so callers
        must pass dimensions that are all at least ``d``.
    """
    k = 2
    if not d:
        d = dimensions[0]
    # Fixed variances for the informative coordinates (from uniform 1, 5).
    variances = np.array([
        1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
        3.637
    ])
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    def rho(x, y):
        return np.linalg.norm(x - y)

    for D in dimensions:
        for _ in range(num_experiments):
            # Fail with a clear message instead of numpy's "negative
            # dimensions" error or a mean/covariance size mismatch.
            if d > D or d > variances.size:
                raise ValueError(
                    "d=%s must not exceed the dimension D=%s nor the %s "
                    "tabulated variances" % (d, D, variances.size))

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            s2 = np.diag(np.concatenate((variances[:d], np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            # The run count and the initialization method can be changed.
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters.

    Two Gaussians in 4 dimensions are sampled with 300 points in total. The
    first is standard normal. The second has mean 1.5 and variance 0.5 on
    its first two coordinates. For each ``p`` in ``num_points`` the mixture
    weights are ``(N - p) / (2 N)`` and ``(N + p) / (2 N)`` with ``N = 300``,
    so larger ``p`` means a more unbalanced split. Each setting is repeated
    ``num_experiments`` times.

    Returns an array of shape ``(num_experiments * len(num_points), 6)``.
    Column 0 holds ``p``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.
    """
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    def rho(x, y):
        return np.linalg.norm(x - y)

    for p in num_points:
        # float() forces true division. With Python 2 integer operands,
        # (N - p) / N truncates to 0 for every p > 0, which silently empties
        # the first cluster.
        pi1 = float(N - p) / N / 2.
        pi2 = float(N + p) / N / 2.

        for _ in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)

            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2,100,20), total_points=200,
                          num_experiments=100, d=None):
    """Score five clustering methods on two Gaussians differing only in mean.

    For every ambient dimension ``D`` in ``dimensions`` the experiment is
    repeated ``num_experiments`` times. Both clusters use the identity
    covariance; the first is centred at the origin and the second is shifted
    by 0.7 along its first ``d`` coordinates. The two cluster sizes are drawn
    from a multinomial over ``total_points`` with equal probabilities.

    A falsy ``d`` (``None`` or 0) is replaced by ``dimensions[0]``.

    Returns an array of shape ``(num_experiments * len(dimensions), 6)``.
    Column 0 holds ``D``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.
    """
    n_clusters = 2
    mean_shift = 0.7
    d = d or dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    seeded = dict(init="k-means++", run_times=5)

    def euclidean(x, y):
        return np.linalg.norm(x - y)

    for block, D in enumerate(dimensions):
        for trial in range(num_experiments):
            # Sample the two-cluster data set and build its Gram matrix.
            shifted_mean = np.concatenate(
                (mean_shift * np.ones(d), np.zeros(D - d)))
            means = [np.zeros(D), shifted_mean]
            covariances = [np.eye(D), np.eye(D)]
            size_a, size_b = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal(means, covariances,
                                            [size_a, size_b])
            G = eclust.kernel_matrix(X, euclidean)

            # One row per (dimension, repetition), filled left to right.
            row = table[block * num_experiments + trial]
            row[0] = D
            row[1] = run_clustering.energy_hartigan(n_clusters, X, G, z,
                                                    **seeded)
            row[2] = run_clustering.energy_lloyd(n_clusters, X, G, z,
                                                 **seeded)
            row[3] = run_clustering.spectral(n_clusters, X, G, z, run_times=5)
            row[4] = run_clustering.kmeans(n_clusters, X, z, **seeded)
            row[5] = run_clustering.gmm(n_clusters, X, z,
                                        init="kmeans", run_times=5)

    return table
# Example #7
# 0
def normal_or_lognormal_difference(
        numpoints=range(10, 100, 10), num_experiments=100, kind='normal'):
    """Compare energy Hartigan against energy Lloyd and spectral clustering.

    For each sample size ``n`` in ``numpoints`` and each of
    ``num_experiments`` repetitions, two clusters are drawn in 20 dimensions:
    one with zero mean and covariance ``0.5 * I``, the other with mean 0.5 on
    its first 5 coordinates and identity covariance. ``kind == 'normal'``
    samples with ``data.multivariate_normal``; any other value uses
    ``data.multivariate_lognormal``. The kernel is built from
    ``2 - 2 * exp(-||x - y|| / 2)``.

    Returns an array with one row ``[n, hartigan - lloyd,
    hartigan - spectral]`` per experiment (an empty array when no experiment
    is run).
    """
    n_clusters = 2
    D, d = 20, 5
    energy_opts = dict(init="k-means++", run_times=5)

    def rho(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    rows = []
    for n in numpoints:
        for _ in range(num_experiments):
            # generate data
            means = [np.zeros(D),
                     0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))]
            covariances = [0.5 * np.eye(D), np.eye(D)]
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            sample = (data.multivariate_normal if kind == 'normal'
                      else data.multivariate_lognormal)
            X, z = sample(means, covariances, [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(n_clusters, X, G, z,
                                                  **energy_opts)
            lloyd = run_clustering.energy_lloyd(n_clusters, X, G, z,
                                                **energy_opts)
            spectral = run_clustering.spectral(n_clusters, X, G, z,
                                               run_times=5)
            rows.append([n, hart - lloyd, hart - spectral])

    return np.array(rows)
def gauss_dimensions_cov(dimensions=range(2,100,20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance.

    For every ambient dimension ``D`` in ``dimensions`` the experiment is
    repeated ``num_experiments`` times. The first cluster is a standard
    Gaussian. The second has mean 1 on its first ``d`` coordinates and a
    diagonal covariance whose first ``d`` entries come from a fixed table of
    ten variances, with 1 everywhere else. Cluster sizes are drawn from a
    multinomial over ``total_points`` with equal probabilities.

    A falsy ``d`` (``None`` or 0) is replaced by ``dimensions[0]``. Only the
    first ``d`` tabulated variances are used, so the covariance always
    matches the mean in size.

    Returns an array of shape ``(num_experiments * len(dimensions), 6)``.
    Column 0 holds ``D``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.

    Raises
    ------
    ValueError
        If an experiment is attempted with ``d`` larger than ``D`` or larger
        than the number of tabulated variances (10). Note that the default
        ``dimensions`` start at 2 while the default ``d`` is 10, so callers
        must pass dimensions that are all at least ``d``.
    """
    k = 2
    if not d:
        d = dimensions[0]
    # Fixed variances for the informative coordinates (from uniform 1, 5).
    variances = np.array([
        1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
        3.637
    ])
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    def rho(x, y):
        return np.linalg.norm(x - y)

    for D in dimensions:
        for _ in range(num_experiments):
            # Fail with a clear message instead of numpy's "negative
            # dimensions" error or a mean/covariance size mismatch.
            if d > D or d > variances.size:
                raise ValueError(
                    "d=%s must not exceed the dimension D=%s nor the %s "
                    "tabulated variances" % (d, D, variances.size))

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            s2 = np.diag(np.concatenate((variances[:d], np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            # The run count and the initialization method can be changed.
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters.

    Two Gaussians in 4 dimensions are sampled with 300 points in total. The
    first is standard normal. The second has mean 1.5 and variance 0.5 on
    its first two coordinates. For each ``p`` in ``num_points`` the mixture
    weights are ``(N - p) / (2 N)`` and ``(N + p) / (2 N)`` with ``N = 300``,
    so larger ``p`` means a more unbalanced split. Each setting is repeated
    ``num_experiments`` times.

    Returns an array of shape ``(num_experiments * len(num_points), 6)``.
    Column 0 holds ``p``; columns 1-5 hold whatever
    ``run_clustering.energy_hartigan``, ``energy_lloyd``, ``spectral``,
    ``kmeans`` and ``gmm`` return, in that order.
    """
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    def rho(x, y):
        return np.linalg.norm(x - y)

    for p in num_points:
        # float() forces true division. With Python 2 integer operands,
        # (N - p) / N truncates to 0 for every p > 0, which silently empties
        # the first cluster.
        pi1 = float(N - p) / N / 2.
        pi2 = float(N + p) / N / 2.

        for _ in range(num_experiments):
            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(
                k, X, G, z, init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(
                k, X, z, init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(
                k, X, z, init="kmeans", run_times=5)

            count += 1

    return table
def normal_or_lognormal_difference(numpoints=range(10,100,10), 
                                   num_experiments=100,
                                   kind='normal'):
    """Compare energy Hartigan against energy Lloyd and spectral clustering.

    For each sample size ``n`` in ``numpoints`` and each of
    ``num_experiments`` repetitions, two clusters are drawn in 20 dimensions:
    one with zero mean and covariance ``0.5 * I``, the other with mean 0.5 on
    its first 5 coordinates and identity covariance. ``kind == 'normal'``
    samples with ``data.multivariate_normal``; any other value uses
    ``data.multivariate_lognormal``. The kernel is built from
    ``2 - 2 * exp(-||x - y|| / 2)``.

    Returns an array with one row ``[n, hartigan - lloyd,
    hartigan - spectral]`` per experiment (an empty array when no experiment
    is run).
    """
    n_clusters = 2
    D, d = 20, 5
    energy_opts = dict(init="k-means++", run_times=5)

    def rho(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    rows = []
    for n in numpoints:
        for _ in range(num_experiments):
            # generate data
            means = [np.zeros(D),
                     0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))]
            covariances = [0.5 * np.eye(D), np.eye(D)]
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            sample = (data.multivariate_normal if kind == 'normal'
                      else data.multivariate_lognormal)
            X, z = sample(means, covariances, [n1, n2])
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(n_clusters, X, G, z,
                                                  **energy_opts)
            lloyd = run_clustering.energy_lloyd(n_clusters, X, G, z,
                                                **energy_opts)
            spectral = run_clustering.spectral(n_clusters, X, G, z,
                                               run_times=5)
            rows.append([n, hart - lloyd, hart - spectral])

    return np.array(rows)
# Example #11
# 0
# Remove every row that contains a missing entry (marked with '?').
# np.where(...)[0] gives the axis-0 index of each '?' cell; the same indices
# are dropped from true_labels so samples and labels stay aligned.
delete_missing = np.where(data == '?')[0]
data = np.delete(data, delete_missing, axis=0)
# Only after the '?' rows are gone can the entries be converted to floats.
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# Standardize each column: subtract its mean and divide by its standard
# deviation (both taken along axis 0).
# NOTE(review): a constant column has zero std and would divide by zero -- confirm
# the input has none.
data = (data - data.mean(axis=0)) / data.std(axis=0)

# Kernel matrix of the standardized data under `rho`; the commented-out lines
# are alternative choices of `rho`.
G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

# Run each algorithm with 6 clusters and run_times=10. The kernel-based
# methods receive G; the two energy methods are initialized with "spectral".
kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10, init="spectral")
hart_labels = cluster.energy_hartigan(6,
                                      data,
                                      G,
                                      run_times=10,
                                      init="spectral")

# Results table: one column for the algorithm name plus five metric columns.
t = PrettyTable([
    'Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info', 'V-Measure',
    'Fowlkes-Mallows'
])

algos = [
    'kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
# delete missing entries
delete_missing = np.where(data=='?')[0]
data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0))/data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10, init="spectral")
hart_labels = cluster.energy_hartigan(6,data,G,run_times=10,init="spectral")

t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info', 'V-Measure', 
                'Fowlkes-Mallows'])

algos = ['kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
         'energy_hartigan']
pred_labels = [kmeans_labels, gmm_labels, spectral_labels,
               energy_spectral_labels, lloyd_labels, hart_labels]

for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo, 
        energy.metric.accuracy(true_labels, pred_label),