Пример #1
0
def experiments(data, labellist, labelidxs, num_clusters):
    """Score three subspace prototypes on randomly formed clusters.

    The points in ``data`` are assigned uniformly at random to
    ``num_clusters`` groups.  For each non-empty group three prototypes are
    computed (``ca.flag_mean``, and ``ca.irls_flag`` with the ``'sine'`` and
    ``'cosine'`` objectives).  For each prototype the point of ``data`` that
    minimises ``sqrt(r - trace(Y.T x x.T Y))`` is located, and the prototype
    counts as correct when the group's most common label is among the labels
    attached to that point.

    Args:
        data: Sequence of 2-D arrays; ``x.shape[1]`` is used as the subspace
            dimension of each point.
        labellist: Sequence of labels, indexed in parallel with ``data`` and
            with ``labelidxs``.
        labelidxs: Sequence of integer indices into ``data``; entry ``j``
            names the data point that ``labellist[j]`` is attached to.
        num_clusters: Number of random groups to draw.

    Returns:
        dict with keys ``'Flag Median'``, ``'Sine Median'`` and
        ``'Max Cosine'`` mapping to the number of correct groups divided by
        ``num_clusters``.  The ``'Flag Median'`` entry is the score of the
        ``ca.flag_mean`` prototype; the key is kept for compatibility.
        Empty groups are skipped and therefore count as incorrect.
    """
    # An element-wise comparison is needed below; a plain list would compare
    # as a single object and never match any index.
    labelidxs = np.asarray(labelidxs)

    def find_closest_point(data, Y, labellist, labelidx):
        """Return the labels attached to the data point closest to ``Y``."""
        err = []
        for x in data:
            r = x.shape[1]
            # Round-off can push the difference slightly below zero, which
            # would turn the square root into NaN; clamp at zero instead.
            gap = max(r - np.trace(Y.T @ x @ x.T @ Y), 0)
            err.append(np.sqrt(gap))
        idx = np.argmin(err)

        return [labellist[ii] for ii in np.where(labelidx == idx)[0]]

    p_correct = {'Flag Median': 0, 'Sine Median': 0, 'Max Cosine': 0}
    n_its = 30

    clusters = np.random.randint(0, num_clusters, len(data))

    for ii in range(num_clusters):
        idx = np.where(clusters == ii)[0]
        print(len(idx))
        if len(idx) == 0:
            # A random assignment can leave a group without members; there is
            # no prototype or majority label to evaluate in that case.
            continue

        X = [data[i] for i in idx]
        labels = [labellist[i] for i in idx]

        # Majority label of the group (ties resolved by set iteration order).
        most_common = max(set(labels), key=labels.count)

        k = X[0].shape[1]

        flagmean = ca.flag_mean(X, k, fast=False)
        print('Flag Mean finished')

        sin_median = ca.irls_flag(X, k, n_its, 'sine', fast=False)[0]
        print('Sine Median finished')

        max_cosine = ca.irls_flag(X, k, n_its, 'cosine', fast=False)[0]
        print('Max Cos finished')

        p_correct['Flag Median'] += int(most_common in find_closest_point(
            data, flagmean, labellist, labelidxs))
        p_correct['Sine Median'] += int(most_common in find_closest_point(
            data, sin_median, labellist, labelidxs))
        p_correct['Max Cosine'] += int(most_common in find_closest_point(
            data, max_cosine, labellist, labelidxs))

        print('Iteration ' + str(ii + 1) + ' out of ' + str(num_clusters) +
              ' finished')
        print('--------------------------------------------')

    for name in p_correct:
        p_correct[name] = p_correct[name] / num_clusters

    return p_correct
Пример #2
0
def getCentroids(dataSet, labels, centroids, med):
    """Recompute one centroid per non-empty cluster.

    Args:
        dataSet: Sequence of 2-D arrays; the column count of the first entry
            is used as the centroid dimension.
        labels: Sequence of integer cluster labels, parallel to ``dataSet``.
        centroids: Current centroids; only their number is used.
        med: Prototype type: ``'flag'`` uses ``ca.flag_mean``, ``'sine'`` and
            ``'cosine'`` use ``ca.irls_flag`` with five iterations.

    Returns:
        list of new centroids in cluster order.  Clusters with no assigned
        point are dropped (not re-initialised), so the list can be shorter
        than ``centroids``.

    Raises:
        ValueError: If ``med`` is not one of the supported names.
    """
    if med not in ('flag', 'sine', 'cosine'):
        # Previously an unknown name silently produced an empty result.
        raise ValueError(
            "med must be 'flag', 'sine' or 'cosine', got " + repr(med))

    _, r = dataSet[0].shape
    labels = np.asarray(labels)

    new_centroids = []
    for ii in range(len(centroids)):
        idx = np.where(labels == ii)[0]
        if len(idx) == 0:
            continue

        X = [dataSet[i] for i in idx]
        if med == 'flag':
            new_centroids.append(ca.flag_mean(X, r, fast=False))
        else:
            new_centroids.append(ca.irls_flag(X, r, 5, med)[0])

    return new_centroids
Пример #3
0
        center = np.random.rand(n,k)*10
        center_rep = np.linalg.qr(center)[0][:,:k]

        #generate dataset of points in Gr(k,n)
        data = []
        for i in range(num_points):
            Y_raw = center_rep + (np.random.rand(n,k)-.5)*.01
            Y = np.linalg.qr(Y_raw)[0][:,:k]
            data.append(Y)

        np.random.seed(1)
        Y_init = np.linalg.qr(np.random.rand(n,n))[0][:,:k]
        
        start = time.time()

        errors = ca.irls_flag(data, k, n_its, 'sine', opt_err = 'sine', fast = False, init = Y_init)[1]

        trial_times.append(time.time()- start)

    

    mean_times.append(np.mean(trial_times))
    std_times.append(np.std(trial_times))

    # iterations.append(len(errors))
    print(str(j)+' done.')

import pandas
# Summary table for the timing runs collected above.
# NOTE(review): the frame is declared with the columns 'n', 'k', 'Iterations'
# and 'Time', but only 'k' is filled and the mean timings are written to a new
# 'Mean' column -- confirm the intended schema.
trial_stats = pandas.DataFrame(columns = ['n','k','Iterations', 'Time'])
# `ks` is defined outside the visible part of this script.
trial_stats['k'] = ks
trial_stats['Mean'] = mean_times
def lbg_subspace(X, epsilon, n_centers=17, opt_type='sine', n_its=10, seed=1,
                 r=48):
    """Cluster subspace representatives with an LBG-style iteration.

    Centres are initialised by drawing ``n_centers`` points from ``X`` at
    random (with replacement).  Each pass recomputes every non-empty
    cluster's centre and reassigns the points, until the relative change of
    the total distortion is at most ``epsilon``.

    Args:
        X: Sequence of data points accepted by ``distance_matrix`` and the
            ``ca`` prototype routines.
        epsilon: Stop once the relative distortion change is <= this value.
        n_centers: Number of initial centres.
        opt_type: Objective name passed to ``distance_matrix`` and
            ``ca.irls_flag``; ``'sinesq'`` selects ``ca.flag_mean`` instead.
        n_its: Iteration count passed to ``ca.irls_flag``.
        seed: Seed for ``np.random`` used by the initialisation.
        r: Dimension passed to the prototype routines (default 48, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(centers, errors, distortions)``: the final centres, the
        relative distortion change of every pass, and the distortion after
        the initial assignment and after every pass.
    """
    n_pts = len(X)

    def assign(current_centers):
        """Return each point's closest centre index and the total distortion."""
        # Assumes distance_matrix returns an (n_centers, n_pts) array, which
        # is what the column-wise argmin relies on -- TODO confirm.
        d_mat = distance_matrix(X, current_centers, opt_type)
        closest = np.argmin(d_mat, axis=0)
        # The distortion is the sum of each point's distance to its closest
        # centre.  Indexing d_mat with `closest` alone would select whole
        # rows and add up distances to unrelated points.
        distortion = np.sum(np.min(d_mat, axis=0))
        return closest, distortion

    # Initialise the centres from randomly chosen data points.
    np.random.seed(seed)
    centers = [X[np.random.randint(n_pts)] for _ in range(n_centers)]

    index, new_distortion = assign(centers)
    distortions = [new_distortion]

    errors = []
    error = 1
    while error > epsilon:
        old_distortion = new_distortion

        # Recompute one centre per non-empty cluster; empty clusters are
        # dropped, so the number of centres can shrink between passes.
        updated = []
        for c in range(len(centers)):
            idx = np.where(index == c)[0]
            if len(idx) == 0:
                continue
            members = [X[i] for i in idx]
            if opt_type == 'sinesq':
                updated.append(ca.flag_mean(members, r, fast=False))
            else:
                updated.append(
                    ca.irls_flag(members, r, n_its, opt_type, opt_type)[0])
        centers = updated

        index, new_distortion = assign(centers)
        distortions.append(new_distortion)

        if new_distortion < 0.00000000001:
            # Treat a vanishing distortion as converged; this also avoids a
            # meaningless relative change.
            error = 0
        else:
            error = np.abs(new_distortion - old_distortion) / old_distortion
        errors.append(error)
        print(error)

    return centers, errors, distortions
Пример #5
0
def visual_2D(num1, num2):
    """Plot lines in the plane together with four prototype lines.

    ``num1`` points are drawn around (0, 1) and ``num2`` points around
    (1, 0), each coordinate normal with standard deviation 0.2.  Every point
    is scaled to unit length and drawn as a dashed grey line through the
    origin.  The flag mean, sine median, geodesic median and maximum cosine
    prototypes from ``ca`` are drawn on top, and the figure is written to
    ``./Figures/2example_2D_<num1>_<num2>.png``.

    Returns:
        The four values returned by ``plt.plot`` for the prototype lines, in
        the order flag mean, sine median, geodesic median, maximum cosine.
    """
    k = 1
    n_its = 20

    def draw_line(vec, **style):
        # Segment from -vec to +vec, i.e. the line spanned by vec.
        x, y = vec[0, 0], vec[1, 0]
        return plt.plot([-x, x], [-y, y], **style)

    cloud_a = np.vstack(
        [np.random.normal(0, .2, num1),
         np.random.normal(1, .2, num1)])
    if num2 == 0:
        data_array = cloud_a
    else:
        cloud_b = np.vstack(
            [np.random.normal(1, .2, num2),
             np.random.normal(0, .2, num2)])
        data_array = np.hstack([cloud_a, cloud_b])

    # Normalise every column and draw it as soon as it is computed.
    gr_list = []
    for col in range(data_array.shape[1]):
        point = data_array[:, [col]]
        unit = point / np.linalg.norm(point)
        gr_list.append(unit)
        draw_line(unit, color='.5', linestyle='dashed')

    flagmean = ca.flag_mean(gr_list, k, fast=False)
    print('Flag Mean finished')

    sin_median = ca.irls_flag(gr_list, k, n_its, 'sine', fast=False)[0]
    print('Sine Median finished')

    geodesic_median = ca.irls_flag(
        gr_list, k, n_its, 'geodesic', fast=False)[0]
    print('Geodesic finished')

    max_cosine = ca.irls_flag(gr_list, k, n_its, 'cosine', fast=False)[0]
    print('Max Cos finished')

    l0 = draw_line(flagmean, label='Flag Mean', color='b')
    l1 = draw_line(sin_median, label='Sine Median', color='g')
    l2 = draw_line(geodesic_median, label='Geodesic Median', color='r')
    l3 = draw_line(max_cosine, label='Maximum Cosine', color='y')

    plt.xlim(-1, 1)
    plt.ylim(-1, 1)

    out_path = ('./Figures/2example_2D_' + str(num1) + '_' + str(num2) +
                '.png')
    plt.savefig(out_path)
    plt.close()

    return l0, l1, l2, l3
Пример #6
0
def convergence_check(gr_list, n_its):
    """Compare ``ca.irls_flag`` with ``ca.gradient_descent`` and plot both.

    One random orthonormal starting point is drawn and shared by all runs.
    Both solvers are run ten times for the ``'sine'`` and the ``'cosine'``
    objective.  The second element of each solver's return value is stacked
    row-wise and passed to ``add_line`` (presumably per-iteration objective
    values, judging by the axis labels -- confirm).  Two figures are written:
    ``./Figures/sin_median_convergence.png`` and
    ``./Figures/max_cosine_convergence.png``.

    Args:
        gr_list: Sequence of 2-D arrays; the shape of the first entry sets
            the size ``(n, k)`` of the starting point.
        n_its: Iteration count passed to both solvers.
    """
    n, k = gr_list[0].shape

    # Shared initial point: Q factor of a random n-by-k matrix.
    Y = np.linalg.qr(np.random.rand(n, k))[0][:, :k]

    def irls_trace(objective):
        return ca.irls_flag(gr_list,
                            k,
                            n_its,
                            objective,
                            opt_err=objective,
                            fast=False,
                            init=Y)[1]

    def gd_trace(objective):
        return ca.gradient_descent(
            gr_list, k, -.01, n_its, objective, init=Y)[1]

    def finish_figure(title, path):
        # Shared axis decoration, then write the figure and reset pyplot.
        plt.xticks([0, 1, 2, 3, 4, 5])
        plt.legend()
        plt.xlabel('Iteration')
        plt.ylabel('Objective function value')
        plt.title(title)
        plt.savefig(path)
        plt.close()

    sine_irls_runs, sine_gd_runs = [], []
    cosine_irls_runs, cosine_gd_runs = [], []

    for _ in range(10):
        sine_irls_runs.append(irls_trace('sine'))
        sine_gd_runs.append(gd_trace('sine'))
        print('Sine Median finished')

        cosine_irls_runs.append(irls_trace('cosine'))
        cosine_gd_runs.append(gd_trace('cosine'))
        print('Max Cos finished')

    # One row per run.
    irls_sin_median = np.vstack(sine_irls_runs)
    gd_sin_median = np.vstack(sine_gd_runs)
    irls_max_cosine = np.vstack(cosine_irls_runs)
    gd_max_cosine = np.vstack(cosine_gd_runs)

    solid, dotted, dash_dot = "-", ":", "-."

    add_line(irls_sin_median, solid, None, 'FlagIRLS', 'b')
    add_line(gd_sin_median, dotted, None, 'Gradient Descent', 'k')
    finish_figure('Sine Median', './Figures/sin_median_convergence.png')

    add_line(irls_max_cosine, dash_dot, None, 'FlagIRLS', 'g')
    add_line(gd_max_cosine, dotted, None, 'Gradient Descent', 'k')
    finish_figure('Maximum Cosine', './Figures/max_cosine_convergence.png')