# Imports assumed by these snippets; the originals are split across several
# project files, so this exact block is a reconstruction:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn import cluster, metrics
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

import data_generator
import pam
import spsa_clustering
import utils


def main():
    clust_num = 3
    data_shape = 2

    mu_list, sigma_list, w_list = get_sparse_gmm_model(clust_num, data_shape)

    spsa_gamma = 1. / 6
    spsa_alpha = lambda x: 0.25 / (x ** spsa_gamma)
    spsa_beta = lambda x: 15. / (x ** (spsa_gamma / 4))
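    # The two lambdas above are the SPSA gain sequences: alpha(n) is the step
    # size and beta(n) the perturbation magnitude, both decaying polynomially
    # in the iteration number n, as is standard for SPSA-type methods.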

    # spsa_alpha = lambda x: 0.001
    # spsa_beta = lambda x: 0.001

    clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                alpha=spsa_alpha, beta=spsa_beta, norm_init=False,
                                                verbose=False, sparse=False, eta=None)

    N = 3000
    data_set = []
    true_labels = []
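    # Stream the data: each point is drawn from the GMM and immediately fed
    # to the online SPSA clustering update.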
    for _ in range(N):
        mix_ind = np.random.choice(len(w_list), p=w_list)
        data_point = np.random.multivariate_normal(mu_list[mix_ind], np.identity(data_shape) * sigma_list[mix_ind])
        data_set.append(data_point)
        true_labels.append(mix_ind)
        clustering.fit(data_point)
    data_set = np.array(data_set)

    dataset_name = 'good'
    dataset_dir = os.path.join('datasets', dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)

    np.save(os.path.join(dataset_dir, 'data.npy'), data_set)
    np.save(os.path.join(dataset_dir, 'true.npy'), np.array(true_labels))

    param = {'mu': mu_list, 'sigma': sigma_list, 'w': w_list}
    with open(os.path.join(dataset_dir, 'param.pickle'), 'wb') as f:
        pickle.dump(param, f)
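
    # Align the learned centers with the ground-truth means and give every
    # point a final label (clusters_fill is assumed to assign labels by
    # nearest learned center).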

    utils.order_clust_centers(np.array(mu_list), clustering)

    clustering.clusters_fill(data_set)
    ari_spsa = metrics.adjusted_rand_score(true_labels, clustering.labels_)

    print('ARI: {}'.format(ari_spsa))
    print('Mean centers dist: {}'.format(utils.mean_cent_dist(np.array(mu_list), clustering)))

    utils.plot_centers(np.array(mu_list), clustering)
    # utils.plot_centers_converg(np.array(mu_list), clustering)

    utils.plot_clustering(data_set, clustering.labels_, 'SPSA clustering partition')
    utils.plot_clustering(data_set, true_labels, 'True partition')

    plt.show()
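

# get_sparse_gmm_model is defined elsewhere in the project; the following is
# a minimal sketch of what it plausibly returns (an assumption, named
# differently to avoid shadowing the real helper): cluster means, per-cluster
# variances, and normalized mixture weights.
def get_sparse_gmm_model_sketch(clust_num, data_shape):
    mu_list = [np.random.uniform(-5., 5., size=data_shape) for _ in range(clust_num)]
    sigma_list = [np.random.uniform(0.5, 1.5) for _ in range(clust_num)]
    w = np.random.rand(clust_num)
    return mu_list, sigma_list, list(w / w.sum())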


def load_experiment(name='bad'):
    dataset_dir = os.path.join('datasets', name)

    data_set = np.load(os.path.join(dataset_dir, 'data.npy'))
    true_labels = np.load(os.path.join(dataset_dir, 'true.npy'))

    with open(os.path.join(dataset_dir, 'param.pickle'), 'rb') as f:
        param = pickle.load(f)

    mu_list, sigma_list, w_list = param['mu'], param['sigma'], param['w']

    clust_num = len(mu_list)
    data_shape = data_set[0].shape[0]

    spsa_gamma = 1. / 6
    spsa_alpha = lambda x: 0.25 / (x ** spsa_gamma)
    spsa_beta = lambda x: 15. / (x ** (spsa_gamma / 4))

    clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                alpha=spsa_alpha,
                                                beta=spsa_beta, norm_init=False, verbose=False, sparse=False, eta=None,
                                                spsa_sigma=False)

    rand_ind = np.random.permutation(data_set.shape[0])

    for i in rand_ind:
        clustering.fit(data_set[i])

    # utils.order_clust_centers(np.array(mu_list), clustering)

    clustering.clusters_fill(data_set[rand_ind])
    ari_spsa = metrics.adjusted_rand_score(true_labels[rand_ind], clustering.labels_)

    print('ARI: {}'.format(ari_spsa))
    print('Mean centers dist: {}'.format(utils.mean_cent_dist(np.array(mu_list), clustering)))

    utils.plot_centers(np.array(mu_list), clustering)
    # utils.plot_centers_converg(np.array(mu_list), clustering)

    # utils.plot_clustering(data_set[rand_ind], clustering.labels_, 'SPSA clustering partition')
    # utils.plot_clustering(data_set[rand_ind], true_labels[rand_ind], 'True partition')

    # for Gamma in clustering.Gammas:
    #     print(Gamma)

    # for center in clustering.cluster_centers_:
    #     print(center)

    # utils.plot_clustering_cov(data_set, clustering.labels_, 'SPSA clustering partition', clustering.cluster_centers_,
    #                           clustering.Gammas)

    plt.show()
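
    # --- Fragment of a separate benchmarking routine (truncated in the
    # original): n_run, N, clust_means, clust_gammas, mix_prob, noises and j
    # are defined in its enclosing scope. ---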
    ari_kmeans = np.zeros(n_run)
    ari_spsa = np.zeros(n_run)
    ari_spsa_cov = np.zeros(n_run)
    ari_gmm = np.zeros(n_run)
    ari_pam = np.zeros(n_run)

    centers_dist_kmeans = np.zeros(n_run)
    centers_dist_spsa = np.zeros(n_run)
    centers_dist_spsa_cov = np.zeros(n_run)
    centers_dist_gmm = np.zeros(n_run)

    for i in range(n_run):
        print('Run {0}'.format(i))

        clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0], data_shape=2, Gammas=None,
                                                    alpha=spsa_alpha,
                                                    beta=spsa_beta, norm_init=False, verbose=False, noise=noises[j])

        clustering_cov = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0], data_shape=2, Gammas=None,
                                                        alpha=spsa_alpha,
                                                        beta=spsa_beta, norm_init=False, noise=noises[j],
                                                        eta=3000, verbose=False)

        data_set = []
        true_labels = []
        for _ in range(N):
            mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
            data_point = np.random.multivariate_normal(clust_means[mix_ind], clust_gammas[mix_ind])
            data_set.append(data_point)
            true_labels.append(mix_ind)
            # clustering.fit(data_point)
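

# --- New snippet (separate script): clustering image patches with SPSA;
# data_generator, args and patch_size come from the surrounding script. ---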
# data_generator.save_example()

train_generator = data_generator.generate('train')

centers_fname = '/home/a.boiarov/Projects/spsa_clustering_gmm_log/centers.npy'

if not args.only_clf:
    spsa_gamma = 1. / 6
    spsa_alpha = lambda x: 0.25 / (x**spsa_gamma)
    spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4))
    n_filters = 500

    clustering = spsa_clustering.ClusteringSPSA(n_clusters=n_filters,
                                                data_shape=patch_size *
                                                patch_size,
                                                Gammas=None,
                                                alpha=spsa_alpha,
                                                beta=spsa_beta,
                                                norm_init=False,
                                                eta=900)
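
    # The learned centers presumably serve as a dictionary of n_filters patch
    # prototypes for the downstream classifier; this unsupervised stage is
    # skipped when args.only_clf is set.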

    # spsa_train_num = data_generator.train_number
    spsa_train_num = 1500

    for num in range(spsa_train_num):
        print(num)
        train_data = next(train_generator)
        for patch in train_data[0]:
            clustering.fit(patch.flatten())
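

# --- New snippet: noise models for the SPSA clustering experiments
# (noise_0 .. noise_4 are defined earlier in the original script). ---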
noise_5 = spsa_clustering.Noise(func=lambda x: 10 * (np.random.rand() * 4 - 2),
                                name='random')
noise_6 = spsa_clustering.Noise(func=lambda x: 0.1 * np.sin(x) + 19 * np.sign(50 - x % 100),
                                name='irregular')
noise_7 = spsa_clustering.Noise(func=lambda x: 20, name='constant')

experiment_noise = noise_0

spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x ** spsa_gamma)
spsa_beta = lambda x: 15. / (x ** (spsa_gamma / 4))

# spsa_alpha = lambda x: 0.0001
# spsa_beta = lambda x: 0.0001

clustering = spsa_clustering.ClusteringSPSA(n_clusters=10, data_shape=784, Gammas=None, alpha=spsa_alpha,
                                            beta=spsa_beta, norm_init=False, noise=experiment_noise)

data_set = []
true_labels = []

# init_ind = []
# for label in range(10):
#     ind = np.random.choice(df.index[df['label'] == label].tolist(), 1)
#     row = df.loc[ind[0], :]
#     true_labels.append(row[0])
#     data_point = np.array(row[1:].tolist(), dtype=float)
#     data_set.append(data_point)
#     clustering.fit(data_point)
#     init_ind.append(ind)

index = list(range(df.shape[0]))


# --- Example #6 ---

clust_means = np.array([[0, 0], [2, 2], [-3, 6]])
clust_gammas = np.array([[[1, -0.7], [-0.7, 1]],
                         np.eye(2), [[1, 0.8], [0.8, 1]]])
data_set = []
true_labels = []

spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x**spsa_gamma)
spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4))

# spsa_alpha = lambda x: 0.001
# spsa_beta = lambda x: 0.001

clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_means.shape[0],
                                            data_shape=2,
                                            Gammas=None,
                                            alpha=spsa_alpha,
                                            beta=spsa_beta,
                                            norm_init=False)
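
# Assumption: N (sample count) and mix_prob (mixture weights) are defined
# earlier in the original example.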

for _ in range(N):
    mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
    data_point = np.random.multivariate_normal(clust_means[mix_ind],
                                               clust_gammas[mix_ind])
    data_set.append(data_point)
    true_labels.append(mix_ind)
    clustering.fit(data_point)
data_set = np.array(data_set)

utils.order_clust_centers(clust_means, clustering)
clustering.clusters_fill(data_set)
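
# As in the other examples, the partition could now be scored against the
# ground truth, e.g.:
# print('ARI: {}'.format(metrics.adjusted_rand_score(true_labels, clustering.labels_)))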


# --- Example #7 ---

noise_7 = spsa_clustering.Noise(func=lambda x: [20] * x.shape[0],
                                name='constant')
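
# noise_0 .. noise_6 are defined earlier in the original script.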

experiment_noise = noise_3

spsa_gamma = 1. / 6
spsa_alpha = lambda x: 0.25 / (x**spsa_gamma)
spsa_beta = lambda x: 15. / (x**(spsa_gamma / 4))

# spsa_alpha = lambda x: 0.001
# spsa_beta = lambda x: 0.001

clustering = spsa_clustering.ClusteringSPSA(n_clusters=3,
                                            data_shape=2,
                                            Gammas=None,
                                            alpha=spsa_alpha,
                                            beta=spsa_beta,
                                            norm_init=False,
                                            noise=experiment_noise,
                                            eta=None)

clustering_cov = spsa_clustering.ClusteringSPSA(n_clusters=3,
                                                data_shape=2,
                                                Gammas=None,
                                                alpha=spsa_alpha,
                                                beta=spsa_beta,
                                                norm_init=False,
                                                noise=experiment_noise,
                                                eta=1000)
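
# The two instances differ only in eta; judging from the plot_clustering_cov /
# Gammas usage elsewhere in these snippets, a finite eta presumably enables
# per-cluster covariance estimation (an assumption, not confirmed here).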

for _ in range(N):
    mix_ind = np.random.choice(len(mix_prob), p=mix_prob)
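    # (the remainder of this example is truncated in the original)

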
def stat():
    clust_num = 3
    data_shape = 2

    mu_list, sigma_list, w_list = get_sparse_gmm_model(clust_num, data_shape)

    spsa_gamma = 1. / 6
    spsa_alpha = lambda x: 0.25 / (x ** spsa_gamma)
    spsa_beta = lambda x: 15. / (x ** (spsa_gamma / 4))

    # spsa_alpha = lambda x: 0.001
    # spsa_beta = lambda x: 0.001

    n_run = 10
    N = 3000

    ari_spsa = np.zeros(n_run)
    ari_kmeans = np.zeros(n_run)
    ari_mb_kmeans = np.zeros(n_run)
    ari_pam = np.zeros(n_run)

    cent_dist = np.zeros(n_run)
    cent_dist_kmeans = np.zeros(n_run)
    cent_dist_mb_kmeans = np.zeros(n_run)
    cent_dist_pam = np.zeros(n_run)

    for i in tqdm(range(n_run)):
        clustering = spsa_clustering.ClusteringSPSA(n_clusters=clust_num, data_shape=data_shape, Gammas=None,
                                                    alpha=spsa_alpha,
                                                    beta=spsa_beta, norm_init=False, verbose=False, sparse=True, eta=700,
                                                    spsa_sigma=False)

        kmeans = cluster.KMeans(n_clusters=clust_num)
        mb_kmeans = cluster.MiniBatchKMeans(n_clusters=clust_num, n_init=1, init='random', max_iter=1,
                                            batch_size=1,
                                            max_no_improvement=None)
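        # With batch_size=1 mini-batches, a single random init and no
        # improvement-based early stopping, MiniBatchKMeans serves as the
        # online k-means baseline ('online k-means' in the printout below).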

        data_set = []
        true_labels = []
        for _ in range(N):
            mix_ind = np.random.choice(len(w_list), p=w_list)
            data_point = np.random.multivariate_normal(mu_list[mix_ind], np.identity(data_shape) * sigma_list[mix_ind])
            data_set.append(data_point)
            true_labels.append(mix_ind)
            # clustering.fit(data_point)
        data_set = np.array(data_set)

        # utils.order_clust_centers(np.array(mu_list), clustering)

        # clustering.clusters_fill(data_set)

        labels_pred_kmeans = kmeans.fit_predict(data_set)
        labels_pred_mb_kmeans = mb_kmeans.fit_predict(data_set)

        dist = pairwise_distances(data_set)
        labels_pred_pam, pam_med = pam.cluster(dist, k=clust_num)

        # ari_spsa[i] = metrics.adjusted_rand_score(true_labels, clustering.labels_)
        # cent_dist[i] = utils.mean_cent_dist(np.array(mu_list), clustering)

        ari_kmeans[i] = metrics.adjusted_rand_score(true_labels, labels_pred_kmeans)
        ari_mb_kmeans[i] = metrics.adjusted_rand_score(true_labels, labels_pred_mb_kmeans)
        ari_pam[i] = metrics.adjusted_rand_score(true_labels, labels_pred_pam)

        cent_dist_kmeans[i] = utils.mean_cent_dist_(np.array(mu_list), kmeans.cluster_centers_)
        cent_dist_mb_kmeans[i] = utils.mean_cent_dist_(np.array(mu_list), mb_kmeans.cluster_centers_)
        cent_dist_pam[i] = utils.mean_cent_dist_(np.array(mu_list), data_set[pam_med])

    # ari_spsa and cent_dist remain zero while the SPSA runs above are
    # commented out, so their means are not meaningful here.
    # print(ari_spsa.mean(), cent_dist.mean())

    print('\nMean ARI k-means: {:f}, Mean L2: {:f}'.format(ari_kmeans.mean(), cent_dist_kmeans.mean()))
    print('Mean ARI online k-means: {:f}, Mean L2: {:f}'.format(ari_mb_kmeans.mean(), cent_dist_mb_kmeans.mean()))
    # print('Mean ARI SPSA clustering: {:f}, Mean L2: {:f}'.format(ari_spsa.mean(), cent_dist.mean()))
    print('\nMean ARI PAM: {:f}, Mean L2: {:f}'.format(ari_pam.mean(), cent_dist_pam.mean()))
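

# A minimal sketch of the kind of matched-centers distance that
# utils.mean_cent_dist_ presumably computes (an assumption, not the project's
# actual code): mean L2 distance between true and estimated centers after an
# optimal one-to-one assignment.
def mean_cent_dist_sketch(true_centers, est_centers):
    from scipy.optimize import linear_sum_assignment

    # Pairwise distances between every true and every estimated center.
    cost = np.linalg.norm(true_centers[:, None, :] - est_centers[None, :, :], axis=2)
    # The Hungarian algorithm picks the matching minimizing total distance.
    rows, cols = linear_sum_assignment(cost)
    return cost[rows, cols].mean()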