Пример #1
0
 def genWords(self, X, K):
     """Build the word vocabulary by clustering the SIFT features of X.

     Stores the per-image features returned by ``genSiftFeatures`` on
     ``self.sift_features`` and the result of ``KMeans.train`` on
     ``self.Words``.

     Args:
         X: Image collection; only ``X.shape[0]`` (the number of images) is
             read here before X is handed to ``genSiftFeatures``.
         K: Forwarded unchanged to ``KMeans.train`` (presumably the number
             of clusters/words -- confirm against the KMeans class).

     Returns:
         The value stored in ``self.Words``.
     """
     N = X.shape[0]
     self.sift_features = genSiftFeatures(X)
     features_list = []
     for i in range(N):
         # extend() grows the list in place; ``a = a + b`` re-copied every
         # feature gathered so far on each image (quadratic overall).
         features_list.extend(self.sift_features[i])
     # Convert once and reuse the array for both the log line and training.
     features = np.array(features_list)
     print(features.shape)
     model = KMeans()
     self.Words = model.train(features, K)
     return self.Words
Пример #2
0
 def genFeatures(self, X):
     """Encode every image as a histogram over the learned words.

     Reads ``self.Words`` and ``self.sift_features``, so both must already
     be set on the instance.

     Args:
         X: Image collection; only ``X.shape[0]`` (the number of images) is
             used.

     Returns:
         Array of shape ``(X.shape[0], self.Words.shape[0])`` whose row i is
         the ``np.bincount`` of the predictions for image i's features.
     """
     num_images = X.shape[0]
     vocab_size = self.Words.shape[0]

     # Reuse the stored words as the cluster set of a fresh model.
     quantizer = KMeans()
     quantizer.clusters = self.Words

     histograms = np.zeros([num_images, vocab_size])
     for idx in range(num_images):
         descriptors = np.array(self.sift_features[idx])
         assignments = quantizer.predict(descriptors)
         # minlength keeps every row the same width even when the highest
         # word indices never occur for this image.
         histograms[idx] = np.bincount(np.array(assignments),
                                       minlength=vocab_size)
     return histograms
Пример #3
0
def main(_):
    """Run the semi-supervised pipeline end to end.

    Fits the model selected by ``FLAGS.model_type`` on the unlabeled
    training set, runs its supervised fit on the labeled training set, and
    prints the accuracy obtained on the validation set.
    """
    # Unlabeled samples; the second value returned by the reader is unused.
    unlabeled_data, _ = io_tools.read_dataset('data/train_no_label.csv')
    n_dims = unlabeled_data.shape[1]

    # Any model type other than 'kmeans' selects the Gaussian mixture model.
    if FLAGS.model_type == 'kmeans':
        model_cls = KMeans
    else:
        model_cls = GaussianMixtureModel
    model = model_cls(n_dims,
                      n_components=FLAGS.n_components,
                      max_iter=FLAGS.max_iter)

    # Stage 1: unsupervised fit on the unlabeled data.
    model.fit(unlabeled_data)

    # Stage 2: supervised fit on the labeled training data.
    labeled_path = 'data/' 'train_with_label.csv'
    train_data, train_label = io_tools.read_dataset(labeled_path)
    model.supervised_fit(train_data, train_label)

    # Stage 3: predict on the validation split and report the hit rate.
    eval_data, eval_label = io_tools.read_dataset('data/val.csv')
    y_hat_eval = model.supervised_predict(eval_data)

    n_correct = np.sum(y_hat_eval == eval_label)
    acc = n_correct / (1. * eval_data.shape[0])
    print("Accuracy: %s" % acc)
Пример #4
0
def test_kmeans():
    """Visual check of KMeans on three synthetic 2-D point groups.

    Plots the generated groups, then the groups obtained from
    ``KMeans(3).predict`` together with the model's ``centers``.
    """
    from models.kmeans import KMeans

    # Three groups of 200 standard-normal 2-D points each.
    blobs = np.random.randn(3, 200, 2)
    blobs[1] += np.array([2, 2])   # shift right by 2, up by 2
    blobs[2] += np.array([2, -2])  # shift right by 2, down by 2

    plot_scatter(blobs, 'Real')

    # Flatten the group axis so the model sees one (600, 2) point list.
    points = blobs.reshape(-1, 2)

    kmeans = KMeans(3)
    labels = kmeans.predict(points)
    centers = kmeans.centers

    # Split the points again according to the predicted label.
    predicted_groups = [points[labels == label] for label in range(3)]
    plot_scatter_with_centers(predicted_groups, centers, 'Pred')
Пример #5
0
    def run(self, argv, charts=False):
        """Benchmark KMeans over dataset sizes and over cluster counts.

        The run name comes from ``argv`` (``-n <name>``) and prefixes the two
        result files under ``out/``.  The first benchmark sweeps the dataset
        sizes listed in ``self.tamanhos`` with 4 clusters; the second sweeps
        the cluster counts 4, 6, 8, 10, 12 on the first generated dataset.

        Args:
            argv: Command-line arguments (without the program name), parsed
                with ``getopt``.  Exactly one ``-n <name>`` option must be
                given; anything else is a usage error.
            charts: When true, fit ``funcao_linear`` to the measured times of
                each result file and save a PNG chart next to it.

        Side effects:
            Truncates both result files before handing their names to
            ``self.performance_test``, prints progress to stdout, sets
            ``self.aproximados`` and ``self.medias`` when ``charts`` is true,
            and exits the process with status 2 on a usage error.
        """
        try:
            opts, _ = getopt.getopt(argv, "n:", ["name="])
        except getopt.GetoptError:
            print(
                'Utilize o argumento de nome:\n  python3 teste_setup -n <name>'
            )
            sys.exit(2)
        if len(opts) != 1 or opts[0][0] != "-n":
            print(
                'Utilize o argumento de nome corretamente:\n  python3 teste_setup -n <name>'
            )
            # Usage errors must not report success to the calling shell.
            sys.exit(2)

        PERSON = opts[0][1]

        OUTPUTS = [
            'out/{}_results_length.txt'.format(PERSON),
            'out/{}_results_nclusters.txt'.format(PERSON)
        ]
        for fname in OUTPUTS:
            # Start every run from empty result files.
            with open(fname, 'w'):
                pass

        EXECUTION_TIMES = 1
        NUM_COLUMNS = 2
        # Kept local so the chart code never has to overwrite self.tamanhos.
        CLUSTER_COUNTS = [4, 6, 8, 10, 12]

        sample_data = [
            self.__generate_fake_data(NUM_COLUMNS, y) for y in self.tamanhos
        ]

        # Benchmark with different dataset sizes.
        model = KMeans(n_clusters=4, iterations=50)
        print('testing with different file lengths')

        for data in sample_data:
            print('current: ', data.shape[0])
            self.performance_test(OUTPUTS[0], data, EXECUTION_TIMES, model)

        print('testing with different n_clusters')
        print('data size: ', self.tamanhos[0])
        # Benchmark with different numbers of clusters.
        for n_clusters in CLUSTER_COUNTS:
            print('current: ', n_clusters)
            model = KMeans(n_clusters=n_clusters, iterations=50)
            self.performance_test(OUTPUTS[1], sample_data[0], EXECUTION_TIMES,
                                  model)

        if charts:
            for fname in OUTPUTS:
                # After the transpose, judging by the labels used below:
                # f[0] = time, f[1] = cluster count, f[2] = iterations,
                # f[3] = dataset size (confirm against performance_test).
                f = np.loadtxt(fname).T
                plt.subplots()  # fresh figure for this result file

                # Select the chart by position rather than by substring: the
                # user-supplied name is part of the path and could itself
                # contain "length".
                is_length_chart = fname == OUTPUTS[0]
                x_fit = self.tamanhos if is_length_chart else CLUSTER_COUNTS

                parametros, pcov = scipy_opt.curve_fit(funcao_linear,
                                                       xdata=x_fit,
                                                       ydata=f[0])
                self.aproximados = [
                    funcao_linear(x, *parametros) for x in x_fit
                ]
                print("aproximados:           {}".format(self.aproximados))
                print("parametros_otimizados: {}".format(parametros))
                print("pcov:                  {}".format(pcov))
                print()

                if is_length_chart:
                    plt.suptitle(
                        "Tempo X Tamanho do dataset ({})\n".format(PERSON),
                        fontsize=14)
                    plt.title("Iterações: {} N Clusters: {}".format(
                        int(f[2][0]), int(f[1][0])),
                              fontsize=10)
                    # The colour is given once, via the keyword; a colour
                    # letter in the format string would be overridden by it
                    # and only trigger a matplotlib warning.
                    plt.plot(f[3],
                             f[0],
                             '-o',
                             color='blue',
                             mfc='b',
                             mec='b',
                             markersize=4,
                             linewidth=2)
                    plt.xlim([min(f[3]), max(f[3])])
                    plt.xlabel("Tamanho")
                    plt.xticks(f[3])
                    plt.ylim([0, 60])
                    plt.ylabel("Tempo (s)")
                else:
                    plt.suptitle(
                        "Tempo X Quantidade de clusters ({})\n".format(PERSON),
                        fontsize=14)
                    plt.title("Iterações: {} Tamanho dataset: {}".format(
                        int(f[2][0]), int(f[3][0])),
                              fontsize=10)
                    plt.plot(f[1],
                             f[0],
                             '-o',
                             color='purple',
                             mfc='m',
                             mec='m',
                             markersize=4,
                             linewidth=2)
                    plt.xlabel("N Clusters")
                    plt.xticks(CLUSTER_COUNTS)
                    plt.xlim([min(f[1]), max(f[1])])
                    plt.ylim([0, 60])
                    plt.ylabel("Tempo (s)")

                # Overlay the fitted curve, then finish and save the chart.
                plt.plot(x_fit,
                         self.aproximados,
                         '-o',
                         markersize=4,
                         label="Teste",
                         color="red")
                plt.legend(["k-means medido", "k-means aproximado"])
                plt.grid(linestyle='dotted')
                # fname ends in ".txt"; swap that suffix for ".png".
                plt.savefig(fname[:-4] + '.png')

                self.medias = f[0]
Пример #6
0
import sys
import pathlib
import cProfile
import time

import pandas as pd
import numpy as np

sys.path.append(str(pathlib.Path().absolute().parent))

from models.kmeans import KMeans
from tests.utils import TestUtils

if __name__ == '__main__':
    model = KMeans(n_clusters=4, iterations=5, logging=True)

    # Reference datasets; the row counts are the original author's notes.
    dataset1 = np.loadtxt('data/churn_modelling/test.txt')  # 10,000 rows
    dataset2 = np.loadtxt(
        'data/customer_churn/customer_churn_processed.txt')  # 7,043 rows
    dataset3 = np.loadtxt(
        'data/credit_card_customers/bank_churn.txt')  # 11 rows
    dataset4 = np.loadtxt('data/cateter.txt')  # 11 rows
    fake_data = np.loadtxt('data/fake_data.txt')

    test_utils = TestUtils()

    # Optional profiling of a fit on the synthetic data (left disabled):
    # prof = cProfile.Profile()
    # prof.enable()
    # prof.run('model.fit(fake_data)')
    # prof.disable()

    # test_utils.write_stats_file(prof)
Пример #7
0
        plt.pie(distribution,
                labels=pie_labels,
                autopct='%1.1f%%',
                shadow=True,
                startangle=90)
        plt.xlabel(
            'Cluster composition (cluster size {})'.format(cluster_size))
        plt.show()

    # ====================

    # ===================
    # KMEANS

    n_clusteers = 2
    model = KMeans(n_clusteers, logging, max_iter=300)

    model.fit(train_data, train_labels)

    predictions = model.predict(test_data)

    # we analyze the composition of each cluster
    possible_labels = [0, 1]
    for cluster in range(n_clusteers):
        cluster_members = {}
        cluster_size = 0
        for idx in range(len(predictions)):
            # check if it's cluster member
            if predictions[idx] == cluster:
                # add real value
                cluster_members[test_labels[idx]] = cluster_members.get(
Пример #8
0
    def run(self, charts=False, person=None):
        """Benchmark KMeans over dataset sizes and over cluster counts.

        The first benchmark sweeps datasets of 5000 to 15000 generated rows
        with 4 clusters; the second sweeps the cluster counts 4, 6, 8, 10, 12
        on the first generated dataset.  Results go to two text files under
        ``../out/`` whose names start with the run name.

        Args:
            charts: When true, save one PNG chart per result file.
            person: Name used in the output file names and chart titles.
                When omitted, the ``PERSON`` placeholder below is used, which
                must be edited to a real name before running.

        Raises:
            ValueError: If the resulting name is empty or contains a NUL
                character (the unedited placeholder), since such a name
                cannot be used in a file path.
        """
        # Placeholder: put your name here (or pass ``person``) so the files
        # are generated accordingly.
        PERSON = '\x00' if person is None else str(person)
        if not PERSON or '\x00' in PERSON:
            # open() would otherwise fail with the opaque
            # "embedded null byte" error; say what has to be done instead.
            raise ValueError(
                'no valid name configured: pass person=<name> to run() or '
                'edit the PERSON placeholder (it is used in file names)')

        OUTPUTS = [
            '../out/{}_results_length.txt'.format(PERSON),
            '../out/{}_results_nclusters.txt'.format(PERSON)
        ]
        for fname in OUTPUTS:
            # Start every run from empty result files.
            with open(fname, 'w'):
                pass

        EXECUTION_TIMES = 1

        sample_data_lengths = [5000, 7500, 10000, 12500, 15000]
        NUM_COLUMNS = 2
        sample_data = [
            self.__generate_fake_data(NUM_COLUMNS, y)
            for y in sample_data_lengths
        ]

        # Benchmark with different dataset sizes.
        model = KMeans(n_clusters=4, iterations=50)
        print('testing with different file lengths')

        for data in sample_data:
            print('current: ', data.shape[0])
            self.performance_test(OUTPUTS[0], data, EXECUTION_TIMES, model)

        print('testing with different n_clusters')
        print('data size: ', sample_data_lengths[0])
        # Benchmark with different numbers of clusters.
        for n_clusters in [4, 6, 8, 10, 12]:
            print('current: ', n_clusters)
            model = KMeans(n_clusters=n_clusters, iterations=50)
            self.performance_test(OUTPUTS[1], sample_data[0], EXECUTION_TIMES,
                                  model)

        if charts:
            for fname in OUTPUTS:
                # After the transpose, judging by the labels used below:
                # f[0] = time, f[1] = cluster count, f[2] = iterations,
                # f[3] = dataset size (confirm against performance_test).
                f = np.loadtxt(fname).T

                plt.subplots()  # fresh figure for this result file
                # Select the chart by position rather than by substring: the
                # name is part of the path and could contain "nclusters".
                if fname == OUTPUTS[1]:
                    plt.suptitle(
                        "Tempo X Quantidade de  clusters (PC-{})\n".format(
                            PERSON),
                        fontsize=14)
                    plt.title("Iterações: {}  Tamanho dataset: {}".format(
                        int(f[2][0]), int(f[3][0])),
                              fontsize=10)
                    plt.bar(f[1], f[0], width=1.2, color="purple")
                    plt.xlabel("N Clusters")
                    plt.xticks(f[1])
                    plt.ylabel("Tempo (s)")
                    plt.yticks(f[0])
                else:
                    plt.suptitle(
                        "Tempo X Tamanho do dataset (PC-{})\n".format(PERSON),
                        fontsize=14)
                    plt.title("Iterações: {}  N Clusters: {}".format(
                        int(f[2][0]), int(f[1][0])),
                              fontsize=10)
                    # Time is on the x axis and dataset size on the y axis,
                    # matching the axis labels set below.
                    plt.plot(f[0],
                             f[3],
                             '-o',
                             color='blue',
                             mfc='r',
                             mec='r',
                             markersize=8,
                             linewidth=2)
                    plt.ylabel("Tamanho")
                    plt.xticks(f[0])
                    plt.xlabel("Tempo (s)")
                    plt.yticks(f[3])

                plt.savefig(fname + '.png')
Пример #9
0
from models.kmeans import KMeans
import torch

import matplotlib.pyplot as plt

# Fixed seed so the random points below are reproducible.
torch.manual_seed(123)

model = KMeans(n_clusters=5, dimension=2)
X = torch.randn(100, 2) / 6
model.init_centroids(X)

fig, ax = plt.subplots(1, constrained_layout=True)
ax.set_title('Encoder/Classifier Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

model.fit(X, 20)

# One colour per cluster id; the list holds one spare entry.
colors = ["red", "green", "blue", "orange", "pink", "yellow"]

# cluster_objects is iterated as {cluster id: members}, and every member
# unpacks as ((x, y), extra); only the coordinates are drawn.
for k, c in model.cluster_objects.items():
    for ((x, y), _) in c:
        # A literal colour is passed, so no colormap is involved.
        ax.scatter(x, y, c=colors[k])

# Draw the centroids on top as translucent white dots with a black outline.
for x, y in model.centroids:
    ax.scatter(x, y, c='white', alpha=0.6, edgecolors='black', linewidths=2)
plt.show()
Пример #10
0
fig, ax = plt.subplots(1, constrained_layout=True)
ax.set_title('Encoder/Classifier Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

# Collect (flattened embedding, image) pairs for 1000 randomly sampled items
# of train_def.  Gradients are not needed for this forward-only pass.
X = []
with torch.no_grad():
    for img, _ in random.sample(list(train_def), 1000):
        # Move to the GPU and add a leading batch axis for the model.
        img = img.to("cuda").unsqueeze(0)
        img_embedding, _, img_class = model(img)
        X.append((torch.flatten(img_embedding).cpu(),
                  img.squeeze(0).squeeze(0).cpu()))

cluster = KMeans(n_clusters=10)

# Bookkeeping for a "keep the best-entropy run" search; currently only a
# single clustering run is performed, so these keep their initial values.
max_entropy = 0
max_centroids = None

cluster.init_centroids(X)
cluster.fit(X, 10, dist="cosine")