import numpy as np

# io_tools, KMeans and GaussianMixtureModel are assumed to come from the
# surrounding project (an io_tools module and a models package).


def main(_):
    """High-level pipeline.

    This script performs the training, evaluation, and testing stages
    for semi-supervised learning using the k-means algorithm.
    """
    # Load the unlabeled dataset.
    unlabeled_data, _ = io_tools.read_dataset('data/train_no_label.csv')
    n_dims = unlabeled_data.shape[1]

    # Initialize the model.
    if FLAGS.model_type == 'kmeans':
        model = KMeans(n_dims, n_components=FLAGS.n_components,
                       max_iter=FLAGS.max_iter)
    else:
        model = GaussianMixtureModel(n_dims, n_components=FLAGS.n_components,
                                     max_iter=FLAGS.max_iter)

    # Unsupervised training on the unlabeled data.
    model.fit(unlabeled_data)

    # Supervised training on the labeled data.
    train_data, train_label = io_tools.read_dataset('data/train_with_label.csv')
    model.supervised_fit(train_data, train_label)

    # Evaluate the model on the validation set.
    eval_data, eval_label = io_tools.read_dataset('data/val.csv')
    y_hat_eval = model.supervised_predict(eval_data)
    acc = np.sum(y_hat_eval == eval_label) / float(eval_data.shape[0])
    print("Accuracy: %s" % acc)
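# Hedged sketch: main() reads FLAGS.model_type, FLAGS.n_components and
# FLAGS.max_iter, which are not defined in this snippet. With absl (one
# common choice; the original project may define them differently), the
# flag definitions could look like:
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('model_type', 'kmeans', 'Model to train: kmeans or gmm.')
flags.DEFINE_integer('n_components', 10, 'Number of clusters/components.')
flags.DEFINE_integer('max_iter', 100, 'Maximum number of training iterations.')

if __name__ == '__main__':
    app.run(main)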
def genWords(self, X, K):
    """Build a K-word visual vocabulary by clustering SIFT descriptors."""
    N = X.shape[0]
    self.sift_features = genSiftFeatures(X)
    # Pool the per-image SIFT descriptors into one flat list.
    features_list = []
    for i in range(N):
        features_list.extend(self.sift_features[i])
    print(np.array(features_list).shape)
    model = KMeans()
    self.Words = model.train(np.array(features_list), K)
    return self.Words
def genFeatures(self, X):
    """Encode each image as a histogram over the learned visual words."""
    N = X.shape[0]
    D = self.Words.shape[0]
    y = np.zeros([N, D])
    model = KMeans()
    model.clusters = self.Words
    for i in range(N):
        # Map each SIFT descriptor to its nearest word, then count occurrences.
        img_words = model.predict(np.array(self.sift_features[i]))
        y[i] = np.bincount(np.array(img_words), minlength=D)
    return y
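# Hedged usage sketch: how genWords/genFeatures are typically chained in a
# bag-of-visual-words pipeline. `BagOfWords` and `train_images` are
# illustrative names, not from the original code.
bow = BagOfWords()                            # hypothetical owner class of the methods above
vocab = bow.genWords(train_images, K=200)     # learn a K-word visual vocabulary
train_feats = bow.genFeatures(train_images)   # per-image word histograms, shape (N, K)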
def test_kmeans():
    from models.kmeans import KMeans

    # Three Gaussian blobs of 200 points each.
    x = np.random.randn(3, 200, 2)
    x[1] += np.array([2, 2])    # shift right by 2, up by 2
    x[2] += np.array([2, -2])   # shift right by 2, down by 2
    plot_scatter(x, 'Real')

    x = x.reshape(-1, 2)
    kmeans = KMeans(3)
    pred = kmeans.predict(x)
    centers = kmeans.centers
    plot_scatter_with_centers([x[pred == i] for i in [0, 1, 2]],
                              centers, 'Pred')
def run(self, argv, charts=False):
    try:
        opts, args = getopt.getopt(argv, "n:", ["name="])
    except getopt.GetoptError:
        print('Utilize o argumento de nome:\n python3 teste_setup -n <name>')
        sys.exit(2)
    if opts == [] or opts[0][0] != "-n" or len(opts) != 1:
        print('Utilize o argumento de nome corretamente:\n'
              ' python3 teste_setup -n <name>')
        sys.exit()
    name = opts[0][1]

    PERSON = name
    OUTPUTS = [
        'out/{}_results_length.txt'.format(PERSON),
        'out/{}_results_nclusters.txt'.format(PERSON)
    ]
    # Truncate any previous results.
    for fname in OUTPUTS:
        open(fname, 'w').close()

    EXECUTION_TIMES = 1
    NUM_COLUMNS = 2
    sample_data = [
        self.__generate_fake_data(NUM_COLUMNS, y) for y in self.tamanhos
    ]

    # Testing with different dataset sizes.
    model = KMeans(n_clusters=4, iterations=50)
    print('testing with different file lengths')
    for data in sample_data:
        print('current: ', data.shape[0])
        self.performance_test(OUTPUTS[0], data, EXECUTION_TIMES, model)

    # Testing with different numbers of clusters.
    print('testing with different n_clusters')
    print('data size: ', self.tamanhos[0])
    for n_clusters in [4, 6, 8, 10, 12]:
        print('current: ', n_clusters)
        model = KMeans(n_clusters=n_clusters, iterations=50)
        self.performance_test(OUTPUTS[1], sample_data[0],
                              EXECUTION_TIMES, model)

    if charts:
        for fname in OUTPUTS:
            f = np.loadtxt(fname).T
            fig, ax = plt.subplots()
            if 'length' in fname:
                # Fit a linear model to running time vs. dataset size.
                parametros, pcov = scipy_opt.curve_fit(
                    funcao_linear, xdata=self.tamanhos, ydata=f[0])
                self.aproximados = [
                    funcao_linear(x, *parametros) for x in self.tamanhos
                ]
                print("aproximados: {}".format(self.aproximados))
                print("parametros_otimizados: {}".format(parametros))
                print("pcov: {}".format(pcov))
                print()

                plt.suptitle(
                    "Tempo X Tamanho do dataset ({})\n".format(PERSON),
                    fontsize=14)
                plt.title("Iterações: {} N Clusters: {}".format(
                    int(f[2][0]), int(f[1][0])), fontsize=10)
                plt.plot(f[3], f[0], '-o', color='blue', mfc='b', mec='b',
                         markersize=4, linewidth=2)
                plt.xlim([min(f[3]), max(f[3])])
                plt.xlabel("Tamanho")
                plt.xticks(f[3])
                plt.ylim([0, 60])
                plt.ylabel("Tempo (s)")
                plt.plot(self.tamanhos, self.aproximados, '-o',
                         markersize=4, label="Teste", color="red")
                plt.legend(["k-means medido", "k-means aproximado"])
                plt.grid(linestyle='dotted')
                plt.savefig(fname[:-4] + '.png')
            else:
                # Fit a linear model to running time vs. number of clusters.
                n_clusters_values = [4, 6, 8, 10, 12]
                parametros, pcov = scipy_opt.curve_fit(
                    funcao_linear, xdata=n_clusters_values, ydata=f[0])
                self.aproximados = [
                    funcao_linear(x, *parametros) for x in n_clusters_values
                ]
                print("aproximados: {}".format(self.aproximados))
                print("parametros_otimizados: {}".format(parametros))
                print("pcov: {}".format(pcov))
                print()

                plt.suptitle(
                    "Tempo X Quantidade de clusters ({})\n".format(PERSON),
                    fontsize=14)
                plt.title("Iterações: {} Tamanho dataset: {}".format(
                    int(f[2][0]), int(f[3][0])), fontsize=10)
                plt.plot(f[1], f[0], '-o', color='purple', mfc='m', mec='m',
                         markersize=4, linewidth=2)
                plt.xlabel("N Clusters")
                plt.xticks(n_clusters_values)
                plt.xlim([min(f[1]), max(f[1])])
                plt.ylim([0, 60])
                plt.ylabel("Tempo (s)")
                plt.plot(n_clusters_values, self.aproximados, '-o',
                         markersize=4, label="Teste", color="red")
                plt.legend(["k-means medido", "k-means aproximado"])
                plt.grid(linestyle='dotted')
                plt.savefig(fname[:-4] + '.png')
            self.medias = f[0]
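# funcao_linear is called by run() above but not defined in this snippet; a
# minimal sketch consistent with the scipy_opt.curve_fit usage is a plain
# linear model (an assumption based on the name):
def funcao_linear(x, a, b):
    """Linear model fitted to the measured k-means running times."""
    return a * x + b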
import sys
import pathlib
import cProfile
import time

import pandas as pd
import numpy as np

sys.path.append(str(pathlib.Path().absolute().parent))

from models.kmeans import KMeans
from tests.utils import TestUtils

if __name__ == '__main__':
    model = KMeans(n_clusters=4, iterations=5, logging=True)

    dataset3 = np.loadtxt(
        'data/credit_card_customers/bank_churn.txt')  # 11 rows
    dataset2 = np.loadtxt(
        'data/customer_churn/customer_churn_processed.txt')  # 7,043 rows
    dataset1 = np.loadtxt('data/churn_modelling/test.txt')  # 10,000 rows
    dataset4 = np.loadtxt('data/cateter.txt')  # 11 rows
    fake_data = np.loadtxt('data/fake_data.txt')

    test_utils = TestUtils()

    # prof = cProfile.Profile()
    # prof.enable()
    # prof.run('model.fit(fake_data)')
    # prof.disable()
    # test_utils.write_stats_file(prof)
plt.pie(distribution, labels=pie_labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.xlabel('Cluster composition (cluster size {})'.format(cluster_size))
plt.show()

# ====================
# KMEANS
n_clusters = 2
model = KMeans(n_clusters, logging, max_iter=300)
model.fit(train_data, train_labels)
predictions = model.predict(test_data)

# Analyze the composition of each cluster.
possible_labels = [0, 1]
for cluster in range(n_clusters):
    cluster_members = {}
    cluster_size = 0
    for idx in range(len(predictions)):
        # Check whether this sample belongs to the current cluster.
        if predictions[idx] == cluster:
            # Count the occurrences of its true label.
            cluster_members[test_labels[idx]] = cluster_members.get(
                test_labels[idx], 0) + 1
def run(self, charts=False):
    # =========================================
    # Before running the test, set PERSON to
    # your name so the output files are named
    # accordingly.
    # =========================================
    PERSON = '\x00'
    OUTPUTS = [
        '../out/{}_results_length.txt'.format(PERSON),
        '../out/{}_results_nclusters.txt'.format(PERSON)
    ]
    # Truncate any previous results.
    for fname in OUTPUTS:
        open(fname, 'w').close()

    EXECUTION_TIMES = 1
    sample_data_lengths = [5000, 7500, 10000, 12500, 15000]
    NUM_COLUMNS = 2
    sample_data = [
        self.__generate_fake_data(NUM_COLUMNS, y)
        for y in sample_data_lengths
    ]

    # Testing with different dataset sizes.
    model = KMeans(n_clusters=4, iterations=50)
    print('testing with different file lengths')
    for data in sample_data:
        print('current: ', data.shape[0])
        self.performance_test(OUTPUTS[0], data, EXECUTION_TIMES, model)

    # Testing with different numbers of clusters.
    print('testing with different n_clusters')
    print('data size: ', sample_data_lengths[0])
    for n_clusters in [4, 6, 8, 10, 12]:
        print('current: ', n_clusters)
        model = KMeans(n_clusters=n_clusters, iterations=50)
        self.performance_test(OUTPUTS[1], sample_data[0],
                              EXECUTION_TIMES, model)

    if charts:
        for fname in OUTPUTS:
            f = np.loadtxt(fname).T
            fig, ax = plt.subplots()
            if 'nclusters' in fname:
                plt.suptitle(
                    "Tempo X Quantidade de clusters (PC-{})\n".format(PERSON),
                    fontsize=14)
                plt.title("Iterações: {} Tamanho dataset: {}".format(
                    int(f[2][0]), int(f[3][0])), fontsize=10)
                plt.bar(f[1], f[0], width=1.2, color="purple")
                plt.xlabel("N Clusters")
                plt.xticks(f[1])
                plt.ylabel("Tempo (s)")
                plt.yticks(f[0])
            else:
                plt.suptitle(
                    "Tempo X Tamanho do dataset (PC-{})\n".format(PERSON),
                    fontsize=14)
                plt.title("Iterações: {} N Clusters: {}".format(
                    int(f[2][0]), int(f[1][0])), fontsize=10)
                plt.plot(f[0], f[3], '-o', color='blue', mfc='r', mec='r',
                         markersize=8, linewidth=2)
                plt.ylabel("Tamanho")
                plt.xticks(f[0])
                plt.xlabel("Tempo (s)")
                plt.yticks(f[3])
            plt.savefig(fname + '.png')
import torch
import matplotlib.pyplot as plt

from models.kmeans import KMeans

torch.manual_seed(123)

model = KMeans(n_clusters=5, dimension=2)
X = torch.randn(100, 2) / 6
model.init_centroids(X)

fig, ax = plt.subplots(1, constrained_layout=True)
ax.set_title('k-means Clusters')
ax.set_xlabel('x')
ax.set_ylabel('y')

model.fit(X, 20)

# Color each cluster's points, then overlay the centroids in white.
colors = ["red", "green", "blue", "orange", "pink", "yellow"]
for k, c in model.cluster_objects.items():
    for (x, y), _ in c:
        ax.scatter(x, y, c=colors[k])
for x, y in model.centroids:
    ax.scatter(x, y, c='white', alpha=0.6, edgecolors='black', linewidths=2)

plt.show()
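# A minimal sketch (an assumed interface, not the repo's actual
# implementation) of a torch k-means exposing init_centroids / fit /
# centroids / cluster_objects the way the demo above uses them.
import torch

class SketchKMeans:
    def __init__(self, n_clusters, dimension):
        self.n_clusters = n_clusters
        self.dimension = dimension
        self.centroids = None
        self.cluster_objects = {}

    def init_centroids(self, X):
        # Choose k random points as the initial centroids.
        idx = torch.randperm(X.shape[0])[:self.n_clusters]
        self.centroids = X[idx].clone()

    def fit(self, X, epochs):
        # Standard Lloyd iterations; assumes epochs >= 1.
        for _ in range(epochs):
            # Assign each point to its nearest centroid.
            labels = torch.cdist(X, self.centroids).argmin(dim=1)
            # Move each centroid to the mean of its members.
            for k in range(self.n_clusters):
                members = X[labels == k]
                if len(members) > 0:
                    self.centroids[k] = members.mean(dim=0)
        # Group points per cluster as ((x, y), index) pairs, matching the
        # plotting loop above.
        self.cluster_objects = {
            k: [((p[0].item(), p[1].item()), i)
                for i, p in enumerate(X) if labels[i] == k]
            for k in range(self.n_clusters)
        }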
fig, ax = plt.subplots(1, constrained_layout=True)
ax.set_title('Encoder/Classifier Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

# Embed 1000 randomly sampled training images with the trained model.
X = []
with torch.no_grad():
    for item_idx, (img, _) in enumerate(random.sample(list(train_def), 1000)):
        # Send to device and add a batch dimension.
        img = img.to("cuda").unsqueeze(0)
        img_embedding, _, img_class = model(img)
        X.append((torch.flatten(img_embedding).cpu(),
                  img.squeeze(0).squeeze(0).cpu()))

# Cluster the embeddings with k-means using cosine distance.
cluster = KMeans(n_clusters=10)

# Find the initialization with the best entropy (the restart loop is
# currently disabled, so a single run is performed).
max_entropy = 0
max_centroids = None
cluster.init_centroids(X)
cluster.fit(X, 10, dist="cosine")
# if cluster.entropy() > max_entropy:
#     max_entropy = cluster.entropy()
#     max_centroids = cluster.centroids