def get_codebooks(instances, sizes):
    """Build one SURF codebook per requested size via k-medoids."""
    codebooks = list()
    list_sampled_features = list()
    n = len(instances)
    for x, instance in enumerate(instances):
        if x % 10 == 0:
            print "Processing (%d/%d) %.2f%% Images" % (x, n, 100. * x / n)
        deses = calc_surf_features(instance)[1]
        list_sampled_features.append(deses)
    sampled_features = np.concatenate(list_sampled_features)
    for size in sizes:
        # resample the descriptor pool for each codebook size
        np.random.shuffle(sampled_features)
        subset = sampled_features[:_num_surf_features_codebook]
        distances = scipy.spatial.distance.pdist(subset, 'cityblock')
        distances = scipy.spatial.distance.squareform(distances)
        indices = kmedoids.cluster(distances, k=size,
                                   maxIters=_max_k_medoids_iters)[1]
        codebook = subset[indices]
        codebooks.append(codebook)
    return codebooks
def worker(X, k, round):
    """Multithreading worker: cluster X into k medoids and cache the labels."""
    key = (methodName + "/" + dataset + "/individuals/" + methodName + "_" +
           dataset + "_k_" + str(k) + "_round_" + str(round))
    file = key + "_labels.csv"
    if os.path.exists(file):
        # a previous run already produced this file; skip recomputation
        return
    labels, medoids = kmedoids.cluster(X, k)
    np.savetxt(file, labels, fmt="%d")
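# Hypothetical driver for the worker above (an added sketch, not part of the
# original source): fan the cached k-medoids runs out over a process pool.
# The worker expects methodName, dataset, and the output directories to exist
# at module scope.
from multiprocessing import Pool

def run_all_rounds(X, ks, n_rounds=6):
    pool = Pool()
    for k in ks:
        for r in range(n_rounds):
            pool.apply_async(worker, (X, k, r))
    pool.close()
    pool.join()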
def construct_codebook(instances, codebook_size):
    # Pick codebook entries as the k-medoids of the pairwise
    # city-block distance matrix over sampled SURF descriptors.
    surf_feature_samples = sample_surf_features(instances)
    distances = scipy.spatial.distance.pdist(surf_feature_samples, 'cityblock')
    distances = scipy.spatial.distance.squareform(distances)
    indices = kmedoids.cluster(distances, k=codebook_size,
                               maxIters=_max_k_medoids_iters)[1]
    codebook = surf_feature_samples[indices]
    return codebook
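# A minimal self-contained sketch of the codebook pattern above, with random
# vectors standing in for SURF descriptors (an illustration, not the original
# pipeline). It assumes the kmedoids module from salspaugh/machine_learning
# (linked further below), whose cluster(distances, k) returns
# (cluster labels, medoid indices).
import numpy as np
import scipy.spatial.distance
import kmedoids

features = np.random.rand(200, 64)  # stand-ins for 64-d SURF descriptors
distances = scipy.spatial.distance.squareform(
    scipy.spatial.distance.pdist(features, 'cityblock'))
labels, medoid_indices = kmedoids.cluster(distances, k=10)
codebook = features[medoid_indices]  # 10 representative descriptors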
def create_codebook(indir):
    if _read_cache and os.path.exists(_cache_codebook_file):
        print "\tReading Cache"
        f = open(_cache_codebook_file)
        try:
            codebook = cPickle.load(f)
            f.close()
            print "Done\n"
            return codebook
        except Exception as e:
            print "\tError loading codebook:", e
            print "\tComputing From scratch"
            f.close()

    # sample some files
    im_files = list()
    for subdir in os.listdir(indir):
        rdir = os.path.join(indir, subdir)
        for f in os.listdir(rdir):
            if f.endswith(".jpg"):
                img_file = os.path.join(rdir, f)
                im_files.append(img_file)
    codebook_files = random.sample(
        im_files, int(_perc_docs_for_codebook * len(im_files)))

    # construct the codebook
    surfs = np.concatenate(
        map(lambda x: extract_surf_features(x)[1], codebook_files))
    np.random.shuffle(surfs)
    surfs = surfs[:_max_surf_features]
    distances = scipy.spatial.distance.pdist(surfs, 'cityblock')
    distances = scipy.spatial.distance.squareform(distances)
    indices = kmedoids.cluster(distances, k=_codebook_size,
                               maxIters=_max_k_medoids_iters)[1]
    codebook = surfs[indices]

    if _write_cache:
        print "\tWriting Codebook to Cache"
        f = open(_cache_codebook_file, 'w')
        try:
            cPickle.dump(codebook, f)
        except Exception as e:
            print "\tCould not write to cache:", e
        f.close()

    print "Done\n"
    return codebook
def construct_codebook(instances):
    print "Constructing Codebook"
    if _read_cache and os.path.exists(_cache_codebook_file):
        print "\tReading Cache"
        f = open(_cache_codebook_file)
        try:
            codebook = cPickle.load(f)
            f.close()
            print "Done\n"
            return codebook
        except Exception as e:
            print "\tError loading codebook:", e
            print "\tComputing From scratch"
            f.close()

    surf_feature_samples = sample_surf_features(instances)
    print "\tNumber of SURFs for codebook construction: ", surf_feature_samples.shape[0]

    if _use_k_medoids:
        print "\tComputing Distances"
        distances = scipy.spatial.distance.pdist(surf_feature_samples, 'cityblock')
        distances = scipy.spatial.distance.squareform(distances)
        print "\tDone\n"
        print "\tRunning Kmedoids"
        indices = kmedoids.cluster(distances, k=_codebook_size,
                                   maxIters=_max_k_medoids_iters)[1]
        codebook = surf_feature_samples[indices]
        print "\tDone\n"
    else:
        codebook = surf_feature_samples[:_codebook_size]

    if _write_cache:
        print "\tWriting Codebook to Cache"
        f = open(_cache_codebook_file, 'w')
        try:
            cPickle.dump(codebook, f)
        except Exception as e:
            print "\tCould not write to cache:", e
        f.close()

    print "Done\n"
    return codebook
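# Design note: construct_codebook and create_codebook above share a
# read-through cache: try to unpickle a previously saved codebook, fall back
# to recomputation on any failure, and re-pickle the result when done. The
# cache is keyed only on file existence, so it must be cleared by hand
# whenever _codebook_size or the sampling parameters change.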
class cleavage_site(object):
    index = 0
    name = ""
    structure = ""

    def __init__(self):
        pass


# Build a symmetric pairwise RNA-distance matrix over a 16-structure subset.
cleavage_site_list = pickle.load(
    open("F:\\JIC\\FoldTest\\cleavage_site_list.pickle", "rb"))
subset = cleavage_site_list[0:16]
upper_triangle = []
distances = np.zeros((16, 16))
for i in range(0, 16):  # column
    for j in range(i + 1, 16):
        distance = (RNADistance.RNADistance(subset[i].structure,
                                            subset[j].structure) * 1.0
                    + random.random())
        distances[i, j] = distance
        distances[j, i] = distance
        upper_triangle.append(distance)

# The computed matrix is then replaced by a precomputed 71-structure matrix
# loaded from disk before clustering.
distances = np.loadtxt("U:\\JICWork\\rna_distance_matrix_71_wt.txt")

import kmedoids
points, medoids = kmedoids.cluster(distances, 3)
#
# Test with mini-batch k-means
#
miniKmeans = MiniBatchKMeans(n_clusters=100)
label_mini = miniKmeans.fit_predict(X_treino)
print('Switching to MiniBatchKMeans gives: ',
      silhouette_score(X_treino, label_mini), '\n')

# MEDOIDS
# https://github.com/salspaugh/machine_learning/blob/master/clustering/kmedoids.py
distancia_treino = pairwise_distances(X_treino, metric='euclidean')
clusters, medoids = kmedoids.cluster(distancia_treino, 100)
print('Number of clusters: ', 100)
print('Switching to K-Medoids gives: ',
      silhouette_score(distancia_treino, clusters, metric='precomputed'), '\n')

#
# APPLYING PCA
#
for i in range(1, 100):
    pca = PCA(n_components=i / 100, svd_solver='full')
    X_treino_pca = pca.fit_transform(X_treino)
    kmeans_pca = KMeans(n_clusters=78)
    label_teste = kmeans_pca.fit_predict(X_treino_pca)
    # print('Applying PCA to our best model with clusters:', 78, 'and variance:', i / 100)
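# Added sketch (an assumption, not part of the original experiment): the same
# precomputed-distance silhouette can drive a sweep over k for k-medoids,
# mirroring the single k=100 comparison above.
for k in (10, 50, 100):
    labels_k, _ = kmedoids.cluster(distancia_treino, k)
    print(k, silhouette_score(distancia_treino, labels_k, metric='precomputed'))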
def cluster_rows_in_k(kdata, k):
    distances = compute_distances_in_k(kdata)
    return kmedoids.cluster(distances, k=k)
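# Usage sketch: cluster_rows_in_k passes the (labels, medoids) pair from
# kmedoids.cluster straight through, so a caller unpacks it directly.
# compute_distances_in_k is assumed to return a square distance matrix
# over the rows of kdata.
labels, medoids = cluster_rows_in_k(kdata, k=5)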
for round in range(0, 6):
    # method = KMeans(n_clusters=3)
    # method.fit(data)
    # centroids = method.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1], color=colors[round], s=169,
    #             marker=markers[round], linewidths=3, facecolor="None")
    # labels = method.labels_
    # centroids = []
    # for i in range(0, method.n_clusters):
    #     center = np.mean(data[labels == i], 0)
    #     centroids.append(center)
    #     plt.scatter(center[0], center[1], color=colors[round], s=169,
    #                 marker=markers[round], linewidths=3, facecolor="None")
    # plt.scatter(centroids[:, 0], centroids[:, 1], color=colors[i], s=169,
    #             marker="x", linewidths=3)
    labels, medoids = kmedoids.cluster(distances, 3)
    m_x = []
    m_y = []
    for m in medoids:
        m_x.append(data[m][0])
        m_y.append(data[m][1])
    plt.scatter(m_x, m_y, color=colors[round], s=169, marker=markers[round],
                linewidths=3, facecolor="None")

plt.title('spectral clustering and evenly distributed data')
# plt.xlim(x_min, x_max)