def LexicalFeatures():
    """Compute feature vectors for word and punctuation features."""
    num_chapters = len(chapters)
    fvs_lexical = np.zeros((len(chapters), 3), np.float64)
    fvs_punct = np.zeros((len(chapters), 3), np.float64)
    for e, ch_text in enumerate(chapters):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(ch_text.lower())
        words = word_tokenizer.tokenize(ch_text.lower())
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[e, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[e, 2] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
        # Semicolons per sentence
        fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
        # Colons per sentence
        fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))

    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)

    return fvs_lexical, fvs_punct
def test1(self):
    print "TEST 1:----------------------------------------------------------------"
    features = np.array([[1.9, 2.3], [1.5, 2.5], [0.8, 0.6],
                         [0.4, 1.8], [0.1, 0.1], [0.2, 1.8],
                         [2.0, 0.5], [0.3, 1.5], [1.0, 1.0]])
    whitened = whiten(features)
    book = np.array((whitened[0], whitened[2]))
    numpy_result = kmeans(whitened, book)[0]
    print numpy_result
    print ""

    features2 = np.array([[1.9, 2.3, 0], [1.5, 2.5, 0], [0.8, 0.6, 0],
                          [0.4, 1.8, 0], [0.1, 0.1, 0], [0.2, 1.8, 0],
                          [2.0, 0.5, 0], [0.3, 1.5, 0], [1.0, 1.0, 0]])
    whitened2 = whiten(features2)
    # take the initial centroids from the whitened 3-column data so the
    # dimensions match the observations passed to k_means2
    book2 = [whitened2[0], whitened2[2]]
    our_result = np.array(KMeans.k_means2(whitened2.tolist(), 2, book2).centroids)[:, :-1]
    print our_result
def clust_scatter(samples, clusters, allocation_table, n):
    c = len(allocation_table[0])  # Columns
    r = len(allocation_table)     # Rows

    time_scat_square = 0
    mat_scatter = 0

    for j in range(0, c):  # clusters
        for t in range(0, 10):  # maturities
            for p in range(0, r):  # samples within a cluster
                index = allocation_table[p, j]
                if index != 0:
                    time_scat_square += samples.samples[index - 1].scatter_maturity[t].scatter
            mat_scatter += time_scat_square ** 2
            time_scat_square = 0
        clusters.clusters[j].scatter = np.sqrt(mat_scatter - 10 * clusters.clusters[j].mean ** 2)
        mat_scatter = 0

        if n == 0 or n == 4999:
            print('clust scatter : ' + str(clusters.clusters[j].scatter))

    # Normalize clusters' scatter
    vec = np.zeros(4)
    for j in range(0, c):
        vec[j] = clusters.clusters[j].scatter
    vec = whiten(vec)  # whiten returns the scaled array; it does not modify in place
    for j in range(0, c):
        clusters.clusters[j].scatter = vec[j]

    return clusters
def sent_integrate(sim_matrix, n_class):
    # make the variance uniform across dimensions (whiten returns a new array)
    sim_matrix = whiten(sim_matrix)

    centroid, distortion = kmeans(sim_matrix, n_class, iter=100, thresh=1e-05)
    labels, dist = vq(sim_matrix, centroid)
    return labels
def parse(data_file_name, predict_index, ignore_indices, **options):
    data_file = open(data_file_name, 'r')
    lines = data_file.read().splitlines()
    x = []
    y = []
    for i, line in enumerate(lines):
        if i == 0 or i == 1:
            continue
        datas = line.split()
        x_category = []
        for i, data in enumerate(datas):
            if i in ignore_indices:
                continue
            if i == predict_index:
                if data == 'T':
                    y.append(1.0)
                elif data == 'F':
                    y.append(0.0)
                else:
                    y.append(float(data))
                continue
            x_category.append(float(data))
        x.append(x_category)

    x = whiten(np.array(x)) if options.get('whiten_x') else np.array(x)
    y = whiten(np.array(y)) if options.get('whiten_y') else np.array(y)
    x = x - x.mean() if options.get('mean_center_x') else x
    y = y - y.mean() if options.get('mean_center_y') else y
    return (x, y)
def compute_bic(self, D, means, labels, K, R):
    """Computes the Bayesian Information Criterion."""
    D = vq.whiten(D)
    Rn = D.shape[0]
    M = D.shape[1]

    if R == K:
        return 1

    # Maximum likelihood estimate (MLE)
    mle_var = 0
    for k in xrange(len(means)):
        X = D[np.argwhere(labels == k)]
        X = X.reshape((X.shape[0], X.shape[-1]))
        for x in X:
            mle_var += distance.euclidean(x, means[k])
            #print x, means[k], mle_var
    mle_var /= (float(R - K))

    # Log-likelihood of the data
    l_D = - Rn/2. * np.log(2*np.pi) - (Rn * M)/2. * np.log(mle_var) - \
        (Rn - K) / 2. + Rn * np.log(Rn) - Rn * np.log(R)

    # Params of BIC
    p = (K-1) + M * K + mle_var

    #print "BIC:", l_D, p, R, K

    # Return the bic
    return l_D - p/2. * np.log(R)
def process_options(args):
    options = argparser().parse_args(args)

    if options.max_rank is not None and options.max_rank < 1:
        raise ValueError('max-rank must be >= 1')
    if options.threshold is not None and options.threshold < 0.0:
        raise ValueError('threshold must be >= 0')
    if options.tolerance is not None and options.tolerance < 0.0:
        raise ValueError('tolerance must be >= 0')
    if options.approximate and not options.threshold:
        raise ValueError('approximate only makes sense with a threshold')
    if options.approximate and options.metric != 'cosine':
        raise NotImplementedError('approximate only supported for cosine')

    wv = wvlib.load(options.vectors[0], max_rank=options.max_rank)

    if options.normalize:
        logging.info('normalize vectors to unit length')
        wv.normalize()

    words, vectors = wv.words(), wv.vectors()

    if options.whiten:
        # whitening should be implemented in wvlib to support together with
        # approximate similarity
        if options.approximate:
            raise NotImplementedError
        logging.info('normalize features to unit variance')
        vectors = whiten(vectors)

    return words, vectors, wv, options
def createdatabase():
    X_train = detectcompute(train1)

    print "Clustering the data with K-means"
    codebook, distortion = kmeans(whiten(X_train), k)
    print "Done.\n"

    imtrain = singledetect(test1)
    Pdatabase = bow(imtrain, codebook, k)  # Pseudo database with list structure

    # Writing to html table
    print "Converting the database into a HTML file"
    htmltable = open("table.htm", "r+")
    begin = "<html><body><table cellpadding=5><tr><th>Filename</th><th>Histogram</th></tr>"
    htmltable.write(begin)

    for i in range(len(Pdatabase)):
        middle = "<tr><td>%(filename)s</td><td>%(histogram)s</td></tr>" % {
            "filename": Pdatabase[i][0], "histogram": Pdatabase[i][-1]}
        htmltable.write(middle)

    end = "</table></body></html>"
    htmltable.write(end)
    htmltable.close()
    print "Done.\n"

    codebook_to_file(codebook)
def bow(images, codebook, clusters):
    out = images
    temp = []

    print "-" * 60
    print "Creating the pseudo database."

    for im in images:
        c = Counter()
        bag, dist = vq(whiten(im[1]), codebook)
        for word in bag:
            c[word] += 1

        # Creating histograms
        for i in range(clusters):
            if i in c.iterkeys():
                c[i] = c[i] / sum(c.values())
            if i not in c.iterkeys():
                c[i] = 0

        temp.append(c)

    for i in range(len(temp)):
        out[i].append(temp[i])

    print "Done.\n"
    return out
def generateCodebook(self, features): """ Generate codebook using extracted features """ codebook = None if self._codebookGenerateMethod == 'k-means': # # Codebook generation using scipy k-means # while run: # try: # # Set missing = 'raise' to raise exception # # when one of the clusters is empty # whitenedFeatures = whiten(features) # codebook, _ = kmeans2(whitenedFeatures, # self._codebookSize, # missing = 'raise') # # # No empty clusters # run = False # except ClusterError: # # If one of the clusters is empty, re-run k-means # run = True # Codebook generation using sklearn k-means whitenedFeatures = whiten(features) kmeans = MiniBatchKMeans(n_clusters = config.codebookSize) kmeans.fit(whitenedFeatures) codebook = kmeans.cluster_centers_ else: pass self._codebook = codebook
def kmeans(d, headers, K, metric, whiten=True, categories=None):
    '''Takes in a Data object, a set of headers, and the number of clusters to create.
    Computes and returns the codebook, codes and representation errors.
    If given an Nx1 matrix of categories, it uses the category labels
    to calculate the initial cluster means.
    '''
    # assign to A the result of getting the data given the headers
    try:
        A = d.get_data(headers)
    except AttributeError:
        A = d

    if whiten:
        W = vq.whiten(A)
    else:
        W = A

    codebook = kmeans_init(W, K, categories)

    # assign to codebook, codes, errors, the result of calling kmeans_algorithm with W and codebook
    codebook, codes, errors = kmeans_algorithm(W, codebook, metric)

    # return the codebook, codes, and representation error
    return codebook, codes, errors
def custom():
    _items = {}
    users = []
    for line in open('my_items_likehood.txt'):
        user, item = keys(line)
        users.append(user)
        if item in _items:
            _items[item].append(user)
        else:
            _items[item] = [user]

    sorted_users = sorted(users)
    l = len(sorted_users)

    items = {}
    count = 0
    features = []
    for item in _items:
        features.append(user_matrix(l, _items[item], sorted_users))
        if count == 100:
            break
        count += 1

    print 'whiten'
    whitened = whiten(array(features))
    print 'kmeans'
    print kmeans(whitened)

    print "%d items voted by %d users" % (len(items), len(users))
def normalize(data, mode="pca", n=10):
    """Normalize and reduce data by PCA."""
    if mode == "whiten":
        res = whiten(data)
    elif mode == "pca":
        v, P, res = pca_train(data, n, 0, 1)
        print v
        print "eigen ratio is ", v[n-1] / v[0]
    elif mode == "pca_whiten":
        v, P, proj = pca_train(data, n, 0, 1)
        res = whiten(proj)
    else:
        res = np.array(data)
    return res
def test_kmeans():
    obs = sp.random.uniform(0, 10, (1000, 2))
    # knum = 7
    obs = scvq.whiten(obs)

    # run kmeans with different numbers of clusters
    for knum in range(2, 8):
        codebook, dist = scvq.kmeans(obs, knum)
        ind, dist = scvq.vq(obs, codebook)

        # visualize
        # plt.ion()
        plt.ioff()
        plt.figure(knum)
        colors = ["b*", "g+", "ro", "yp", "ms", "ch", "wx"]

        for icluster in range(knum):
            x = (ind == icluster).nonzero()[0]
            plt.plot(obs[x, 0], obs[x, 1], colors[icluster])
            for iline in range(sp.size(x)):
                plt.plot([obs[x[iline], 0], codebook[icluster, 0]],
                         [obs[x[iline], 1], codebook[icluster, 1]], "k--")

        # the cluster centroids
        plt.plot(codebook[:, 0], codebook[:, 1], "ko")

        # the plot size
        plt.xlim((-0.3, 3.8))
        plt.ylim((-0.3, 3.8))
        plt.show()
def kmeans(dataset, n_cluster=625):
    from scipy.cluster.vq import kmeans2, whiten
    feature_matrix = numpy.asarray(dataset)
    whitened = whiten(feature_matrix)
    # use the n_cluster argument instead of a hard-coded 625
    _, cluster_labels = kmeans2(whitened, n_cluster, iter=100)
    return cluster_labels
def _get_jump(feat_array, max_cluster): if max_cluster < 2: max_cluster = self._determine_max_k(feat_array) whitened = whiten(feat_array) # first obtain the covariance matrix of the feature array gamma = np.cov(whitened.T) num_dim = whitened.shape[1] jump = {} distortions_dict = {0: 1} power_fact = -num_dim / 2.0 # Run k mean for all possible number of clusters for k in xrange(1, max_cluster + 1): codebook, _ = kmeans(whitened, k, iter=self.iter) code, _ = vq(whitened, codebook) clusters_dict = self._segment_to_clusters(whitened, code) mahalanobis_dist_list = [] for cid, cvals in clusters_dict.iteritems(): centroid = codebook[cid] cluster_mahalanobis_dist = map( lambda x: self._sq_mahalanobis(x, centroid, gamma), clusters_dict[cid].values) mahalanobis_dist_list.extend(cluster_mahalanobis_dist) this_distortion = np.mean(mahalanobis_dist_list) / num_dim distortions_dict[k] = this_distortion ** power_fact for k in xrange(1, max_cluster + 1): jump[k] = distortions_dict[k] - distortions_dict[k - 1] return jump
def clustering_scipy_kmeans(features, n_clust = 8): """ """ whitened = whiten(features) print whitened.shape initial = [kmeans(whitened,i) for i in np.arange(1,12)] plt.plot([var for (cent,var) in initial]) plt.show() #cent, var = initial[3] ##use vq() to get as assignment for each obs. #assignment,cdist = vq(whitened,cent) #plt.scatter(whitened[:,0], whitened[:,1], c=assignment) #plt.show() codebook, distortion = kmeans(whitened, n_clust) print codebook, distortion assigned_label, dist = vq(whitened, codebook) for ii in range(8): plt.subplot(4,2,ii+1) plt.plot(codebook[ii]) plt.show() centroid, label = kmeans2(whitened, n_clust, minit = 'points') print centroid, label for ii in range(8): plt.subplot(4,2,ii) plt.plot(centroid[ii]) plt.show()
def perform(self):
    print "Start KMeans"
    data = whiten(self.seeds)  # normalizing the data
    self.centro, self.sens = kmeans(data, self.k)
    self.matrix, _ = vq(data, self.centro)
    self.resp = self.centro[self.matrix]
    print "Sensibilidade: " + str(self.sens)
def _get_cluster(self, feat_array, k):
    # Normalise the feature array
    whitened = whiten(feat_array)
    codebook, _ = kmeans(whitened, k, iter=self.iter)
    code, _ = vq(whitened, codebook)
    return code
def _get_k_means_centroids(self):
    whitened_set = whiten(self.training_set)
    centroids, _ = kmeans(obs=whitened_set,
                          k_or_guess=self.class_number,
                          iter=self.max_iteration_number,
                          thresh=self.training_error)
    return centroids
def sparse_run(g, pos1): g2 = sparse_graph(g) # pos1 = nx.spring_layout(g) pos2 = nx.spring_layout(g2) features = [] for u in g2.nodes_iter(): # print type(u) # print u # print pos[u] features.append(pos2[u]) print "featurs:", len(features) features = ny.array(features) method = 2 if method == 1: whitened = whiten(features) book = ny.array((whitened[0],whitened[2])) km = kmeans(whitened, book) print km elif method == 2: n_digits = 4 km = KMeans(init='k-means++', n_clusters=n_digits, n_init=10) res = km.fit(features) print len(km.labels_), km.labels_ print res return km.labels_, g2
def kmeans1():
    features = array([[1.9, 2.3],
                      [1.5, 2.5],
                      [0.8, 0.6],
                      [0.4, 1.8],
                      [0.1, 0.1],
                      [0.2, 1.8],
                      [2.0, 0.5],
                      [0.3, 1.5],
                      [1.0, 1.0]])
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # expected result:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def kmeans2():
    features = locations()
    whitened = whiten(features)
    book = array((whitened[0], whitened[2]))
    kmeans(whitened, book)
    # expected result:
    # (array([[ 2.3110306 ,  2.86287398],
    #         [ 0.93218041,  1.24398691]]), 0.85684700941625547)
def k_means_cluster(data_list): if max(data_list[0])-min(data_list[0])>10 and max(data_list[1])-min(data_list[1])>10: array_diagnal=array([[data_list[0][x],data_list[1][x]] for x in range(len(data_list[0]))]) ks = range(1,min([5,len(data_list[0])+1])) KMeans = [cluster.KMeans(n_clusters = i, init="k-means++").fit(array_diagnal) for i in ks] KMeans_predict=[cluster.KMeans(n_clusters = i, init="k-means++").fit_predict(array_diagnal) for i in ks] BIC=[] BIC_rec=[] for x in ks: if KMeans_predict[x-1].max()<x-1: continue else: BIC_i=compute_bic(KMeans[x-1],array_diagnal) if abs(BIC_i)<10**8: BIC.append(BIC_i) BIC_rec.append(x) #BIC = [compute_bic(kmeansi,array_diagnal) for kmeansi in KMeans] #ks_picked=ks[BIC.index(max(BIC))] ks_picked=BIC_rec[BIC.index(max(BIC))] if ks_picked==1: return [data_list] else: out=[] std_rec=[scipy.std(data_list[0]),scipy.std(data_list[1])] whitened = whiten(array_diagnal) centroids, distortion=kmeans(whitened,ks_picked) idx,_= vq(whitened,centroids) for x in range(ks_picked): group1=[[int(i) for i in array_diagnal[idx==x,0]],[int(i) for i in array_diagnal[idx==x,1]]] out.append(group1) return out else: return [data_list]
def kmeansCluster(self, layer, distance, number): import scipy import scipy.cluster.hierarchy as sch from scipy.cluster.vq import vq,kmeans,whiten import numpy as np count = layer.featureCount() self.setProgressRange(count) points = [] for f in layer.getFeatures(): geom = f.geometry() x = geom.asPoint().x() y = geom.asPoint().y() point = [] point.append(x) point.append(y) points.append(point) self.updateProgress() distances = {0:'euclidean', 1:'cityblock', 2:'hamming'} disMat = sch.distance.pdist(points, distances.get(distance))#'euclidean''cityblock''hamming''cosine' Z=sch.linkage(disMat,method='average') P=sch.dendrogram(Z) cluster= sch.fcluster(Z, t=1, criterion='inconsistent') data=whiten(points) centroid=kmeans(data, number)[0] label=vq(data, centroid)[0] return centroid, label
def new_labelled_page(no_of_samples:int, window_size:int, page_scale:int or tuple, labelled_centroids:[tuple], page_paths:[str]): ### Duplication from above weighter = gaussian_weighter(window_size) windower = f.partial(win_centred_on, window=window_size) shifter = f.partial(point_shift, window=window_size) scaler = img_scaler(page_scale) make_observations = compose(prepare_features, real_fft, weighter, std_dev_contrast_stretch) img, label = open_image_label(*page_paths) img, label = scaler(img, label) f_img = prepare_fft_image(img, window_size) access_img = img_accessor(img, identity) access_label = img_accessor(label, identity) access_f_img = img_accessor(f_img, compose(windower, shifter)) ### End of duplication labels = [a[0] for a in labelled_centroids] centroids = np.asarray([a[1] for a in labelled_centroids]) new_label = np.zeros_like(label) for s in img_slices(new_label.shape, 80): unlabelled_samples = sample_all_in_area(s, applier(identity, compose(make_observations, access_f_img))) coords = [a[0] for a in unlabelled_samples] observations = vq.whiten(np.asarray([a[1] for a in unlabelled_samples])) codes, dist = vq.vq(observations, centroids) for i, code in zip(coords, codes): new_label[i] = labels[code] return new_label
def do_cluster(cluster_count, filename):
    """Use the scipy k-means clustering algorithms to cluster data.

    Return the item names for the smallest cluster.
    """
    input = Data(filename, -1)
    d = vq.whiten(input.data.transpose())
    codebook, avg_distortion = vq.kmeans(d, cluster_count, 150)
    codes, distortions = vq.vq(d, codebook)
    # codes is now a vector of cluster assignments
    # it is ordered the same as data elements in input

    c_sizes = {}
    small_i = 0
    if DEBUG: print "Cluster Sizes: ",
    for i in range(cluster_count):
        c_sizes[i] = count(codes, i)
        if DEBUG: print c_sizes[i],
    if DEBUG: print

    for i in range(cluster_count):
        if c_sizes[i] < c_sizes[small_i]:
            small_i = i

    if DEBUG: print "Smallest cluster size: " + str(c_sizes[small_i])

    return [input._names[i] for i in findall(codes, small_i)]
def cluster(self, graph): """ Take a graph and cluster using the method in "On spectral clusering: analysis and algorithm" by Ng et al., 2001. :param graph: the graph to cluster :type graph: :class:`apgl.graph.AbstractMatrixGraph` :returns: An array of size graph.getNumVertices() of cluster membership """ L = graph.normalisedLaplacianSym() omega, Q = numpy.linalg.eig(L) inds = numpy.argsort(omega) #First normalise rows, then columns standardiser = Standardiser() V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T V = vq.whiten(V) #Using kmeans2 here seems to result in a high variance #in the quality of clustering. Therefore stick to kmeans centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans) clusters, distortion = vq.vq(V, centroids) return clusters
def recognize(wavfn): samplerate, w = wavfile.read(open(wavfn)) mfcc = run_mfcc(samplerate, w, FRAME_SIZE, STEP, NUM_COEFFICIENTS) sample_length = mfcc.shape[0] whitened = vq.whiten(mfcc) def getfile(x): return os.path.join(DATADIR, x) sq_sum_candidates = [] cos_sim_candidates = [] for dirname in os.listdir(DATADIR): codebook_fn = os.path.join(DATADIR, dirname, CODEBOOK_FN) if not os.path.isfile(codebook_fn): continue codebook, dist_1 = numpy.load(open(codebook_fn, 'rb')) code, dist = vq.vq(whitened, codebook) sq_sum_candidates.append((sum(dist*dist)/sample_length, dirname)) cos_dist = [] for c, d, w in zip(code, dist, whitened): cdist = cosine_distance(codebook[c], w) cos_dist.append(cdist) cdista = numpy.array(cos_dist) cos_sim_candidates.append((sum(cdista)/sample_length, dirname)) #print 'Order by square-sum error ascending:' #for score, person in sorted(sq_sum_candidates): #print '\t', score, person print 'Cosine similarity' for score, person in sorted(cos_sim_candidates, reverse=True): print '\t', score, person
def manfredor(list_obj, rules, num_cluster=10): score_list = [] for obj in list_obj: score_list.append(obj.computeScore(rules)) #Normalize observations whitened = scv.whiten(score_list) #Compute Kmeans on the set of observations #centroids contains the center of each cluster centroids, _ = scv.kmeans(whitened, num_cluster) #Assign each sample to a cluster idx,_ = scv.vq(whitened, centroids) #Get index that will sort centroids rank = np.argsort(centroids) #Map a centroid to a rank rank_mapping = dict(zip([c for c in centroids], rank)) clustered = {} i = 0 for obj in list_obj: cluster_of_obs = idx[i] centroid = centroids[cluster_of_obs] #map url to rank clustered[obj.url] = rank_mapping[centroid] i += 1 sorted_cluster = sorted(clustered.iteritems(), key=operator.itemgetter(1)) return sorted_cluster
def initialize(self, poses, rest_pose, num_bones, iterations, mayaMesh=None, jointList=None): bones = [] num_verts = rest_pose.shape[0] # shape mean array scale num_poses = poses.shape[0] bone_transforms = np.empty( (num_bones, num_poses, 4, 3)) # [(R, T) for for each pose] for each bone # 3rd dim has 3 rows for R and 1 row for T # Use k-means to assign bones to vertices whitened = whiten(rest_pose) codebook, _ = kmeans(whitened, num_bones) rest_pose_corrected = np.empty( (num_bones, num_verts, 3)) # Rest pose - mean of vertices attached to each bone # confirm mode if mayaMesh: #rigid Skin vert_assignments, bones = self.manual_codebook(mayaMesh, jointList) boneArray = [] for i in bones: boneArray.append(cmds.xform(i, q=1, t=1, ws=1)) self.rest_bones_t = np.array(boneArray) #rest_bones_t = np.empty((num_bones , 3)) for bone in range(num_bones): #rest_bones_t[bone] = np.mean(rest_pose[vert_assignments == bone] , axis = 0) self.rest_bones_t[bone] = np.array(boneArray[bone]) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) else: # Compute initial random bone transformations vert_assignments, _ = vq( whitened, codebook) # Bone assignment for each vertex (|num_verts| x 1) self.rest_bones_t = np.empty( (num_bones, 3)) # Translations for bones at rest pose for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): bone_transforms[bone, pose] = self.kabsch( rest_pose_corrected[bone, vert_assignments == bone], poses[pose, vert_assignments == bone]) for it in range(iterations): # Re-assign bones to vertices using smallest reconstruction error from all poses constructed = np.empty( (num_bones, num_poses, num_verts, 3)) # |num_bones| x |num_poses| x |num_verts| x 3 for bone in range(num_bones): Rp = bone_transforms[bone, :, :3, :].dot( (rest_pose - self.rest_bones_t[bone]).T).transpose( (0, 2, 1)) # |num_poses| x |num_verts| x 3 # R * p + T constructed[bone] = Rp + bone_transforms[bone, :, np.newaxis, 3, :] errs = np.linalg.norm(constructed - poses, axis=(1, 3)) # position value average vert_assignments = np.argmin(errs, axis=0) # For each bone, for each pose, compute new transform using kabsch for bone in range(num_bones): self.rest_bones_t[bone] = np.mean( rest_pose[vert_assignments == bone], axis=0) rest_pose_corrected[bone] = rest_pose - self.rest_bones_t[bone] for pose in range(num_poses): P = rest_pose_corrected[bone, vert_assignments == bone] Q = poses[pose, vert_assignments == bone] if (P.size == 0 or Q.size == 0): print 'Skip Iteration' else: bone_transforms[bone, pose] = self.kabsch(P, Q) # jointList is correct Index Joint return bone_transforms, self.rest_bones_t, bones
def search(query, n=40, start=0): # retrieve top n results of query # default is 40 results per page dict_res = BossImageIndex().CallBoss(query, n, start) im_res = dict_res['ysearchresponse']['resultset_images'] res = [] for i in xrange(n): res.append((im_res[i]['thumbnail_url'], i)) #path_name = "/Library/WebServer/results/"+query path_name = "/Users/novi/my_image_search/results/" + query # create the folder (if does not exist) to save query results if os.path.isdir(path_name): shutil.rmtree(path_name) os.mkdir(path_name) else: os.mkdir(path_name) # download the image results image = urllib.URLopener() silentcounter = 1 imagefile = [] for counter in xrange(n): urltoberetrieved = res[counter][0] #print urltoberetrieved filename = '%s/%s.%s' % (path_name, silentcounter, 'jpg') #try: image.retrieve(urltoberetrieved, filename) imagefile.append(filename) silentcounter = silentcounter + 1 #except IOError: # print 'error at %s \n' % (urltoberetrieved) # pass # prepare the color image feature pref = numpy.array([[0, 0]]) # [image #,position #] ldesc = [] codes = 30 #number of k-means cluster ino = 5 jno = 8 # default grid: 5 by 8 2D grid show = ino * jno lim = show silentcounter = 1 for i_img in xrange(lim): fname = imagefile[i_img] try: im = cv.LoadImage(fname, 0) # loading with OpenCV (gray chanel only) silentcounter = silentcounter + 1 except: print 'image thumbnail can not be retrieved' sys.exit(0) #resizing the image #om = cv.CreateImage((psize,psize),im.depth,im.nChannels) #cv.Resize(im,om,cv.CV_INTER_CUBIC) storage = cv.CreateMemStorage(0) #generating the mask #mat = cv.CreateMat(psize,psize,cv.CV_8UC1) #extracting SURF feature #[keypoints,descriptors] = cv.ExtractSURF(om,mat,storage,(1,500,3,4)) [keypoints, descriptors] = cv.ExtractSURF(im, im, storage, (1, 500, 3, 4)) ldesc.append(descriptors) #perform vector quantization tarrdesc = [numpy.array(ldesc[i]) for i in range(show)] lendesc = [ldesc[i].__len__() for i in range(show)] arrdesc = numpy.concatenate([tarrdesc[i] for i in range(show)]) arrdesc = whiten(arrdesc) [codebook, distortion] = kmeans(arrdesc, codes) [code, dist] = vq(arrdesc, codebook) #generate the semantic feature imgdata = numpy.zeros((show, codebook.shape[0]), dtype=float) code_offset = 0 for i_img in xrange(show): code_index = range(code_offset, code_offset + lendesc[i_img]) for i_code in code_index: imgdata[i_img, code[i_code]] = imgdata[i_img, code[i_code]] + 1 code_offset = code_offset + lendesc[i_img] #normalize the semantic feature sumimgdata = numpy.sum(imgdata, axis=1) sumimgdata.shape = show, 1 imgdata = imgdata / sumimgdata griddata = numpy.zeros((2, ino * jno)) griddata[0, ] = numpy.kron(range(1, ino + 1), numpy.ones((1, jno))) griddata[1, ] = numpy.tile(range(1, jno + 1), (1, ino)) # do kernelized sorting procedure PI = KS(imgdata, griddata.T, pref) i_sorting = PI.argmax(axis=1) #creating the passed dictionary sorted_dict_res = {} sorted_dict_res['count'] = dict_res['ysearchresponse']['count'] sorted_dict_res['totalhits'] = dict_res['ysearchresponse']['totalhits'] sorted_dict_res['start'] = dict_res['ysearchresponse']['start'] sorted_dict_res['resultset_images'] = [ dict_res['ysearchresponse']['resultset_images'][i] for i in i_sorting ] return sorted_dict_res
topic_word = model.topic_word_ for i in range(len(doc_topic)): topic_most_pr = doc_topic[i].argsort() keywords = [topic_word[topic_most_pr[n]].argmax() for n in range(K)] ##话题中概率最大的词 print('*keywords {}'.format([word[n] for n in keywords])) ### cluster candidates words by topic/svd from scipy import spatial from scipy.cluster.vq import kmeans, vq, whiten word_topic = topic_word.transpose() # 词-话题向量 for n in range(len(doc_topic)): keywords = [] data = [(w, word.index(w)) for w in candidates[n] if w in word] cand_vec = word_topic[[w[1] for w in data], :] # 候选词-话题向量 centroids, _ = kmeans(whiten(cand_vec), K) for i in range(K): min_dist = 100 near_word = -1 for j in range(len(cand_vec)): a = np.dot(centroids[i, :], cand_vec[j, :]) if a <= min_dist and j not in keywords: min_dist = a near_word = j keywords.append(near_word) keywords = [data[w][0] for w in keywords] print('*keywords {}'.format(keywords))
'''FIFA 18 is a football video game that was released in 2017 for PC and consoles.
The dataset that you are about to work on contains data on the 1000 top individual
players in the game. You will explore various features of the data as we move ahead
in the course. In this exercise, you will work with two columns, eur_wage, the wage
of a player in Euros, and eur_value, their current transfer market value.

The data for this exercise is stored in a Pandas dataframe, fifa.
whiten from scipy.cluster.vq and matplotlib.pyplot as plt have been pre-loaded.'''

import pandas as pd
from scipy.cluster.vq import whiten
from matplotlib import pyplot as plt

fifa = pd.read_csv(
    '/Users/vaibhav/Desktop/Python Projects/DataCamp-Data Scientist with python/26-Cluster Analysis in Python /Introduction to Clustering /fifa.csv',
    index_col=0)

# Scale wage and value
fifa['scaled_wage'] = whiten(fifa['eur_wage'])
fifa['scaled_value'] = whiten(fifa['eur_value'])

# Plot the two columns in a scatter plot
fifa.plot(x='scaled_wage', y='scaled_value', kind='scatter')
plt.show()
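# A minimal added sketch (assuming the fifa dataframe loaded above) of what whiten does:
# it divides each feature by its standard deviation across all observations, so the
# scaled columns end up with roughly unit variance. Illustrative check only, not part
# of the original exercise.
import numpy as np

std_wage = fifa['eur_wage'].std(ddof=0)      # whiten uses the population std (ddof=0)
manual_scaled = fifa['eur_wage'] / std_wage
assert np.allclose(manual_scaled, fifa['scaled_wage'])
print(fifa['scaled_wage'].std(ddof=0))       # ~1.0 after whitening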
data = json.load(f) print(type(data)) i = 0 usr_prob = np.ndarray((1334, 5)) for usr in data: usr_prob[i] = usr["problems"] i += 1 # print(i) # print(usr_prob) # print(data) # usr_data = np.array(usr_prob) x, y = kmeans2(whiten(usr_prob), 5, iter=20) # y.dtype = np.int64 print(type(y)) y = y.tolist() # print(x) print(y[0]) print(len(y)) # # # print(data["coordinates"]["lat"]) # j = 0 locn_cluster = {"lat": [], "long": [], "cluster": []} # print(len(y))
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import vq, kmeans, whiten
import numpy as np
import matplotlib.pylab as plt

# Data points to be clustered: cancer.csv has 653 rows, each with 11 dimensions
dataset = np.loadtxt('cancer.csv', delimiter=",")

# numpy indexing starts at 0; column 0 is a row id and column 10 is the label,
# so the features are columns 1 to 9
points = dataset[:, 1:9]
cancer_label = dataset[:, 10]
print "points:\n", points
print "cancer_label:\n", cancer_label

# k-means clustering
# normalize the raw data
data = whiten(points)

# Cluster with the kmeans function: the first argument is the data, the second the number of clusters k.
# Sometimes we do not know in advance how many clusters there should be; one option is to initialize
# from a hierarchical clustering result, or simply pass a number directly.
# kmeans returns a pair (centroids, distortion); we only keep the centroids, hence the [0].
#centroid = kmeans(data,max(cluster))[0]
centroid = kmeans(data, 2)[0]
print centroid

# Use vq to assign every observation to a centroid; vq also returns a pair,
# and [0] is the label of each observation.
label = vq(data, centroid)[0]

num = [0, 0]
for i in label:
    if i == 0:
        num[0] = num[0] + 1
    else:
        num[1] = num[1] + 1
print 'num =', num
#and if our contour aspect ratio is "long" we draw the bounding box #note that this just checks the vert/horiz direction at present. x,y,w,h = cv2.boundingRect(c) vert_aspect_ratio = float(h)/w horiz_aspect_ratio = float(w)/h #print 'x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h) #if horiz_aspect_ratio > 2 or vert_aspect_ratio > 2: #cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,255),2) #draw in all contours to see how they fall #contour_sizes.append([float(x)*4,float(y)*4,max(float(w),float(h))])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_sizes.append([cx*8.0,cy*8.0,max(float(w),float(h))/8.0])#,horiz_aspect_ratio,vert_aspect_ratio]) contour_lookup.append(c) #contour_sizes.append([float(x),float(w),float(h)])#,horiz_aspect_ratio]) #cv2.drawContours(img,[c],0,(0,255,0),1) whitened_contour_sizes = clustering.whiten(contour_sizes) #print str(contour_sizes) # let scipy do its magic (k==3 groups) centers,dist = clustering.kmeans(whitened_contour_sizes,75,iter=100) code, distance = clustering.vq(whitened_contour_sizes,centers) #print str(centroid) #print str(code) #print 'contours is ' + str(len(contour_sizes)) + ' and code is ' + str(len(code)) colors = [( int(random.uniform(0, 255)),int(random.uniform(0, 255)),int(random.uniform(0, 255))) for i in code ] #print str(colors) for i, label in enumerate(code): color = colors[label] x,y,w,h = cv2.boundingRect(contour_lookup[i])
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 02:02:38 2019

@author: js
"""
import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten

list1 = [88.0, 74.0, 96.0, 85.0]
list2 = [92.0, 99.0, 95.0, 94.0]
list3 = [91.0, 87.0, 99.0, 95.0]
list4 = [78.0, 99.0, 97.0, 81.0]
list5 = [88.0, 78.0, 98.0, 84.0]
list6 = [100.0, 95.0, 100.0, 92.0]

data = np.array([list1, list2, list3, list4, list5, list6])
# store the result in its own variable to avoid shadowing the whiten function
whitened = whiten(data)

centroids, _ = kmeans(whitened, 2)
result, _ = vq(whitened, centroids)
print(result)
def analyze_color(input_image, transparency_threshold=50, plot_3d=False, plot_bar=True, n_cluster=None, max_cluster=10, ignore_pure_black=True, use_sample=True, return_colors=True): # Copy to prevent modification (useful but mechanism needs clarification) input_image = input_image.copy() # Check input shape assert (len(input_image.shape) == 3) assert (input_image.shape[-1] in {3, 4}) # Turn color info of pixels into dataframe, filter by transparency if RGBA image is passed if input_image.shape[-1] == 4: color_df = pd.DataFrame(input_image.reshape(-1, 4), columns=list('rgba')) # Get the rgb info of pixels in the non-transparent part of the image color_df = color_df[color_df['a'] >= transparency_threshold] if input_image.shape[-1] == 3: color_df = pd.DataFrame(input_image.reshape(-1, 3), columns=list('rgb')) if ignore_pure_black: color_df = color_df[~((color_df['r'] == 0) & (color_df['g'] == 0) & (color_df['b'] == 0))] # Handle large pixel color_df if not use_sample and len(color_df) > 1e5: sample_or_not = (input( 'Large image detected, would you like to sample the pixels in this image? (Y/N) ' )).lower()[0] == 'y' if sample_or_not: print( 'Sampled 100,000 pixels from the image, note that you can also resize the image before passing it to this function.' ) color_df = color_df.sample(n=int(1e5), random_state=0) else: print( 'Not sampling performed, but note that rendering 3D plot for the pixels may crash your session and K-means clustering will be slow.' ) # Get std for reverse-transform the kmeans results to a meaningful rgb palette r_std, g_std, b_std = color_df[list('rgb')].std() reverse_whiten_array = np.array((r_std, g_std, b_std)) # Normalize observations on a per feature basis, forcing features to have unit variance # Doc: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.vq.whiten.html for color in list('rgb'): color_df['scaled_' + color] = whiten(color_df[color]) ## 3D scatter plot showing color groups if plot_3d: trace = go.Scatter3d( x=color_df['r'], y=color_df['g'], z=color_df['b'], mode='markers', marker=dict(color=[ 'rgb({},{},{})'.format(r, g, b) for r, g, b in zip(color_df['r'].values, color_df['g'].values, color_df['b'].values) ], size=1, opacity=1)) layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0)) fig = go.Figure(data=[trace], layout=layout) fig.show() ## Use K-means to identify main colors cluster_centers_list = [] avg_distortion_list = [] if n_cluster != None: n_cluster_range = [n_cluster - 1] # note minus 1 to get exactly n else: n_cluster_range = range(max_cluster + 1) if plot_bar: # Initialize plt graph f, ax = plt.subplots(len(n_cluster_range), 1, figsize=(10, 10)) for n in n_cluster_range: ###### Train clusters ###### cluster_centers, avg_distortion = kmeans( color_df[['scaled_r', 'scaled_g', 'scaled_b']], n + 1) ###### Assign labels ###### labels, distortions = vq( color_df[['scaled_r', 'scaled_g', 'scaled_b']], cluster_centers) color_df['label'] = labels color_df['distortion'] = distortions ###### Build palette ###### # These parameter affects visual style only and can be exposed to user later height = 200 width = 1000 gap_size = 5 palette = np.zeros((height, width, 3), np.uint8) # Count how many pixels falls under which category, let this decides the color's relative width in the palette cluster_proportion = color_df['label'].value_counts().sort_index( ) / len(color_df) cluster_width_list = (cluster_proportion * width).to_list() cluster_width_list = [ int(x) for x in saferound(cluster_width_list, places=0) ] # Reorder clusters and widths 
according to the proportion, largest to smallest reordered_cluster_df = pd.DataFrame( zip(cluster_centers, cluster_width_list), columns=['cluster', 'width']).sort_values('width', ascending=False) cluster_centers = reordered_cluster_df['cluster'].tolist() cluster_width_list = reordered_cluster_df['width'].tolist() # Storing information cluster_centers_list.append(cluster_centers) avg_distortion_list.append(avg_distortion) if plot_bar: # Coloring the palette canvas based on color and width endpoints = list(np.cumsum(cluster_width_list)) startpoints = [0] + endpoints[:-1] for cluster_index in range(len(cluster_centers)): # Notice here we apply the reverse_whiten_array to get meaningful RGB colors palette[:, startpoints[cluster_index] + gap_size: endpoints[cluster_index], :] = cluster_centers[ cluster_index] * reverse_whiten_array palette[:, startpoints[cluster_index]:startpoints[cluster_index] + gap_size, :] = (255, 255, 255) # Displaying the palette when performing K-means with parameter n if n_cluster != None: ax.imshow(palette) ax.axis('off') else: ax[n].imshow(palette) ax[n].axis('off') if plot_bar: ### Show the entire palette f.tight_layout() plt.show() ### Show the elbow plot for choosing best n_cluster parameter for K-means fig = plt.figure() plt.scatter(x=n_cluster_range, y=avg_distortion_list) fig.suptitle('Elbow Plot for K-means') plt.xlabel('Number of Clusters') plt.ylabel('Average Distortion') print() if return_colors: if n_cluster != None: return (cluster_centers_list[0] * reverse_whiten_array).astype( np.uint8) else: return [(cluster_centers * reverse_whiten_array).astype(np.uint8) for cluster_centers in cluster_centers_list]
def clusterSounds(targetDir, nCluster=-1, descInput=[]): """ This function clusters all the sounds in targetDir using kmeans clustering. Input: targetDir (string): Directory where sound descriptors are stored (all the sounds in this directory will be used for clustering) nCluster (int): Number of clusters to be used for kmeans clustering. descInput (list) : List of indices of the descriptors to be used for similarity/distance computation (see descriptorMapping) Output: Prints the class of each cluster (computed by a majority vote), number of sounds in each cluster and information (sound-id, sound-class and classification decision) of the sounds in each cluster. Optionally, you can uncomment the return statement to return the same data. """ dataDetails = fetchDataDetails(targetDir) ftrArr = [] infoArr = [] if nCluster == -1: nCluster = len(dataDetails.keys()) for cname in dataDetails.keys(): #iterating over sounds for sname in dataDetails[cname].keys(): ftrArr.append( convFtrDict2List( dataDetails[cname][sname]['feature'])[descInput]) infoArr.append([sname, cname]) ftrArr = np.array(ftrArr) infoArr = np.array(infoArr) ftrArrWhite = whiten(ftrArr) centroids, distortion = kmeans(ftrArrWhite, nCluster) clusResults = -1 * np.ones(ftrArrWhite.shape[0]) for ii in range(ftrArrWhite.shape[0]): diff = centroids - ftrArrWhite[ii, :] diff = np.sum(np.power(diff, 2), axis=1) indMin = np.argmin(diff) clusResults[ii] = indMin ClusterOut = [] classCluster = [] globalDecisions = [] for ii in range(nCluster): ind = np.where(clusResults == ii)[0] freqCnt = [] for elem in infoArr[ind, 1]: freqCnt.append(infoArr[ind, 1].tolist().count(elem)) indMax = np.argmax(freqCnt) classCluster.append(infoArr[ind, 1][indMax]) print( "\n(Cluster: " + str(ii) + ") Using majority voting as a criterion this cluster belongs to " + "class: " + classCluster[-1]) print("Number of sounds in this cluster are: " + str(len(ind))) decisions = [] for jj in ind: if infoArr[jj, 1] == classCluster[-1]: decisions.append(1) else: decisions.append(0) globalDecisions.extend(decisions) print("sound-id, sound-class, classification decision") ClusterOut.append(np.hstack((infoArr[ind], np.array([decisions]).T))) print(ClusterOut[-1]) globalDecisions = np.array(globalDecisions) totalSounds = len(globalDecisions) nIncorrectClassified = len(np.where(globalDecisions == 0)[0]) print( "Out of %d sounds, %d sounds are incorrectly classified considering that one cluster should " "ideally contain sounds from only a single class" % (totalSounds, nIncorrectClassified)) print( "You obtain a classification (based on obtained clusters and majority voting) accuracy " "of %.2f percentage" % round( float(100.0 * float(totalSounds - nIncorrectClassified) / totalSounds), 2))
k = 64 #24 for ko in range(numpat): kxOn = conv.kxPos(ko, nx, ny, nf) kyOn = conv.kyPos(ko, nx, ny, nf) p = w.next_patch() if marginstart < kxOn < marginend: if marginstart < kyOn < marginend: acount = acount + 1 if kxOn == margin + 1 and kyOn == margin + 1: d = p else: d = np.vstack((d,p)) wd = sp.whiten(d) result = sp.kmeans2(wd, k) cluster = result[1] k2 = k / 2 nx_im = 2 * (nxp + space) + space ny_im = k2 * (nyp + space) + space im = np.zeros((nx_im, ny_im)) im[:,:] = (w.max - w.min) / 2. nx_im2 = nx * (nxp + space) + space ny_im2 = ny * (nyp + space) + space
from scipy.cluster.vq import kmeans, vq, whiten
from numpy import vstack, array
from numpy.random import rand

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
print(data)

# whitening of data
data = whiten(data)
print(data)

# computing K-Means with K = 3 (3 clusters)
print("-------------computing K-Means with K = 3 (3 clusters)--------------")
centroids, _ = kmeans(data, 3)
print(centroids)

# assign each sample to a cluster
clx, _ = vq(data, centroids)

# check clusters of observation
print(clx)
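# Follow-up sketch (an addition, not part of the original example): centroids returned
# by kmeans on whitened data live in the scaled space. Multiplying them by the
# per-feature standard deviation of the original data recovers centroids in the
# original units (the same idea as the reverse_whiten_array used elsewhere in these
# examples). Uses the imports from the snippet above.
raw = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
std_devs = raw.std(axis=0)                    # the factors whiten divides by
centroids_scaled, _ = kmeans(whiten(raw), 3)
centroids_original = centroids_scaled * std_devs
print(centroids_original)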
print('Processed ' + str(fileCount) + ' files') print('Preprocessing...') # preprocess data allWords = list() for i in range(0, len(docs)): for j in range(0, len(docs[i])): docs[i][j] = wmd.preproc(docs[i][j]) allWords += docs[i][j] dict = Dictionary(documents=[allWords]) # create nbow vectors print('Computing vectors...') vectors = list() for i in range(0, len(docs)): for j in range(0, len(docs[i])): vector = whiten(KeyedVectors.nbow( docs[i][j], dict)) # whitening increases accuracy # vector = KeyedVectors.nbow(docs[i][j], dict) vector = np.append(vector, i) # add class column vectors.append(vector) df = pd.DataFrame(vectors) # print('DATA FRAME:\n' + str(df)) # split data to train and test sets random_indices = permutation(df.index) test_cutoff = math.floor(len(df) / 5) # 80:20 ratio test_set = df.loc[random_indices[1:test_cutoff]] train_set = df.loc[random_indices[test_cutoff:]] # choose x and y sets (x - vectors columns; y - class) x_columns = list(range(0, len(vector) - 1))
def KMEANS(self): # clusters K = 3 data_arr = [] meal_name_arr = [] with open('./NewDataSet/Cluster_dataset/clusterisbnrate.csv', 'rb') as f: reader = csv.reader(f) for row in reader: if reader.line_num != 1: '''for x in row[2:]: print x''' data_arr.append([float(x) for x in row[1:]]) meal_name_arr.append([row[0]]) data = vstack(data_arr) print "data :" print data meal_name = vstack(meal_name_arr) # normalization data = whiten(data) # computing K-Means with K (clusters) centroids, distortion = kmeans(data, 3) print "distortion = " + str(distortion) # assign each sample to a cluster cntr = [] print("Centroids:") print centroids cntr = centroids print("Cntr :") print cntr print "---------------------------------------------------------" print("Centroids after sort:") #centroids=cntr.sort() #print centroids.sort() print "---------------------------------------------------------" idx, _ = vq(data, centroids) print "idx:" print idx print "-----------------------------------------------------------" '''# some plotting using numpy's logical indexing plot(data[idx==0,0], data[idx==0,1],'ob', data[idx==1,0], data[idx==1,1],'or', data[idx==2,0], data[idx==2,1],'og')''' print meal_name print data for i in range(K): print centroids[i] * 3 #print round(centroids[i]) print "max value:" max1 = max(centroids) print "min value:" min1 = min(centroids) toprated = [] lowrated = [] medrated = [] for i in range(K): result_names = meal_name[idx == i, 0] print "=================================" vv = round(centroids[i]) print vv name = "" print "Cluster " + str(i + 1) for name1 in result_names: name = name1 print name1 '''if(i== 0) : f1.write(name) elif (i==1): f2.write(name) elif (i==2): f3.write(name)''' if (centroids[i] == max1): #for name1 in result_names: toprated.append(name) name = name1 + "\n" f1.write(name) elif (centroids[i] == min1): lowrated.append(name) name = name1 + "\n" f3.write(name) else: medrated.append(name) name = name1 + "\n" f2.write(name) print "--------------------------------------------------------------------------" print "toprated:" print toprated print "--------------------------------------------------------------------------" print "medrated:" print medrated print "--------------------------------------------------------------------------" print "lowrated:" print lowrated print "--------------------------------------------------------------------------" '''plot(centroids[:,0], centroids[:,1], 'sg',markersize=8)''' show()
xycoords='data', arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.5', color='k', alpha=0.8)) plt.show() # In[4]: # N is the number of observations to group into k clusters N = len(coordinates) # normalize the coordinate data with the whiten function # each feature is divided by its standard deviation across all observations to give it unit variance. w = whiten(coordinates) # k is the number of clusters to form k = 100 # i is the number of iterations to perform i = 50 # In[5]: # performs k-means on a set of observation vectors forming k clusters # returns a k-length array of cluster centroid coordinates, and the final distortion cluster_centroids1, distortion = kmeans(w, k, iter=i) # plot the cluster centroids plt.figure(figsize=(10, 6), dpi=100)
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer converts the words in the texts into a term-frequency matrix;
# element a[i][j] is the frequency of word j in document i
vectorizer = CountVectorizer(min_df=20, stop_words='english')
# TfidfTransformer computes the tf-idf weight of every word
transformer = TfidfTransformer()
# the outer fit_transform computes tf-idf, the inner one builds the term-frequency matrix
tfidf = transformer.fit_transform(vectorizer.fit_transform(documents_words))
# get all words in the bag-of-words model
word = vectorizer.get_feature_names()
# extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
features = tfidf.toarray()

target = [c for (d, c) in documents]

data = whiten(features)
centroids, _ = kmeans(data, 2)
idx, _ = vq(data, centroids)

target1 = [1 if x == 'pos' else 0 for x in target]
a = sum(target1 == idx) / len(target1)
print('scipy_eu=', max(a, 1 - a))

from nltk.cluster import KMeansClusterer, cosine_distance
clus = KMeansClusterer(2, cosine_distance)
results = clus.cluster(data, True, trace=False)
a = sum(np.array(target1) == results) / len(target1)
print('nltk_cosdis=', max(a, 1 - a))

from Bio.Cluster import kcluster
test_data = pd.read_csv("data/test.csv") data = pd.concat((train_data, test_data), axis=0, ignore_index=True) data.drop(['ID', 'v22'], axis=1, inplace=True) data.fillna(0, inplace=True) catagorical_features = [] numeric_features = [] for f in data.columns: if data[f].dtype == 'object': catagorical_features.append(f) else: numeric_features.append(f) data_num = whiten(data[numeric_features]) data_cat = pd.get_dummies(data[catagorical_features], columns=catagorical_features) trlen = train_data.shape[0] train = np.hstack((data_num[:trlen], data_cat[:trlen])) test = np.hstack((data_num[trlen:], data_cat[trlen:])) labels = label_data.astype(int) xtrain, xtest, ytrain, ytest = train_test_split(train, labels, train_size=0.7) model = skflow.TensorFlowDNNClassifier(hidden_units=[128, 128, 128], learning_rate=0.01, n_classes=2, batch_size=128, steps=10000)
# Make sure we have the primefac-fork try: import primefac # pylint: disable=W0611 # NOQA except ImportError: raise ImportError('Need to install fork of primefac: ' 'https://github.com/elliptic-shiho/' 'primefac-fork') # Radially sampled Shepp-Logan N, spokes, nc = 288, 72, 8 kx, ky = radial(N, spokes) kx = np.reshape(kx, (N, spokes), 'F').flatten() ky = np.reshape(ky, (N, spokes), 'F').flatten() k = kspace_shepp_logan(kx, ky, ncoil=nc) k = whiten(k) # whitening seems to help conditioning of Gx, Gy # Put in correct shape for radialgrappaop k = np.reshape(k, (N, spokes, nc)) kx = np.reshape(kx, (N, spokes)) ky = np.reshape(ky, (N, spokes)) # Get the GRAPPA operators! t0 = time() Gx, Gy = radialgrappaop(kx, ky, k) print('Gx, Gy computed in %g seconds' % (time() - t0)) # Put in correct order for GROG kx = kx.flatten() ky = ky.flatten() k = np.reshape(k, (-1, nc))
def train_categorical_feature(feature_input, outcome, limit, number_of_clusters): input = feature_input.values if len(pd.unique(input)) == 2: vocabulary = np.unique(input) p = np.array([0, 1]) d = np.zeros(len(input), dtype=np.int) d[input == vocabulary[1]] = 1 output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) print output return output vocabulary_t = pd.unique(input) count_1 = np.zeros(len(vocabulary_t), dtype=int) count_0 = np.copy(count_1) outcome_1 = outcome.values == 1 outcome_0 = outcome.values == 0 for index, item in enumerate(vocabulary_t): if pd.notnull(item): count_1[index] = sum((input == item) * (outcome_1)) count_0[index] = sum((input == item) * (outcome_0)) else: count_1[index] = sum(pd.isnull(input) * (outcome_1)) count_0[index] = sum(pd.isnull(input) * (outcome_0)) condition = (count_0 + count_1) >= limit condition[pd.isnull(vocabulary_t)] = True # n = sum(condition) # vocabulary = np.zeros(n, dtype = str) # p = np.zeros(n) def log_ratio(count_1, count_0): if count_1 == 0: return log(1 / (2 * float(count_0))) elif count_0 == 0: return log(2 * count_1) else: return log(count_1 / float(count_0)) v_log_ratio = np.vectorize(log_ratio) vocabulary = vocabulary_t[condition] p = v_log_ratio(count_1[condition], count_0[condition]) # index = 0 # for i in range(len(vocabulary_t)): # if (condition[i]): # vocabulary[index] = str(vocabulary_t[index]) # p[index] = log_ratio(count_1[index], count_0[index]) # index = index + 1 # if (count_1[index] == 0): # p[index] = log(1./(2*count_0[index])) # elif (count_0[index] == 0): # p[index] = log(2*count_1[index]) # else: # p[index] = log(count_1[index]./count_0[index]) # print "sum(condition == 0) is {0}".format(sum(condition == 0)) if sum(condition == 0) <= 1: if sum(condition == 0) == 1: p = np.append( p, log_ratio(count_1[condition == 0][0], count_0[condition == 0][0])) # if (count_1[condition == 0][0] == 0): # p[condition == 0] = log(1./(2*count_0[condition == 0][0])) # elif (count_0[condition == 0] == 0): # p[condition == 0] = log(2*count_1[condition == 0][0]) # else: # p[condition == 0] = log(count_1[condition == 0][0]./count_0[condtion == 0][0]) vocabulary = np.append(vocabulary, vocabulary_t[condition == 0]) else: # print "number of clusters {0}".format(number_of_clusters) cl = min(number_of_clusters, sum(condition == 0) - 1) # why is it -1 here? 
# cl_vocabulary = pd.DataFrame() # print "cl {0}".format(cl) residual_1 = count_1[condition == 0] residual_0 = count_0[condition == 0] # print "length of the residual_1 {0}".format(len(residual_1)) # s = np.zeros(len(residual_1)) s = v_log_ratio(residual_1, residual_0).reshape([len(residual_1), 1]) whitened = whiten(s) codebook = kmeans(whitened, cl)[0] code = vq(whitened, codebook)[0] # print "length of code {0}".format(len(code)) s1 = pd.Series(data=vocabulary_t[condition == 0]) # .astype(str) s2 = pd.Series(data=code) cl_vocabulary = pd.DataFrame.from_dict({ "cat_feature_input": s1, "cluster_id": s2 }) #print cl_vocabulary.axes cl_p = np.zeros(cl, dtype=float) # print cl_p, len(cl_p) for i in range(cl): # print i c1 = residual_1[code == i] c0 = residual_0[code == i] cl_p[i] = log_ratio(sum(c1), sum(c0)) # print "Hey" d = np.zeros(len(input)) d[pd.isnull(input)] = p[pd.isnull(vocabulary)] for i in range(len(vocabulary)): d[input == vocabulary[i]] = p[i] vocabulary = vocabulary.astype(str) if 'cl_vocabulary' in locals(): print "cl_vocabulary in locals()" for i in range(len(cl_vocabulary)): d[input == cl_vocabulary.loc[i, "cat_feature_input"]] = cl_p[ cl_vocabulary.loc[i, "cluster_id"]] #print cl_vocabulary.axes cl_vocabulary.loc[:, "cat_feature_input"] = cl_vocabulary[ "cat_feature_input"].astype(str) # print cl_vocabulary["cat_feature_input"].apply(type) output = dict( zip(["d", "vocabulary", "cl_vocabulary", "p", "cl_p"], [d, vocabulary, cl_vocabulary, p, cl_p])) else: output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p])) #print output return output
line = segfile.readline() index = 0 while line != '': tokens = line.split(',') nums = map(float, tokens) nums = nums[2:len(line)] # Omit recid and segid if index == 0: segfeatures = nums else: segfeatures = np.vstack((segfeatures, nums)) line = segfile.readline() index += 1 segfeatures = whiten(segfeatures) kmeans1 = cluster.KMeans(n_clusters=k, init='k-means++', n_init=50, max_iter=300, random_state=rseed) kmeans2 = cluster.KMeans(n_clusters=kextra, init='k-means++', n_init=50, max_iter=300, random_state=rseed) clusters1 = kmeans1.fit_predict(segfeatures) clusters2 = kmeans2.fit_predict(segfeatures)
infiles = glob.glob('image_vectors/*.npz') # build ann index #t = AnnoyIndex(dims) for file_index, i in enumerate(infiles): file_vector = np.loadtxt(i) file_name = os.path.basename(i).split('.')[0] file_index_to_file_name[file_index] = file_name file_index_to_file_vector[file_index] = file_vector #whitened = whiten(file_vector) #t.add_item(file_index, file_vector) #t.build(trees) whitened = whiten(features) codes = 3 result = kmeans(whitened, codes) ''' # create a nearest neighbors json file for each input if not os.path.exists('nearest_neighbors'): os.makedirs('nearest_neighbors') for i in file_index_to_file_name.keys(): master_file_name = file_index_to_file_name[i] master_vector = file_index_to_file_vector[i] named_nearest_neighbors = [] nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors) for j in nearest_neighbors:
def displayResult(): noOfCluster=0 #Get Radio button input to check user choice chart = request.form['radio'] #If user choice is cluster if chart == 'cluster': noOfCluster =long(request.form['cluster']) data_arr = [] meal_name_arr = [] #Url of data csv url='https://storage.googleapis.com/cloudbucket786/imptry4.csv' response=urllib2.urlopen(url) reader = csv.reader(response) for row in reader: if row[5] is None: row[5]=0 if row[5]=='': row[5]=0 if "," in row[6] : rowVal=row[6].split(",") row[6]=rowVal[0]+''+rowVal[1] row[6]=float(row[6]) if row[6]=='': row[6]=0 if row[6]=='N' : row[6]=0 if "," in row[7] : rowVal=row[7].split(",") row[7]=rowVal[0]+''+rowVal[1] row[7]=float(row[6]) if row[7]=='': row[7]=0 if row[7]=='N' : row[7]=0 data_arr.append([float(x) for x in row[5:]])#adding data to data_array meal_name_arr.append([row[0]])#adding ids to second array #print data_arr fig = plt.figure() ax = fig.add_subplot(111, projection='3d')#We are using 3D projection as we are plotting 3D data data = vstack( data_arr ) meal_name = vstack(meal_name_arr) # normalization data = whiten(data)#Before running k-means, it is beneficial to rescale each feature dimension of the observation set with whitening. #Each feature is divided by its standard deviation across all observations to give it unit variance. # computing K-Means with K (clusters) centroids, distortion = kmeans(data,noOfCluster) # assign each sample to a cluster idx,_ = vq(data,centroids) # some plotting using numpy's logical indexing listOfColor=['ob','or','og','oc','om','ok','oy'] for index in range(noOfCluster): plot(data[idx==index,0], data[idx==index,1],data[idx==index,2],listOfColor[index])# using 3 objects for 3D projection for index in range(noOfCluster): result_names = meal_name[idx==index, 0] print "=================================" print "Cluster " + str(index+1) for name in result_names: print name plot(centroids[:,0], centroids[:,1], centroids[:,2], 'oy',markersize=8) #saving file to temp image #Assigning labels to axis ax.set_xlabel('X Label') ax.set_ylabel('Y Label') ax.set_zlabel('Z Label') pylab.savefig('temp.jpg') pylab.clf() image="https://www.pythonanywhere.com/user/abhitej/files/home/abhitej/temp.jpg" #Overwrites the image on pythonanywhere.com return render_template('home.html',image=image,display='display:block;') else: list=[] words=request.form['words'] list=words.split(",") list1=[] for s in list: list1.append(s.encode('ascii','ignore')) return render_template('home.html',list1=list1,display='display:none;')# Assigning display none for cluster if user chooce wordcloud
num_bins = len(use_features) * 11 all_data_orig = np.hstack( [all_data_orig_master[0, :, 0:3]] + [all_data_orig_master[AS_i, :, 3:] for AS_i in use_features]) # classifiers NN_classify = np.zeros(2) # 1st col mice | 2nd MDs LR_classify = np.zeros(2) GNB_classify = np.zeros(2) RF_classify = np.zeros(2) data = all_data_orig[:, 3:] labels = all_data_orig[:, 0:3] data = whiten(data) # "Z-score" train, labels_train, test, labels_test = split_data_in_half_randomly( data, labels) mice_train = day_to_mouse_average(train, labels_train) mice_test = day_to_mouse_average(test, labels_test) # NN vanilla classification strain_centers = mouse_to_strain_average(mice_train[:, 2:], mice_train[:, 0:2]) tot_cor = 0 for cnt, ms in enumerate(mice_test): min_dist = np.inf for k in range(strain_centers.shape[0]): distance = np.sqrt(((strain_centers[k] - ms[2:])**2).sum())
tokens = nltk.word_tokenize(ch_text.lower()) words = word_tokenizer.tokenize(ch_text.lower()) sentences = sentence_tokenizer.tokenize(ch_text) vocab = set(words) words_per_sentence = np.array( [len(word_tokenizer.tokenize(s)) for s in sentences]) fvs_lexical[e, 0] = words_per_sentence.mean() fvs_lexical[e, 1] = words_per_sentence.std() fvs_lexical[e, 2] = len(vocab) / float(len(words)) fvs_punct[e, 0] = tokens.count(',') / float(len(sentences)) fvs_punct[e, 1] = tokens.count(';') / float(len(sentences)) fvs_punct[e, 2] = tokens.count(':') / float(len(sentences)) fvs_lexical = whiten(fvs_lexical) fvs_punct = whiten(fvs_punct) NUM_TOP_WORDS = 10 all_tokens = nltk.word_tokenize(all_text) fdist = nltk.FreqDist(all_tokens) vocab = fdist.keys()[:NUM_TOP_WORDS] vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize) fvs_bow = vectorizer.fit_transform(chapters).toarray().astype(np.float64) fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)] def token_to_pos(ch): tokens = nltk.word_tokenize(ch) return [p[1] for p in nltk.pos_tag(tokens)]
def apply_kmeans(box_dict, k):
    # For every object class in box_dict, reduce the list of boxes to the
    # clustered boxes with k-means and return the new dictionary.
    kmeans_dict = dict()
    for obj_class in box_dict:
        print obj_class
        boxes = box_dict[obj_class]
        if len(boxes) > k:
            # write a representation for each proposal box as a vector
            def box_to_vec(pbox):
                # Metrics whose Euclidean distance we want k-means to operate on:
                # the centroid, the centroid weighted by 1/area (the centroid
                # matters less as box area increases), and the individual box
                # coordinates.  Keeping the raw coordinates in the vector lets us
                # recover the original box representation after clustering.  The
                # score is also included, scaled down because similar scores are
                # not by themselves a good reason to merge boxes.
                metrics = [
                    pbox.centroid()[0], pbox.centroid()[1],
                    pbox.centroid()[0] / pbox.area(),
                    pbox.centroid()[1] / pbox.area(),
                    pbox.x1, pbox.y1, pbox.x2, pbox.y2,
                    0.00001 * pbox.score
                ]
                return metrics

            # append the columns together and then take the transpose
            # so that each row is a box with n features (here n = 9)
            first_col = box_to_vec(boxes[0])
            # keep the first box's raw values for rescaling later
            oldx1, oldy1, oldx2, oldy2, oldscore = \
                first_col[4], first_col[5], first_col[6], first_col[7], first_col[8]
            box_mat = np.array(first_col).T
            for i in range(1, len(boxes)):
                new_col = np.array(box_to_vec(boxes[i])).T
                box_mat = np.c_[box_mat, new_col]
            box_mat = box_mat.T
            box_mat = box_mat.astype('float')

            # whiten: each feature is divided by its standard deviation
            box_mat = whiten(box_mat)

            # the coordinates must be rescaled when boxes are recovered from the
            # whitened representation vectors
            if len(np.shape(box_mat)) > 1:
                newx1, newy1, newx2, newy2, newscore = \
                    box_mat[0][4], box_mat[0][5], box_mat[0][6], box_mat[0][7], box_mat[0][8]
            else:
                newx1, newy1, newx2, newy2, newscore = \
                    box_mat[4], box_mat[5], box_mat[6], box_mat[7], box_mat[8]
            scalex1 = oldx1 / (0. + newx1)
            scaley1 = oldy1 / (0. + newy1)
            scalex2 = oldx2 / (0. + newx2)
            scaley2 = oldy2 / (0. + newy2)
            scalescore = oldscore / (0. + newscore)

            # use k-means
            codebook, distortion = kmeans(box_mat, k)
            centroid_boxes = []
            for i in range(np.shape(codebook)[0]):
                # indices 4 onwards are (pbox.x1, pbox.y1, pbox.x2, pbox.y2, pbox.score);
                # this is the direct inverse of box_to_vec, multiplying each
                # coordinate back by its scale factor (the feature's standard deviation).
                thebox = box(scalex1 * codebook[i][4], scaley1 * codebook[i][5],
                             scalex2 * codebook[i][6], scaley2 * codebook[i][7],
                             scalescore * codebook[i][8])
                centroid_boxes.append(thebox)
            print "# of centroids: " + str(len(centroid_boxes))
            print centroid_boxes[0]
            print centroid_boxes[1]
            print centroid_boxes[2]
            kmeans_dict[obj_class] = centroid_boxes
        else:
            kmeans_dict[obj_class] = box_dict[obj_class]
        print "==================================="
    return kmeans_dict
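# Illustrative aside (not part of apply_kmeans above): since whiten() divides
# each column by its standard deviation, an alternative to the ratio-based
# rescaling used above is to keep the per-feature stds and multiply the
# codebook by them to return to the original units.  `box_mat_demo` is a
# made-up feature matrix.
import numpy as np
from scipy.cluster.vq import whiten, kmeans

box_mat_demo = np.random.rand(50, 9) * 100.0
stds = box_mat_demo.std(axis=0)               # exactly what whiten() divides by
codebook, _ = kmeans(whiten(box_mat_demo), 3)
codebook_original_units = codebook * stds     # undo the whitening per feature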
def limb_track():
    global frame_n
    cv.namedWindow("Dots")
    fps = 30
    frame_dt = 0  # 1.0 / fps
    mv_i = 0
    pause = False
    while True:
        print("Frame:", mv_i)
        if frame_n >= contour_data.shape[0]:
            # mv_i = 0
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        t = time.clock()
        ret, im = cap.read()
        for x, y in fs:
            cv.circle(im, (x, y), 2, (255, 0, 0), -1)
        n = n_contours[mv_i]
        if (n > 0):
            c_points = contour_data[mv_i, :n]
            limb_distances = np.empty((num_limbs, n))
            for i in range(num_limbs):
                limb_x, limb_y = fs[i]
                for j in range(n):
                    x, y = c_points[j]
                    dx = limb_x - x
                    dy = limb_y - y
                    distance = dx * dx + dy * dy
                    limb_distances[i, j] = distance
                limb_distances[i] = np.sort(limb_distances[i])
            threshold = 1500
            needed_limbs = np.where(limb_distances[:, 0] < threshold)[0]
            whitened = whiten(c_points)
            x_scale = c_points[0, 0] / whitened[0, 0]
            y_scale = c_points[0, 1] / whitened[0, 1]
            if (needed_limbs.shape[0] > 0):
                max_k = 6
                costs = np.empty(max_k - needed_limbs.shape[0])
                all_kmean_points = []
                for k in range(needed_limbs.shape[0], max_k):
                    points, distortion = kmeans(whitened, k)
                    points[:, 0] *= x_scale
                    points[:, 1] *= y_scale
                    points = points.astype('int32')
                    all_kmean_points.append(points)
                    costs[k - needed_limbs.shape[0]] = cost(points, needed_limbs)
                best_ind = np.argmin(costs)
                best_points = all_kmean_points[best_ind]
                for i, (x, y) in enumerate(best_points):
                    cv.circle(im, (x, y), 2, (0, 0, 255), -1)
                distances = np.empty((needed_limbs.shape[0], best_points.shape[0]))
                indices = np.empty((needed_limbs.shape[0], best_points.shape[0], 2),
                                   dtype='uint8')
                for i in range(needed_limbs.shape[0]):
                    limb_x, limb_y = fs[needed_limbs[i]]
                    for j in range(best_points.shape[0]):
                        x, y = best_points[j]
                        dx = x - limb_x
                        dy = y - limb_y
                        distance = dx * dx + dy * dy
                        distances[i, j] = distance
                        indices[i, j, 0] = needed_limbs[i]
                        indices[i, j, 1] = j
                for i in range(needed_limbs.shape[0]):
                    i, j = np.unravel_index(np.nanargmin(distances), distances.shape)
                    limb_ind = indices[i, j, 0]
                    point_ind = indices[i, j, 1]
                    new_limb_pos = (best_points[point_ind, 0], best_points[point_ind, 1])
                    cv.line(im, fs[limb_ind], new_limb_pos, (255, 255, 255), 1)
                    fs[limb_ind] = new_limb_pos
                    distances[i] = np.NaN
                    distances[:, j] = np.NaN
        for i in range(num_limbs):
            name = names[i]
            x, y = fs[i]
            write_dict[name][mv_i, 0] = x
            write_dict[name][mv_i, 1] = y
        cv.putText(im, str(frame_n), (5, 25), cv.FONT_HERSHEY_SIMPLEX, 1.0,
                   (255, 255, 255))
        cv.imshow("Dots", im)
        if pause:
            k = cv.waitKey(0)
        else:
            dt = frame_dt - (time.clock() - t)
            dt_mili = int(dt * 1000)
            if (dt_mili < 1):
                dt_mili = 1
            k = cv.waitKey(dt_mili)
        mv_i += 1
        frame_n += 1
        if k == 27:  # esc key
            print("Frames completed:", frame_n)
            f_write.save(write_dict)
            break
        elif k == 32:  # space key
            pause = not (pause)
        elif k == 63235 and pause:  # right arrow
            mv_i += 1
            frame_n += 1
            print(stds[frame_n])
        elif k == 63234 and pause:  # left arrow
            mv_i -= 1
            frame_n -= 1
            print(stds[frame_n])
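# Rough standalone sketch (made-up data): limb_track() above picks k by
# minimising a custom cost over a small range of candidate values.  The same
# pattern using scipy's own distortion (mean distance of samples to their
# nearest centroid) looks like this:
import numpy as np
from scipy.cluster.vq import whiten, kmeans

points = np.random.rand(200, 2) * 640.0        # fake contour points
w = whiten(points)
candidates = range(2, 6)
distortions = [kmeans(w, k)[1] for k in candidates]
# distortion generally shrinks as k grows, so in practice one looks for an
# "elbow" in this curve (or uses a task-specific cost, as limb_track() does)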
from numpy import vstack, array
from numpy.random import rand
# from scipy.cluster.vq import whiten
import scipy.cluster.vq as vec

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))

# whitening of data
data = vec.whiten(data)

# computing K-Means with K = 3 clusters
centroids, _ = vec.kmeans(data, 3)

# assign each sample to a cluster
clx, _ = vec.vq(data, centroids)

print(data)
print(centroids)
print(clx)
def doKMeans(filename): ''' with open(filename) as data_file: data = json.load(data_file) ''' data = json.loads(filename) #print(data["USERS"]) #data = filename #print(data["USERS"]) vertex_matrix = [] UID = [] print(data) for user in data["USERS"]: arr = [] for VID, V_ANS in user["V_ANS"]["iphone6s"].items(): arr.append(V_ANS) vertex_matrix.append(arr) UID.append(user["UID"]) #print(arr) if (len(vertex_matrix) == 1): vertex_matrix.append(vertex_matrix[0]) print(vertex_matrix) whitened = whiten(vertex_matrix) k = math.floor(math.sqrt(len(vertex_matrix))) cluster = kmeans2(whitened, k, 99, 'points') print cluster centroid = [] sorted_centroid = [] sorted_vertex = [] i = 0 for cen in cluster[0]: x = sum(cen) j = 0 centroid.append(dict()) sorted_vertex.append(list()) sorted_centroid.append(list()) for y in cen: j += 1 centroid[i][str(j)] = y / x sorted_centroid[i] = sorted(centroid[i].items(), key=operator.itemgetter(1)) for c_list in sorted_centroid[i]: sorted_vertex[i].append(c_list[0]) i += 1 #print arr print sorted_centroid print sorted_vertex circled_vertex = [] i = 0 for arr in sorted_vertex: circled_vertex.append(list()) j = 0 for v in arr: if (j % 2 == 0): circled_vertex[i] = circled_vertex[i] + [v] else: circled_vertex[i] = [v] + circled_vertex[i] j += 1 i += 1 print circled_vertex pivot = circled_vertex[0][0] for i in range(1, len(circled_vertex)): if (circled_vertex[i].index(pivot) != 0): circled_vertex[i] = circled_vertex[i][circled_vertex[i].index( pivot):] + circled_vertex[i][0:circled_vertex[i].index(pivot)] print circled_vertex final_vertex = [] for i in range(0, len(circled_vertex[0])): vote = dict() for j in range(0, len(circled_vertex[0])): vote[str(j + 1)] = 0 for j in range(0, len(circled_vertex)): try: vote[circled_vertex[j][i]] += 1 except: vote[circled_vertex[j][i]] = 1 sorted_vote = sorted(vote.items(), key=operator.itemgetter(1)) #sorted_vote = sorted_vote[::-1] for v in reversed(sorted_vote): if (v[0] not in final_vertex): final_vertex.append(v[0]) break print(final_vertex) sphere_vertex_weight = [] for v in final_vertex: sphere_vertex_weight.append(0) for c in cluster[1]: for v in final_vertex: sphere_vertex_weight[int(v) - 1] += cluster[0][c][int(v) - 1] total_weight = sum(sphere_vertex_weight) for i in range(0, len(sphere_vertex_weight)): sphere_vertex_weight[i] = sphere_vertex_weight[i] / total_weight x = sphere_vertex_weight[i] #sphere_vertex_weight[i] = math.pow(math.sine(0.5*3.14*x),(2/3)) if (math.isnan(sphere_vertex_weight[i])): sphere_vertex_weight[i] = -1 sphere_vertex_weight = normalize(sphere_vertex_weight) print(sphere_vertex_weight) planetList = [] for i in range(0, len(cluster[0])): planetList.append({"children": []}) planetList[i]["vertex_weight"] = [] planetList[i]["name"] = "" for v in final_vertex: if (math.isnan(centroid[i][v])): centroid[i][v] = -1 planetList[i]["vertex_weight"].append(centroid[i][v]) planetList[i]["vertex_weight"] = normalize( planetList[i]["vertex_weight"]) ''' for j in range(0,len(planetList[i]["vertex_weight"])): x = planetList[i]["vertex_weight"][j] planetList[i]["vertex_weight"][j] = math.pow(math.sine(0.5*3.14*x),(2/3)) ''' for i in range(0, len(cluster[1])): planetList[cluster[1][i]]["children"].append({"name": UID[i]}) #planetList[i]["users"].append({"userId": UID[cluster[1][j]]}) print(planetList) result = { "name": "sphereList", "vertex": final_vertex, "vertex_weight": sphere_vertex_weight, "children": planetList } print(json.dumps(result)) saveJson = {"userData": [], "userCluster": []} for i in range(0, 
len(cluster[1])): arr = [] #arr.append(UID[j]) for j in range(0, len(vertex_matrix[i])): arr.append(vertex_matrix[i][j]) #arr.append(cluster[1][i]) saveJson["userData"].append(arr) saveJson["userCluster"].append(cluster[1][i]) with open('data/userCluster.json', 'w') as data_file: data_file.write(json.dumps(saveJson)) return result
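# Quick standalone reference for the kmeans2 call used in doKMeans() above.
# scipy.cluster.vq.kmeans2(data, k, iter=10, thresh=1e-5, minit='random', ...)
# takes the initialisation method as its fifth parameter, so it is safest to
# pass it by keyword.  The data below are made up.
import numpy as np
from scipy.cluster.vq import whiten, kmeans2

answers = np.random.rand(12, 8)               # e.g. one row of vertex answers per user
whitened = whiten(answers)
centroids, labels = kmeans2(whitened, 3, iter=99, minit='points')
print(labels)                                 # cluster index for each row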
bart_time = time() - t0

# Check it out
plt.figure()
plt.imshow(sos(bart_imspace))
plt.title('BART NUFFT')
plt.xlabel('Recon: %g sec' % bart_time)
plt.show(block=False)

# The phantominator module also supports arbitrary kspace
# sampling for multiple coils:
kx, ky = radial(sx, spokes)
kx = np.reshape(kx, (sx, spokes), 'F').flatten()
ky = np.reshape(ky, (sx, spokes), 'F').flatten()
k = kspace_shepp_logan(kx, ky, ncoil=nc)
k = whiten(k)

# We will prefer a gridding approach to keep things simple.  The
# helper function gridder wraps scipy.interpolate.griddata():
t0 = time()
grid_imspace = gridder(kx, ky, k, sx, sx, os=os, method=method)
grid_time = time() - t0

# Take a gander
plt.figure()
plt.imshow(sos(grid_imspace))
plt.title('scipy.interpolate.griddata')
plt.xlabel('Recon: %g sec' % grid_time)
plt.show(block=False)

# We could also use GROG to grid
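# For reference, sos() above is assumed to be a sum-of-squares coil combine;
# a minimal sketch of such a helper (assuming the coil axis is last) would be:
import numpy as np

def sos_demo(imspace, coil_axis=-1):
    """Combine multi-coil image data into a single magnitude image."""
    return np.sqrt(np.sum(np.abs(imspace) ** 2, axis=coil_axis))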
def main(): # parse options from the command line parser = argparse.ArgumentParser( prog='PROG', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''\ ------------------------------------------------------------------------------------------------------------- This is a deep neural network architecture for training sparse filters. Example uses: $ python test.py $ python test.py -m GroupSF -v 1 -g 3 -s 1 $ python test.py -m ConvolutionalSF -d 16 1 8 8 -v 1 -w y -c y -f CIFAR_data.mat -i 100 $ python test.py -m ConvolutionalSF ConvolutionalSF -d 16 1 6 6 16 16 4 4 -w y -c y -f CIFAR_data.mat -i 100 150 -t y -v 1 ------------------------------------------------------------------------------------------------------------- ''') ) parser.add_argument("-m", "--model", default=['SparseFilter'], nargs='+', help="the model type") parser.add_argument("-c", "--convolution", default="n", help="convolution, yes or no") parser.add_argument("-f", "--filename", default="patches.mat", help="the data filename") parser.add_argument("-d", "--dimensions", type=int, nargs='+', default=([100, 256]), help="the dimensions of the model: [neurons, input size] or [neurons, length, width]") parser.add_argument("-p", "--pool", type=int, nargs='+', default=None, help="pooling dimensions") parser.add_argument("-g", "--group", type=int, default=None, help="group size") parser.add_argument("-s", "--step", type=int, default=None, help="step size") parser.add_argument("-l", "--learn_rate", type=float, default=.001, help="learning rate") parser.add_argument("-i", "--iterations", type=int, nargs='+', default=[100], help="number of iterations") parser.add_argument("-v", "--verbosity", type=int, default=0, help="verbosity: 0 no plot; 1 plots") parser.add_argument("-o", "--opt", default="GD", help="optimization method: GD or L-BFGS") parser.add_argument("-w", "--whitening", default='n', help="whitening: 'y' or 'n'") parser.add_argument("-t", "--test", default='n', help="test classification performance: 'y' or 'n'") parser.add_argument("-a", "--channels", type=int, default=1, help="number of channels in data") parser.add_argument("-e", "--examples", type=int, default=None, help="number of training examples") parser.add_argument("-b", "--batch_size", type=int, default=1000, help="number of examples in [mini]batch") parser.add_argument("-z", "--aws", default='n', help="run on aws: 'y' or 'n'") parser.add_argument("-r", "--random", default='n', help="type of batches: random = 'y'") args = parser.parse_args() args.dimensions = parse_dims(args) args.iterations = parse_iter(args) ''' =================================== Load in the data =================================== ''' # load in data print "loading data..." base_path = os.path.dirname(__file__) file_path = os.path.join(base_path, "data", args.filename) data = loadmat(file_path)['X'] # reshape and preprocess data print "pre-processing data ..." 
video = None if args.filename == 'patches_video.mat': video = data data = data.reshape(data.shape[0] * data.shape[1], data.shape[2]).T if args.convolution == 'n': if args.whitening == 'y': data -= data.mean(axis=0) data = whiten(data.T).T elif args.whitening == 'n' and args.channels == 1: data -= data.mean(axis=0) # elif args.whitening == 'n' and args.channels == 3: # data = np.float32(data) data = np.float32(data.T) elif args.convolution == 'y': if args.filename == 'kyotoData.mat': data = np.float32(data.reshape(-1, 1, int(np.sqrt(data.shape[1])), int(np.sqrt(data.shape[1])))) data = scaling.LCNinput(data, kernel_shape=9) elif args.filename == 'CIFAR_data.mat': data = np.float32(data.reshape(-1, 1, int(np.sqrt(data.shape[1])), int(np.sqrt(data.shape[1])))) data = scaling.LCNinput(data, kernel_shape=5) data = data[0:args.examples, :, :, :] elif args.filename == 'STL_10.mat' or args.filename == 'Lenna.mat': data = np.float32(data.reshape(-1, 3, int(np.sqrt(data.shape[1] / 3)), int(np.sqrt(data.shape[1] / 3)))) data = data[0:args.examples, :, :, :] args.channels = data.shape[1] for channel in range(args.channels): data[:, channel, :, :] = np.reshape(scaling.LCNinput(data[:, channel, :, :]. reshape((data.shape[0], 1, data.shape[2], data.shape[3])), kernel_shape=9), ( data.shape[0], data.shape[2], data.shape[3])) # assert that batch size is valid and get number of batches n_batches, rem = divmod(data.shape[0], args.batch_size) assert rem == 0 # other assertions assert len(args.model) == len(args.iterations) if args.model[0] == 'GroupSF' or args.model[0] == 'GroupConvolutionalSF': assert args.group is not None assert args.step is not None # assert that the number of neurons in each layer is a perfect square for layer in xrange(len(args.dimensions)): assert np.sqrt(args.dimensions[layer][0]) % np.floor(np.sqrt(args.dimensions[layer][0])) == 0 ''' ============================= Build and train the network ============================= ''' # construct the network print "building model..." model = sf.Network( model_type=args.model, weight_dims=args.dimensions, p=args.pool, group_size=args.group, step=args.step, lr=args.learn_rate, opt=args.opt, c=args.convolution, test=args.test, batch_size=args.batch_size, random=args.random, weights=None ) # TODO: custom learning rates for each layer # compile the training, output, and test functions for the network print "compiling theano functions..." train, outputs, test = model.training_functions(data) # train the sparse filtering network print "training network..." 
t = time.time() cost = {} weights = {} for l in xrange(model.n_layers): cost_layer = [] w = None # iterate over training epochs if args.opt == 'GD': for epoch in xrange(args.iterations[l]): # go though [mini]batches for batch_index in xrange(n_batches): c, w = train[l](index=batch_index) cost_layer.append(c) print("Layer %i cost at epoch %i and batch %i: %f" % (l + 1, epoch, batch_index, c)) elif args.opt == 'L-BFGS': w = minimize(train[l], model.layers[l].w.eval().flatten(), method='L-BFGS-B', jac=True, options={'maxiter': args.iterations[l], 'disp': True}) if args.convolution == 'n': w = w.x.reshape(args.dimensions[0][0], args.dimensions[0][1]) elif args.convolution == 'y': w = w.x.reshape(args.dimensions[0][0], args.dimensions[0][1], args.dimensions[0][2], args.dimensions[0][3]) # add layer cost and weights to the dictionaries cost['layer' + str(l)] = cost_layer weights['layer' + str(l)] = w # calculate and display elapsed training time elapsed = time.time() - t print('Elapsed training time: %f' % elapsed) # create sub-folder for saved model if args.aws == 'n': directory_format = "./saved/%4d-%02d-%02d_%02dh%02dm%02ds" directory_name = directory_format % time.localtime()[0:6] os.mkdir(directory_name) elif args.aws == 'y': import boto from boto.s3.key import Key s3 = boto.connect_s3() my_bucket = 'dlacombejr.bucket' bucket = s3.get_bucket(my_bucket) k = Key(bucket) directory_format = "./saved/%4d-%02d-%02d_%02dh%02dm%02ds" directory_name = directory_format % time.localtime()[0:6] os.mkdir(directory_name) # save the model for later use full_path = directory_name + '/model.pkl' pickle.dump(model, open(full_path, 'w'), pickle.HIGHEST_PROTOCOL) if args.aws == 'y': k.key = full_path k.set_contents_from_filename(full_path) os.remove(full_path) # save weights separately savemat(directory_name + '/weights.mat', weights) if args.aws == 'y': k.key = directory_name + '/weights.mat' k.set_contents_from_filename(directory_name + '/weights.mat') os.remove(directory_name + '/weights.mat') # save the cost functions savemat(directory_name + '/cost.mat', cost) if args.aws == 'y': k.key = directory_name + '/cost.mat' k.set_contents_from_filename(directory_name + '/cost.mat') os.remove(directory_name + '/cost.mat') # create log file log_file = open(directory_name + "/log.txt", "wb") # todo: create log file by looping through args # for arg in args: # log_file.write( # args. 
# ) for m in range(len(args.model)): log_file.write( "Model layer %d: \n model:%s \n dimensions:%4s \n iterations:%3d \n" % (m, args.model[m], args.dimensions[m], args.iterations[m]) ) if args.model == 'GroupSF' or args.model == 'GroupConvolutionalSF': log_file.write( " Groups: %d \n Step: %d" % (args.group, args.step) ) ex = data.shape[0] if args.examples is not None: ex = args.examples log_file.write( " Data-set: %s \n Examples: %6d \n Whitened: %s" % (args.filename, ex, args.whitening) ) log_file.write('\nElapsed training time: %f' % elapsed) log_file.close() if args.aws == 'y': k.key = directory_name + "/log.txt" k.set_contents_from_filename(directory_name + "/log.txt") os.remove(directory_name + "/log.txt") ''' =============================== Verbosity Options ===================================== ''' # get variables and saves if args.verbosity >= 1: # # get variables of interest # activations_norm = {} # activations_raw = {} # activations_shuffled = {} # reconstruction = {} # error_recon = {} # pooled = {} # for l in xrange(len(args.dimensions)): # activations_norm['layer' + str(l)] = {} # activations_raw['layer' + str(l)] = {} # activations_shuffled['layer' + str(l)] = {} # reconstruction['layer' + str(l)] = {} # error_recon['layer' + str(l)] = {} # pooled['layer' + str(l)] = {} for batch in xrange(n_batches): # get variables of interest activations_norm = {} activations_raw = {} activations_shuffled = {} reconstruction = {} error_recon = {} pooled = {} # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l]() begin = batch * args.batch_size end = begin + args.batch_size f_hat, rec, err, f_hat_shuffled, f, p = outputs[model.n_layers - 1](data[begin:end]) # activations_norm['layer' + str(l)]['batch' + str(batch)] = f_hat # activations_raw['layer' + str(l)]['batch' + str(batch)] = f # activations_shuffled['layer' + str(l)]['batch' + str(batch)] = f_hat_shuffled # reconstruction['layer' + str(l)]['batch' + str(batch)] = err # error_recon['layer' + str(l)]['batch' + str(batch)] = rec # pooled['layer' + str(l)]['batch' + str(batch)] = p # define [mini]batch title batch_title = 'layer' + str(l) + '_batch' + '%03d' % batch # define norm and raw file names norm_file_name = directory_name + '/activations_norm_' + batch_title + '.mat' raw_file_name = directory_name + '/activation_raw_' + batch_title + '.mat' activations_norm[batch_title] = f_hat activations_raw[batch_title] = f activations_shuffled[batch_title] = f_hat_shuffled reconstruction[batch_title] = err error_recon[batch_title] = rec pooled[batch_title] = p # save model as well as weights and activations separately savemat(norm_file_name, activations_norm) # savemat(raw_file_name, activations_raw) if args.aws == 'y': k.key = norm_file_name k.set_contents_from_filename(norm_file_name) os.remove(norm_file_name) # k.key = raw_file_name # k.set_contents_from_filename(raw_file_name) # os.remove(raw_file_name) # savemat(directory_name + '/weights.mat', weights) # if args.aws == 'y': # k.key = directory_name + '/weights.mat' # k.set_contents_from_filename(directory_name + '/weights.mat') # os.remove(directory_name + '/weights.mat') # # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l]() # f_hat, rec, err, f_hat_shuffled, f, p = outputs[l](data[0:args.batch_size]) # # activations_norm['layer' + str(l)] = f_hat # activations_raw['layer' + str(l)] = f # activations_shuffled['layer' + str(l)] = f_hat_shuffled # reconstruction['layer' + str(l)] = err # error_recon['layer' + str(l)] = rec # pooled['layer' + str(l)] = p # # # save model as well as 
weights and activations separately # savemat(directory_name + '/weights.mat', weights) # savemat(directory_name + '/activations_norm.mat', activations_norm) # savemat(directory_name + '/activation_raw.mat', activations_raw) # output helper file for concatenating activations helper = {'batches': n_batches, 'output_size': f_hat.shape} helper_file_name = directory_name + '/helper.mat' savemat(helper_file_name, helper) if args.aws == 'y': k.key = helper_file_name k.set_contents_from_filename(helper_file_name) os.remove(helper_file_name) # get data if not on AWS if args.aws == 'n': f_hat, rec, err, f_hat_shuffled, f, p = outputs[model.n_layers - 1](data) activations_norm = {"layer0": f_hat} # display figures if args.verbosity == 2: # if GD, plot the cost function over time if args.opt == 'GD': visualize.plotCost(cost) # visualize the receptive fields of the first layer visualize.drawplots(weights['layer0'].T, color='gray', convolution=args.convolution, pad=0, examples=None, channels=args.channels) # visualize the distribution of lifetime and population sparseness for l in xrange(len(args.dimensions)): layer = 'layer' + str(l) if args.convolution == 'n': visualize.dispSparseHist(activations_norm[layer], l) elif args.convolution == 'y': visualize.dispSparseHist(activations_shuffled[layer].reshape(args.dimensions[l][0], data.shape[0] * activations_shuffled[layer].shape[2] * activations_shuffled[layer].shape[3]), layer=l) # visualize the distribution of activity across the "cortical sheet" and reconstruction if args.filename == 'patches_video.mat': f_hat = activations_norm['layer0'].T.reshape(video.shape[0], video.shape[1], args.dimensions[0][0]) visualize.videoCortex(f_hat[0:100, :, :], 'y', args.convolution, 1) else: visualize.drawplots(activations_norm['layer0'], color='gray', convolution=args.convolution, pad=1, examples=100) # # visualize reconstruction capabilities # if args.convolution == 'n': # visualize.drawReconstruction(data[:, 0:100], error_recon['layer0'][:, 0:100], 'y', args.convolution, 1) # elif args.convolution == 'y': # visualize.convolutional_reconstruction(data[0, :, :, :], activations_raw['layer0'], weights['layer0'], # color='gray', convolution=args.convolution) # print('Reconstructed error: %e' % reconstruction['layer0']) # additional visualizations for convolutional network if args.convolution == 'y': dim = activations_raw['layer0'].shape[2] # visualize an example of a convolved image visualize.visualize_convolved_image(activations_raw['layer0'], dim=dim) # print activations_raw['layer0'] # visualize max-pooled activations and LCN output visualize.visualize_convolved_image(pooled['layer0'][0, :, :, :].reshape(1, pooled['layer0'].shape[1], pooled['layer0'].shape[2], pooled['layer0'].shape[3]), dim=dim / 2) # visualize an example of a LCNed convolved image after max pooling # temp = activations_raw['layer0'] #[0, :, :, :] temp = pooled['layer0'] #[0, :, :, :] # print temp.shape for i in range(temp.shape[1]): temp[0, i, :, :] = scaling.LCNinput(temp[0, i, :, :].reshape((1, 1, dim / 2, dim / 2)), kernel_shape=5) # temp = scaling.LCNinput(temp, kernel_shape=5) visualize.visualize_convolved_image(temp, dim=dim / 2) # print temp ''' ================================ Test the Model ======================================= ''' # test the model if evaluating classification performance if args.test == 'y': from sklearn import svm from sklearn.metrics import confusion_matrix train_labels = loadmat(file_path)['y'] file_path = os.path.join(base_path, "data", "CIFAR_test.mat") test_data = 
loadmat(file_path)['X'] test_labels = loadmat(file_path)['y'] # reshape and normalize the data if args.convolution == 'y': test_data = np.float32(test_data.reshape(-1, 1, int(np.sqrt(test_data.shape[1])), int(np.sqrt(test_data.shape[1])))) test_data = scaling.LCNinput(test_data, kernel_shape=5) test_data = test_data[0:args.examples, :, :, :] # get SVM test results for pixels to last layer train_input = None for layer in range(model.n_layers + 1): # pixel inputs if layer == 0: test_input = test_data.reshape(test_data.shape[0], test_data.shape[1] * test_data.shape[2] * test_data.shape[3]) train_input = data.reshape(data.shape[0], data.shape[1] * data.shape[2] * data.shape[3]) # hidden layers elif layer > 0: # get the output of the current layer in the model given the training / test data and then reshape # TODO: use raw output as training and testing data? test_input = test[layer - 1](test_data[0:args.batch_size]) test_input = test_input[0].reshape(test_input[0].shape[0], test_input[0].shape[1] * test_input[0].shape[2] * test_input[0].shape[3]) train_input = activations_norm['layer' + str(layer - 1)] train_input = train_input.reshape(train_input.shape[0], train_input.shape[1] * train_input.shape[2] * train_input.shape[3]) # train linear support vector machine clf = svm.SVC(kernel="linear").fit(train_input, np.ravel(train_labels[0:args.examples])) # get predictions from SVM and calculate accuracy predictions = clf.predict(test_input) accuracy = clf.score(test_input, test_labels[0:args.examples]) # display results and log them print("Accuracy of the classifier at layer %1d: %0.4f" % (layer, accuracy)) cm = confusion_matrix(test_labels[0:args.examples], predictions) log_file = open(directory_name + "/log.txt", "a") log_file.write( "\nAccuracy of the classifier at layer %1d: %0.4f" % (layer, accuracy) ) log_file.close() # visualize the confusion matrix if args.test == 'y' and args.verbosity == 2: import pylab as pl pl.imshow(cm, interpolation='nearest') pl.title('Confusion Matrix for Network') pl.colorbar() pl.ylabel('True Label') pl.xlabel('Predicted Label') pl.show()
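# A compact, standalone version of the evaluation pattern used above (toy data,
# hypothetical shapes): fit a linear SVM on one feature representation, then
# score it and build a confusion matrix on held-out examples.
import numpy as np
from sklearn import svm
from sklearn.metrics import confusion_matrix

train_x = np.random.rand(100, 64)             # e.g. flattened layer activations
train_y = np.random.randint(0, 10, 100)
test_x = np.random.rand(20, 64)
test_y = np.random.randint(0, 10, 20)

clf = svm.SVC(kernel="linear").fit(train_x, train_y)
accuracy = clf.score(test_x, test_y)
cm = confusion_matrix(test_y, clf.predict(test_x))
print("accuracy: %0.4f" % accuracy)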