def scikit_pca(model, rel_wds, plot_lims, title, cluster="kmeans"):
    """
    Given a word2vec model and a cluster (choice of "kmeans" or "spectral")
    Make a plot of all word-vectors in the model.
    """
    X, keys = make_data_matrix(model)

    for i, key in enumerate(keys):
        X[i,] = model[key]

    if cluster == "kmeans":
        k_means = KMeans(n_clusters=8)
        labels = k_means.fit_predict(X)
    elif cluster == "spectral":
        sp_clust = SpectralClustering()
        labels = sp_clust.fit_predict(X)

    # PCA
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X_transf = sklearn_pca.fit_transform(X_std)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)

    return sklearn_pca.explained_variance_ratio_
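# A hedged usage sketch (not part of the original snippet): it assumes a gensim
# KeyedVectors model, that make_data_matrix and scatter_plot come from the same
# module, and that rel_wds is a list whose items start with the word; the file
# name, word list, and plot limits below are hypothetical placeholders.
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # hypothetical file
rel_wds = [('king', 1), ('queen', 1), ('banana', 2)]   # hypothetical (word, group) pairs
plot_lims = None                                       # or whatever limits scatter_plot expects
var_ratio = scikit_pca(model, rel_wds, plot_lims,
                       title='PCA of word vectors', cluster='kmeans')
print(var_ratio)   # variance explained by the two principal components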
def add_kmeans_col(self, iter=1000, n_init=10, n=4):
    '''Add a new k_means cluster column to X data'''
    logging.info('Adding kmeans %d clusters to X' % (n))
    km = KMeans(n_clusters=n, max_iter=iter, n_init=n_init)
    km.fit(self.X[:, 1:])
    # XXX: This might not be kosher as it affects all of X
    self.models['km-col'] = km
    self.X = np.hstack((self.X, km.predict(self.X[:, 1:]).reshape(-1, 1)))
def get_modelKmeans():
    # Connect to a pre-existing cluster
    # connect to localhost:54321
    # Log.info("Importing benign.csv data...\n")
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    # benign_h2o.summary()

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        # Log.info("H2O K-Means")
        km_h2o = H2OKMeansEstimator(k=i)
        km_h2o.train(x=range(benign_h2o.ncol), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        print "scikit centers"
        print km_sci.cluster_centers_
def re_classify_dict():
    dict_file = open("_dictionary.pickle", "rb")
    sc_list = cPickle.load(dict_file)
    sc_list = np.concatenate(sc_list)
    Dh_dict = sc_list[:, 144:]
    Dl_dict = sc_list[:, :144]
    k_means = KMeans(n_clusters=15)
    k_means = k_means.fit(Dl_dict)
    y_predict = k_means.predict(Dl_dict)
    num = []
    y_tmp = np.asarray(y_predict, dtype=int) * 0 + 1
    for i in range(len(np.unique(y_predict))):
        num.append(np.sum(y_tmp[y_predict == i]))
    rand = np.asarray(num).argsort()  # cluster indices sorted by patch count, from fewest to most
    classified_hdict = []
    classified_patch = []
    for i in rand:
        predict_temp = y_predict == i
        classified_hdict.append(Dh_dict[predict_temp])
        print len(classified_hdict[-1])
    for i in range(9):
        x = i % 3
        y = i / 3
        # run one coefficient-coding test
        patch_show(classified_hdict[i+5][:100], [0.05+x*0.31, 0.05+y*0.31, 0.3, 0.3], i)
    plt.show()
def Kmeans_cluster_analysis(x,y,n_clusters): X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1)))) X = Scaler().fit_transform(X) km = KMeans(n_clusters) km.fit(X) labels = km.labels_ cluster_centers = km.cluster_centers_ labels_unique = set(labels) #np.unique(labels) n_clusters_ = len(labels_unique) #print("number of estimated clusters : %d" % n_clusters_) colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') #colors = pl.cm.Spectral(np.linspace(0, 1, len(labels_unique))) for i in xrange(len(labels_unique)): my_members = labels == i cluster_center = cluster_centers[i] plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7) plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i]) tolx = (X[:,0].max()-X[:,0].min())*0.03 toly = (X[:,1].max()-X[:,1].min())*0.03 plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx) plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly) plt.show() return labels
def csv_parser(fileName):
    data = open(fileName, 'rU').readlines()
    outfile = fileName[:-4] + '_kmeans.csv'
    fhout = open(outfile, 'w')
    header = data[0].strip() + ',Label' + '\n'
    fhout.write(header)
    vaf = []
    for line in data[1:]:
        flds = line.split(',')
        vaf.append([float(flds[7]), float(flds[8])])
    print vaf[:5]
    vaf_np = np.array(vaf)
    print len(vaf_np)
    print vaf_np[:5]
    kmeansModel = KMeans(n_clusters=6, init='k-means++', n_init=100, max_iter=3000)
    labels = kmeansModel.fit_predict(vaf_np)
    ## clustDist = model.transform(vaf_np)
    print labels[:30]
    for j in range(1, len(data)):
        outline = data[j].strip() + ',' + str(labels[j-1]) + '\n'
        fhout.write(outline)
    fhout.close()
def initialize_hypers(self, W): mu_0 = W.mean(axis=(0,1)) sigma_0 = np.diag(W.var(axis=(0,1))) # Set the global cov nu_0 = self._cov_model.nu_0 self._cov_model.sigma_0 = sigma_0 * (nu_0 - self.B - 1) # Set the mean for c1 in xrange(self.C): for c2 in xrange(self.C): self._gaussians[c1][c2].mu_0 = mu_0 self._gaussians[c1][c2].sigma = self._cov_model.sigma_0 self._gaussians[c1][c2].resample() if self.special_case_self_conns: W_self = W[np.arange(self.N), np.arange(self.N)] self._self_gaussian.mu_0 = W_self.mean(axis=0) self._self_gaussian.sigma_0 = np.diag(W_self.var(axis=0)) self._self_gaussian.resample() # Cluster the neurons based on their rows and columns from sklearn.cluster import KMeans features = np.hstack((W[:,:,0], W[:,:,0].T)) km = KMeans(n_clusters=self.C) km.fit(features) self.c = km.labels_.astype(np.int) print "Initial c: ", self.c
def update_clusters(): num_reviews = Review.objects.count() update_step = ((num_reviews/100)+1) * 5 if num_reviews % update_step == 0: # Create a sparse matrix from user reviews all_usernames = map(lambda x: x.username, User.objects.only("username")) all_wine_ids = set(map(lambda x: x.wine.id, Review.objects.only("wine"))) num_users = len(all_usernames) # m is often used to denote a matrix ratings_m = dok_matrix((num_users, max(all_wine_ids)+1), dtype=np.float32) for i in range(num_users): # each user corresponds to a row, in the order of all_usernames user_reviews = Review.objects.filter(user_name=all_usernames[i]) for user_review in user_reviews: ratings_m[i,user_review.wine.id] = user_review.rating # Perform kmeans clustering k = int(num_users / 10) + 2 kmeans = KMeans(n_clusters=k) clustering = kmeans.fit(ratings_m.tocsr()) # Update clusters Cluster.objects.all().delete() new_clusters = {i: Cluster(name=i) for i in range(k)} for cluster in new_clusters.values(): # clusters need to be saved before referring to users cluster.save() for i,cluster_label in enumerate(clustering.labels_): new_clusters[cluster_label].users.add(User.objects.get(username=all_usernames[i]))
def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
    # check that fit.predict gives same result as fit_predict
    # There's a very small chance of failure with elkan on unstructured dataset
    # because predict method uses fast euclidean distances computation which
    # may cause small numerical instabilities.
    # NB: This test is largely redundant with respect to test_predict and
    # test_predict_equal_labels. This test has the added effect of
    # testing idempotence of the fitting procedure which appears to
    # be where it fails on some MacOS setups.
    if sys.platform == "darwin":
        pytest.xfail(
            "Known failures on MacOS, See "
            "https://github.com/scikit-learn/scikit-learn/issues/12644")
    if not (algo == 'elkan' and constructor is sp.csr_matrix):
        rng = np.random.RandomState(seed)

        X = make_blobs(n_samples=1000, n_features=10, centers=10,
                       random_state=rng)[0].astype(dtype, copy=False)
        X = constructor(X)

        kmeans = KMeans(algorithm=algo, n_clusters=10, random_state=seed,
                        tol=tol, max_iter=max_iter, n_jobs=1)

        labels_1 = kmeans.fit(X).predict(X)
        labels_2 = kmeans.fit_predict(X)
        assert_array_equal(labels_1, labels_2)
def kmeans_clustering(matrix, N):
    km = KMeans(n_clusters=N, n_jobs=-1)
    clusters = km.fit_predict(matrix)
    res = [[] for _ in range(N)]
    for i, c in enumerate(clusters):
        res[c].append(i)
    return res
def treeGenerator(self, rootLabel, points,names): # rootLabel is label of root # points is list of Feature Vectors # names is the name of the image corresponding Feature vector is in # print rootLabel, len(points) if len(points) < self.threshold: self.adjancency[rootLabel]=[] if rootLabel not in self.leafLabels: self.leafLabels.append(rootLabel) return else: localModel = KMeans(n_clusters = self.branches,n_jobs=4) localModel.fit(points) adj = [] localTree = {} for i in localModel.cluster_centers_: self.treeMap[self.nodes]=i self.nodeImages[self.nodes]=[] # A map for node and the Images It has localTree[tuple(i)]=self.nodes adj.append(self.nodes) self.nodes = self.nodes + 1 self.adjancency[rootLabel]=adj localClusterPoints = [[] for i in range(self.branches)] localClusterImgNames = [[] for i in range(self.branches)] # A local array to store which FV is in which cluster for i in range(len(points)): localClusterPoints[localModel.labels_[i]].append(points[i]) localClusterImgNames[localModel.labels_[i]].append(names[i]) if names[i] not in self.nodeImages[localTree[tuple(localModel.cluster_centers_[localModel.labels_[i]])]]: self.nodeImages[localTree[tuple(localModel.cluster_centers_[localModel.labels_[i]])]].append(names[i]) for i in range(self.branches): thisClusterCenter = tuple(localModel.cluster_centers_[i]) self.treeGenerator(localTree[thisClusterCenter],localClusterPoints[i],localClusterImgNames[i])
def make_tsne_plot(model, rel_wds, plot_lims, title):
    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=dim)
    X = sklearn_pca.fit_transform(X_std)[:, :dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                             np.random.choice(len(keys), k-10, replace=False),
                             ))
    X = X[sample, :]
    keys = [keys[i] for i in sample]

    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)
def showClustering(data): kmeans = KMeans() kmeans.fit(data) labels = kmeans.labels_ uniqueLabels = numpy.unique(labels) nCluster = len(uniqueLabels) centers = kmeans.cluster_centers_ import matplotlib.pyplot as plt from itertools import cycle colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') plt.figure(1) plt.clf() for center in centers: print center for k,col in zip(range(nCluster),colors): members = labels == k print "plotting %dth cluster" % k print "label type" ,labels, type(labels) print "members are:", members, type(members) print "data[members,0]",data[members,0],type(data[members,0]) center = centers[k] plt.plot(data[members,0],data[members,1],col +'.') plt.plot(center[0],center[1],'o',markerfacecolor=col, markeredgecolor = 'k',markersize = 14) plt.title("clusters") plt.show()
def kmeans_cluster(G, graph_name, num_clusters): subgraphs = [] #Find a way to figure out clusters number automatically write_directory = os.path.join(Constants.KMEANS_PATH,graph_name) if not os.path.exists(write_directory): os.makedirs(write_directory) nodeList = G.nodes() matrix_data = nx.to_numpy_matrix(G, nodelist = nodeList) kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10) kmeans.fit(matrix_data) label = kmeans.labels_ clusters = {} for nodeIndex, nodeLabel in enumerate(label): if nodeLabel not in clusters: clusters[nodeLabel] = [] clusters[nodeLabel].append(nodeList[nodeIndex]) #countNodes is used to test whether we have all the nodes in the clusters countNodes = 0 for clusterIndex, subGraphNodes in enumerate(clusters.keys()): subgraph = G.subgraph(clusters[subGraphNodes]) subgraphs.append(subgraph) nx.write_gexf(subgraph, os.path.join(write_directory,graph_name+str(clusterIndex)+Constants.GEXF_FORMAT)) #countNodes = countNodes + len(clusters[subGraphNodes]) pass return num_clusters
def kmeans(content_list):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
    '''
    tokenizer: the tokenizer function to use
    lowercase: whether to lowercase the text before tokenizing; since this is
        Chinese text, it is best left as False
    '''
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)

    num_clusters = 20
    km_cluster = KMeans(n_clusters=num_clusters, max_iter=300, n_init=8,
                        init='k-means++', n_jobs=8)
    '''
    n_clusters: the value of K
    max_iter: maximum number of iterations for a single initialization
    n_init: number of times the initial centroids are re-chosen
    init: the algorithm used to pick the initial centroids
    n_jobs: number of processes; -1 means use all CPUs by default.
        Note that the computation for a single initialization always runs in a
        single process; parallelism only spreads different initializations
        across processes. For example, with n_init=10 and n_jobs=40 on a server
        with 20 CPUs that could start 40 processes, only 10 processes are
        actually launched.
    '''
    # return the cluster index assigned to each text
    result = km_cluster.fit_predict(tfidf_matrix)

    print "Predicting result: ", result
    return result
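# A minimal usage sketch for the helper above (an assumption, not part of the
# original): it presumes jieba is installed, that jieba_tokenize simply wraps
# jieba.lcut, and that the corpus contains at least num_clusters (20) documents.
import jieba

def jieba_tokenize(text):
    # assumed tokenizer: segment Chinese text with jieba
    return jieba.lcut(text)

with open('corpus.txt') as f:                       # hypothetical corpus file
    content_list = [line.strip() for line in f if line.strip()]

labels = kmeans(content_list)                        # one cluster index per document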
def match_line_cluster(gdf1, gdf2):
    """
    Try to match two layers of linestrings with KMeans cluster analysis based on
    a triplet of descriptive attributes :
    (centroid coords., rounded length, approximate bearing)

    Parameters
    ----------
    gdf1: GeoDataFrame
        The reference dataset.
    gdf2: GeoDataFrame
        The collection of LineStrings to match.

    Returns
    -------
    matching_table: pandas.Series
        A table (index-based on *gdf1*) containing the id of the matching
        feature found in *gdf2*.
    """
    param1, param2 = list(map(mparams, [gdf1, gdf2]))
    k_means = KMeans(init='k-means++', n_clusters=len(gdf1),
                     n_init=10, max_iter=1000)
    k_means.fit(np.array((param1 + param2)))
    df1 = pd.Series(k_means.labels_[:len(gdf1)])
    df2 = pd.Series(k_means.labels_[len(gdf1):])
#    gdf1['fid_layer2'] = \
#        df1.apply(lambda x: df2.where(gdf2['key'] == x).notnull().nonzero()[0][0])
    return pd.DataFrame(
        index=list(range(len(gdf1))),
        data=df1.apply(
            lambda x: df2.where(df2 == x).notnull().nonzero())
        )
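# Hedged usage sketch (not from the original code): assumes geopandas is
# available, that mparams is importable from the same module, and that the two
# hypothetical shapefiles below are comparable LineString layers of one network.
import geopandas as gpd

roads_ref = gpd.read_file('roads_reference.shp')
roads_new = gpd.read_file('roads_candidate.shp')

matching_table = match_line_cluster(roads_ref, roads_new)
print(matching_table.head())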
def pca_k_means(self): if not self.pca_reduced: self.pc_analysis() kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10) kmeans.fit(self.pca_reduced, self.player_value) h = .02 x_min, x_max = self.pca_reduced[:, 0].min() - 1, self.pca_reduced[:, 0].max() + 1 y_min, y_max = self.pca_reduced[:, 1].min() - 1, self.pca_reduced[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.figure(1) plt.clf() plt.imshow(Z, interpolation='nearest', extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=plt.cm.Paired, aspect='auto', origin='lower') plt.plot(self.pca_reduced[:, 0], self.pca_reduced[:, 1], 'k.', markersize=2) centroids = kmeans.cluster_centers_ labels = self.pca_labels = kmeans.labels_ intertia = kmeans.inertia_ plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169, linewidths=3, color='w', zorder=10) plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n' 'Centroids are marked with white cross') plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.xticks(()) plt.yticks(()) return {'plt': plt, 'centroids': centroids, 'labels': labels, 'inertia': intertia}
def Corpus_K_Means(TestSample,num_topic): Theta = TestSample.Theta ThetaPredict = np.zeros(Theta.shape) W = TestSample.Word W = np.array(W,dtype='double') estimators = KMeans(n_clusters=num_topic,n_init=5) estimators.fit(W) BetaPredict=estimators.cluster_centers_ Q = 2*BetaPredict.dot(BetaPredict.transpose()) Q = matrix(Q) P = W.dot(BetaPredict.transpose()) G = -np.eye(num_topic) G = matrix(G) h = np.zeros([num_topic,1]) h = matrix(h) A = np.ones([1,num_topic]) A = matrix(A) b = matrix(1.0) solvers.options['show_progress'] = False for i in range(num_topic): p = matrix(P[[i],:].transpose()) sol=solvers.qp(Q, p, G, h, A, b) ThetaPredict[:,[i]] = np.array(sol['x']) Err = ThetaPredict - Theta return np.square(np.linalg.norm(Err))
class AdvancedModel():

    clusters = []

    # price class regression
    price_reg = LinearRegression()

    def fit(self, X_train, y_train, n_clusters=4):
        y_train_mat = np.array(y_train).reshape((-1, 1))

        # 1. determine clusters
        self.km = KMeans(n_clusters=n_clusters)
        self.km.fit(y_train_mat)
        clusters = self.km.cluster_centers_
        cluster_indices = self.km.predict(y_train_mat)
        print(clusters)

        # 2. fit naive bayes
        #self.nb.fit(X_train, ...)
        #self

        # 3. train regression model
        #price_reg.fit

    def predict(self, X):
        pass

    def get_weights(self):
        return np.append(self.price_reg.coef_, [self.price_reg.intercept_])

    def set_weights(self, w):
        self.price_reg.coef_ = w[:-1]
        self.price_reg.intercept_ = w[-1]
def cluster(dat):
    kmean = KMeans(init='k-means++', n_clusters=numclusters, n_init=10)
    y = kmean.fit_predict(dat)
    partition = [[] for i in range(numclusters)]
    for i in range(len(dat)):
        partition[y[i]].append(dat[i])
    return [partition, kmean]
def iris_h2o_vs_sciKmeans(ip,port): # Connect to a pre-existing cluster h2o.init(ip,port) # connect to localhost:54321 iris_h2o = h2o.import_frame(path=h2o.locate("smalldata/iris/iris.csv")) iris_sci = np.genfromtxt(h2o.locate("smalldata/iris/iris.csv"), delimiter=',') iris_sci = iris_sci[:,0:4] s =[[4.9,3.0,1.4,0.2], [5.6,2.5,3.9,1.1], [6.5,3.0,5.2,2.0]] start = h2o.H2OFrame(s) start_key = start.send_frame() h2o_km = h2o.kmeans(x=iris_h2o[0:4], k=3, user_points=start_key, standardize=False) sci_km = KMeans(n_clusters=3, init=np.asarray(s), n_init=1) sci_km.fit(iris_sci) # Log.info("Cluster centers from H2O:") print "Cluster centers from H2O:" h2o_centers = h2o_km.centers() print h2o_centers # Log.info("Cluster centers from scikit:") print "Cluster centers from scikit:" sci_centers = sci_km.cluster_centers_.tolist() print sci_centers for hcenter, scenter in zip(h2o_centers, sci_centers): for hpoint, spoint in zip(hcenter,scenter): assert (hpoint- spoint) < 1e-10, "expected centers to be the same"
def main(): songIds = open("songIDsofFirst100Users.txt","r") try: for line in songIds: songIDsToCluster.append(int(line)) finally: songIds.close() print len(songIDsToCluster) f= sio.loadmat('/home/dmitriy/workspace/MLFinalProject/MatlabFiles/finalVectors.mat') full = np.nan_to_num(np.matrix(f['finalVectors'])) # fullSplit = np.array_split(full, 360) # print("Done Reading") # mtx = fullSplit[0] # print(len(mtx)) mtx = full[songIDsToCluster] mtx /= np.max(np.abs(mtx),axis=0) for clusters in range(25,50): errors = 0 num_clusters = clusters ClusteringKmeans = KMeans(n_clusters=num_clusters) ClusteringKmeans.fit(mtx) result = ClusteringKmeans.labels_ #silhouette = metrics.silhouette_score(mtx,result,metric='euclidean') #plot(mtx,result) writeSongIDandClusterToFile(result,clusters) print("Clusters:", clusters, "Retest Error:", errors)
def run_kmeans(gene_folder, n_clusters):
    pars, fitness = load_all_generations_as_DataFrame(gene_folder)
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(pars)
    means = map(lambda c: fitness[kmeans.labels_ == c].mean()['longest_interval_within_margin'],
                range(n_clusters))
    stds = map(lambda c: fitness[kmeans.labels_ == c].std()['longest_interval_within_margin'],
               range(n_clusters))
    return kmeans, means, stds
def perform_cluster_analysis(dataset): filename = 'elbow_plot.dat' if os.path.exists(cpath + filename): data = joblib.load(cpath + filename) K = data[0] meandistortions = data[1] else: X = dataset print 'X Shape: ', X.shape #K = range(1, 50, 5) K = [1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000] #K = [1, 2, 5, 10, 50, 100] meandistortions = [] cluster_centers = [] for k in K: print k kmeans = KMeans(n_clusters=k, n_jobs=3) kmeans.fit(X) #import ipdb; ipdb.set_trace() # debugging code #meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0]) meandistortions.append(kmeans.inertia_) cluster_centers.append(kmeans.cluster_centers_) #print 'k: ', k, ' Cluster Centers: ', kmeans.cluster_centers_ data = [K, meandistortions] joblib.dump(data, cpath + filename, compress=8) plot_name = "elbow_plot.png" title = 'Selecting k with the Elbow Method' xlabel = 'Number of Clusters (k)' ylabel = 'Average Distortion' xyplot(K, meandistortions, 0, 0, 0, 0, title, xlabel, ylabel, staticpath + plot_name, line=1, y_log=0)
def fit(self, X): """ :param X: :return: """ lcl = range(1, self._maxc+1) # compute the fractal dimension ldistorsion = [] for i in range(1, self._maxc+1): cluster = KMeans(n_clusters=i, n_jobs=-1) cluster.fit(X) ldistorsion.append(within_scatter_matrix_score(X, cluster.labels_)) print(X.shape[1]) print(ldistorsion) PCF = [] for x,y in zip(ldistorsion, lcl): print(x,y, np.power(y, 2.0/X.shape[1])) PCF.append(x * np.power(y, 2.0/X.shape[1])) print(PCF) self._M = np.argmin(PCF) print(self._M)
def reduce_colors(image, n_clusters): image = img_as_float(image) height = len(image) width = len(image[0]) image = image.reshape((height*width,3)) image_mean = {} image_median = {} kmeans = KMeans(n_clusters = n_clusters, init='k-means++', random_state=241) classes = kmeans.fit_predict(image) means, medians = [], [] for cl in range(n_clusters): means.append( np.mean(image[classes == cl], axis = 0)) medians.append( np.median(image[classes == cl], axis = 0)) image_mean = image.copy().astype(float) image_median = image.copy().astype(float) for cl in range(n_clusters): image_mean[classes == cl] = means[cl] image_median[classes == cl] = medians[cl] logging.info('Clusters: %s, PSNR(mean): %s, PSRN(median): %s'%(n_clusters, PSNR(image, image_mean), PSNR(image, image_median))) image_mean = image_mean.reshape(height,width,3) string_image = StringIO() plt.imsave(string_image, image_mean) return string_image
def partition_FOV_KMeans(self, tradeoff_weight=.5, fx=.25, fy=.25, n_clusters=4, max_iter=500):
    """
    Partition the FOV in clusters that group pixels close in space
    and in mutual correlation

    Parameters
    ------------------------------
    tradeoff_weight: between 0 and 1, weights the contributions of distance and correlation in the overall metric
    fx, fy: downsampling factor to apply to the movie
    n_clusters, max_iter: KMeans algorithm parameters

    Outputs
    -------------------------------
    fovs: 2D array encoding the partitions of the FOV
    mcoef: matrix of pairwise correlation coefficients
    distanceMatrix: matrix of pixel distances

    Example
    """
    _, h1, w1 = self.shape
    self.resize(fx, fy)
    T, h, w = self.shape
    Y = np.reshape(self, (T, h * w))
    mcoef = np.corrcoef(Y.T)
    idxA, idxB = np.meshgrid(list(range(w)), list(range(h)))
    coordmat = np.vstack((idxA.flatten(), idxB.flatten()))
    distanceMatrix = euclidean_distances(coordmat.T)
    distanceMatrix = old_div(distanceMatrix, np.max(distanceMatrix))
    estim = KMeans(n_clusters=n_clusters, max_iter=max_iter)
    kk = estim.fit(tradeoff_weight * mcoef - (1 - tradeoff_weight) * distanceMatrix)
    labs = kk.labels_
    fovs = np.reshape(labs, (h, w))
    fovs = cv2.resize(np.uint8(fovs), (w1, h1), old_div(1., fx), old_div(1., fy),
                      interpolation=cv2.INTER_NEAREST)
    return np.uint8(fovs), mcoef, distanceMatrix
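# The docstring's "Example" section is empty; a minimal sketch of a call follows,
# under the assumption that `m` is a movie-like object (a T x height x width array
# subclass) exposing this method -- the variable name is hypothetical.
fovs, mcoef, dist = m.partition_FOV_KMeans(tradeoff_weight=0.5, fx=0.25, fy=0.25,
                                           n_clusters=4, max_iter=500)
plt.imshow(fovs)   # visualize the spatial partition of the field of view
plt.show()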
def findColor(frame): t = time() # dim = np.array(frame.size)/2 # frame.thumbnail(dim, Image.ANTIALIAS) # print "Thumbnail in %0.3f seconds." % (time() - t) # t = time() points = imresize(np.array(frame, dtype=np.float64), 0.3) w,h,d = points.shape data = np.reshape(points, (w*h, d)) sample = shuffle(data, random_state=0)[:len(data)/3] print "Reshape and shuffle in %0.3f seconds." % (time() - t) t = time() kmeans = KMeans(n_clusters=k_colors, n_jobs=jobs).fit(sample) labels = kmeans.predict(data) print "Fit and predict in %0.3f seconds." % (time() - t) t = time() colors = [map(int, color) for color in kmeans.cluster_centers_] # hsvs = np.array([rgb_to_hsv(*values) for values in colors]) # frequent = np.argmax(hsvs[:,1]) # frequent = colors[frequent] print "Found in %0.3f seconds." % (time() - t) frequents = defaultdict(int) for l in labels: frequents[l] += 1 frequents = sorted(frequents.items(), key=lambda x:x[1], reverse=True) frequents = [colors[i[0]] for i in frequents[:3]] # print "Counted in %0.3f seconds." % (time() - t) # print "Top 3 colors [RGB]: ", frequents[:3] return frequents[2] if len(frequents) == 3 else frequents[0]
def makecluster(): n_points=6 n_dim=2 n_clusters=6 model=KMeans(init='k-means++',n_clusters=4,n_init=10) data=np.zeros((16,2)) #print data data1=np.array(temp) data[0:4,:]=2 data[4:8,:]=1 data[8:12:,:]=-1 data[12:16,:]=-2 data[(0,4,8,12),1]=2 data[(1,5,9,13),1]=1 data[(2,6,10,14),1]=-1 data[(3,7,11,15),1]=-2 #data[3,1]=2 #data[4,1]=3 #data[5,1]=2 #data[0,1]=3 model.fit(data1) print data1 print model.labels_
def create_fiveline(image): edges = cv2.Canny(image, 50, 150, apertureSize=3) ys = list() minLineLength = 1 maxLineGap = 10 lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 70, minLineLength, maxLineGap) for line in lines: for x1, y1, x2, y2 in line: cv2.line(image, (x1,y1), (x2,y2), (0, 255, 0), 2) if (abs(y1 - y2 < 4)): innerlist = list() innerlist.append((y1 + y2) / 2) ys.append(innerlist) cv2.imwrite('images/houghlines.jpg', image) display_image(image) kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10) kmeans.fit(np.asarray(ys)) fiveline = list() for innerlist in kmeans.cluster_centers_: fiveline.append(innerlist[0]) fiveline.sort() print "K-MEANS centers" print fiveline return fiveline
model = KeyedVectors.load_word2vec_format(
    './data/GoogleNews-vectors-negative300.bin.gz', binary=True
)

# collect the country names
countries = set()
with open('data/analogy_data_add.txt', 'r') as f:
    for line in f:
        line = line.split()
        if line[0] in ['capital-common-countries', 'capital-world']:
            countries.add(line[2])
        elif line[0] in ['currency', 'gram6-nationality-adjective']:
            countries.add(line[1])
countries = list(countries)

# get the word vectors
countries_vec = [model[country] for country in countries]

from sklearn.cluster import KMeans
import numpy as np

# k-means clustering
kmeans = KMeans(n_clusters=5)
kmeans.fit(countries_vec)
for i in range(5):
    cluster = np.where(kmeans.labels_ == i)[0]
    print('cluster', i)
    print(', '.join([countries[k] for k in cluster]))
def k_means(feature_matrix, num_clusters=10):
    km = KMeans(n_clusters=num_clusters, max_iter=10000)
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters
plt.figure(figsize=(20,20)) for index, (image, label) in enumerate(zip(train_images[0:100], clusterAssignement[0:100])): plt.subplot(5, 20, index + 1) plt.axis("off") plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray) plt.title(label, fontsize = 20) plt.show() ## comparison to the sklearn algorithm pca = PCA(n_components=10) kmeans = KMeans(n_clusters=10,n_init=1) predictor = Pipeline([('pca', pca), ('kmeans', kmeans)]) predict = predictor.fit(test_images).predict(test_images) acc = 0 for i in range(len(predict)): acc += predict[i] == test_labels[i] print("accuracy = ", acc/len(predict)) plt.figure(figsize=(20,20)) for index, (image, label) in enumerate(zip(train_images[0:100], predict[0:100])): plt.subplot(5, 20, index + 1) plt.axis("off") plt.imshow(np.reshape(image, (28,28)), cmap=plt.cm.gray) plt.title(label, fontsize = 20)
# Make a list of (eigenvalue, eigenvector) tuples eig_pairs = [((e_Values[i]), e_Vectors[:,i]) for i in range(len(e_Values))] # Sort the (eigenvalue, eigenvector) tuples from high to low eig_pairs.sort(key=lambda x: x[0], reverse=True) #projection matrix matrix_w = np.hstack((eig_pairs[0][1].reshape(13,1), eig_pairs[1][1].reshape(13,1) )) #kmeans clustering startingpoint = np.vstack((XTrain[0,],XTrain[1,])) kmeans_model = KMeans(algorithm='full', copy_x=True, init=startingpoint,max_iter=300,\ n_clusters=2, n_init=1).fit(data_scaled) #centroid values the algorithm generated y_predict=kmeans_model.fit_predict(data_scaled) #print(y_predict) centroids = kmeans_model.cluster_centers_ print("The Centroids:",centroids) #Projecting the centered data transformed = np.dot(data_scaled,matrix_w) print(transformed)
model.add(Conv2D(16, kernel_size=(3,3), activation='relu', padding='same')) model.add(MaxPooling2D(pool_size=(2,2),padding='same')) model.add(Conv2D(8, kernel_size=(3,3), activation='relu', padding='same')) model.add(MaxPooling2D(pool_size=(2,2),padding='same')) # decoder part : (conv + relu + upsampling) x 3 model.add(Conv2D(8, kernel_size=(3,3), activation='relu', padding='same')) model.add(UpSampling2D(size=(2,2))) model.add(Conv2D(16, kernel_size=(3,3), activation='relu', padding='same')) model.add(UpSampling2D(size=(2,2))) model.add(Conv2D(32, kernel_size=(3,3), activation='relu', padding='same')) model.add(UpSampling2D(size=(2,2))) model.add(Conv2D(3, kernel_size=(3,3), activation='sigmoid', padding='same')) # compile and train the model model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy']) model.fit(X, X, epochs=10, batch_size=5, shuffle=True, verbose=1) #---------- 2. Retrieve encoded image and classify pathways ---------- get_encoded_layer = backend.function([model.layers[0].input],[model.layers[5].output]) encoded_layer = get_encoded_layer([X])[0] X_encoded = encoded_layer.reshape(encoded_layer.shape[0], -1) km = KMeans(n_clusters) km.fit(X_encoded) #---------- 3. Print percentage of each path and corresponding example image ---------- X_clustered = km.labels_ N = float(len(X_clustered)) paths, counts = np.unique(X_clustered, return_counts=True) print "---Output---" for each_path, each_count in zip(paths, counts): idx = np.where(X_clustered==each_path)[0][0] print "path%d (%.2f) %s"%(each_path+1, each_count/N, img_list[idx])
# 'key', # 'loudness', # 'mode', 'speechiness', 'acousticness', 'instrumentalness', # 'liveness', 'valence', # 'tempo', # 'time_signature' ] data = np.array([[track[k] for k in feature_keys] for track in features]) std_data = StandardScaler().fit_transform(data) clustering = KMeans(n_clusters=N_CLUSTERS, random_state=123) clustering.fit(std_data) cluster_labels = clustering.labels_ tsne = TSNE(n_components=2, random_state=123) reduced = tsne.fit_transform(std_data) df = pd.DataFrame(data) df.columns = feature_keys df['x'] = reduced[:, 0] df['y'] = reduced[:, 1] df['added_by'] = display_names df['cluster'] = cluster_labels df['name'] = [track['track']['name'] for track in tracks] df['artists'] = [', '.join(artist['name'] for artist in track['track']['artists']) for track in tracks] df['id'] = [track['track']['id'] for track in tracks]
float(v[23]), float(v[25]), float(v[27]), float(v[28]) ] iris_target.append(stateCode[str(v[4]).strip()]) iris_data.append(line_data) labels_true = np.array(iris_target) data = np.array(iris_data) n_sample = len(data) X = data[:, :2] # Incorrect number of clusters # ############################################################################# # Compute clustering with Means k_means = KMeans(init='k-means++', n_clusters=3, n_init=10) t0 = time.time() k_means.fit(X) t_batch = time.time() - t0 # ############################################################################# # Compute clustering with MiniBatchKMeans mbk = MiniBatchKMeans(init='k-means++', n_clusters=2, batch_size=batch_size, n_init=10, max_no_improvement=10, verbose=0) t0 = time.time() mbk.fit(X)
# -*- coding: utf-8 -*- """ Created on Thu Jun 6 15:36:36 2019 @author: KIIT """ import pandas as pd from sklearn.decomposition import PCA from sklearn.cluster import KMeans import matplotlib.pyplot as plt df=pd.read_csv('crime_data.csv') features=df.iloc[:,[1,2,4]].values pca=PCA(n_components=2) features=pca.fit_transform(features) kmeans = KMeans(n_clusters = 3, init = 'k-means++', random_state = 0) pred_cluster = kmeans.fit_predict(features) plt.scatter(features[pred_cluster == 0, 0], features[pred_cluster == 0, 1], c = 'blue', label = 'LowCrime') plt.scatter(features[pred_cluster == 1, 0], features[pred_cluster == 1, 1], c = 'red', label = 'MedCrime') plt.scatter(features[pred_cluster == 2, 0], features[pred_cluster == 2, 1], c = 'green', label = 'HighCrime') plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c = 'yellow', label = 'Centroids') plt.title('Crime Data') plt.xlabel('P1 Features') plt.ylabel('P2 Features') plt.legend() plt.show()
def kmeanspp(X, k):
    kmeans = KMeans(n_clusters=k, max_iter=1, init='k-means++', n_init=1).fit(X)
    return kmeans.cluster_centers_
digits_test = pandas.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', header=None)

X_train = digits_train[numpy.arange(64)]
y_train = digits_train[64]
print(X_train)
print(y_train)
print('-----------------------------------------')
X_test = digits_test[numpy.arange(64)]
y_test = digits_test[64]

# initialize the KMeans model and set the number of cluster centers to 10
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)
# determine, one by one, which cluster center each test image belongs to
y_predict = kmeans.predict(X_test)

# 0.6592893679369013, 0.6621773801044615
print(metrics.adjusted_rand_score(y_test, y_predict))

plt.subplot(3, 2, 1)

x1 = numpy.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
x2 = numpy.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
print(x1)
print(x2)
# display first 5 rows matrix.head() # Code ends here # -------------- # import packages from sklearn.cluster import KMeans # Code starts here # initialize KMeans object cluster = KMeans(n_clusters=5, init='k-means++', max_iter=300, n_init=10, random_state=0) # create 'cluster' column matrix['cluster'] = cluster.fit_predict(matrix[matrix.columns[1:]]) matrix.head() # Code ends here # -------------- # import packages from sklearn.decomposition import PCA # Code starts here # initialize pca object with 2 components
def forgy(X, k):
    kmeans = KMeans(n_clusters=k, max_iter=1, init='random', n_init=1).fit(X)
    return kmeans.cluster_centers_
# retro_clustering_algo = AgglomerativeClustering(n_clusters=args.clusters, connectivity=proximity) # retro_cluster_ids = clustering_algo.fit_predict(X=new_vectors) # do agglomerative clustering with structure print('agglomerative clustering', file=sys.stderr, ) clustering_algo = AgglomerativeClustering(n_clusters=args.clusters, connectivity=proximity, affinity=args.affinity, linkage=args.linkage) cluster_ids = clustering_algo.fit_predict(X=vectors) color_names = [cluster_colors[c] for i, c in enumerate(cluster_ids) if locations[eligible_cities[i]][-2] == "DE"] cMap = colors.ListedColormap(cluster_colors) print('done', file=sys.stderr, ) # do kmeans clustering print('kmeans clustering', file=sys.stderr, ) dumb_cluster_ids = [] for x in range(KMEANS_AVG): dumb_cluster = KMeans(n_jobs=-1, n_clusters=args.clusters) dumb_cluster_ids.append(dumb_cluster.fit_predict(vectors)) # dumb_cluster_ids = dumb_cluster.fit_predict(X_train_tfidf) print('done', file=sys.stderr, ) if args.show_nuts: NUTS_shape_file = '/Users/dirkhovy/Dropbox/working/lowlands/GeoStats/data/nuts/NUTS_RG_03M_2010.shp' print("reading country outline from %s" % NUTS_shape_file, end=' ', file=sys.stderr) NUTS_shapes = fiona.open(NUTS_shape_file) NUTS2_outlines = {} NUTS3_outlines = {} for item in islice(NUTS_shapes, None): nuts_id = None if item['properties']['STAT_LEVL_'] == 2:
# # Import KMeans from sklearn.cluster import KMeans # Create a KMeans instance with 3 clusters: model model = KMeans(n_clusters=3) # Fit model to points model.fit(points) # Determine the cluster labels of new_points: labels labels = model.predict(new_points) # Print cluster labels of new_points print(labels) # Import pyplot import matplotlib.pyplot as plt # Assign the columns of new_points: xs and ys xs = new_points[:,0] ys = new_points[:,1] # Make a scatter plot of xs and ys, using labels to define the colors plt.scatter(xs,ys,c=labels,alpha=0.5) # Assign the cluster centers: centroids centroids = model.cluster_centers_ # Assign the columns of centroids: centroids_x, centroids_y centroids_x = centroids[:,0]
import numpy as np from sklearn.cluster import KMeans def extractFeatures(filename): features = [] features = np.array([features]) for line in file(filename): row = line.split(',') features = np.append(features, np.array([float(x) for x in row[0:5]])) return np.array(features, dtype=int) filename1 = "/home/tharindra/PycharmProjects/WorkBench/DataMiningAssignment/LabelingBeforeClustering.csv" features = extractFeatures(filename1) features = features.reshape(4149, 5) kmeans = KMeans(n_clusters=3) kmeans.fit(features) centroids = kmeans.cluster_centers_ label = kmeans.labels_ for i in range(len(features)): #print("coordinate:",features[i], "label:", label[i]) print(label[i])
#Get Max and Min for exercised_stock_options
dframe = pd.DataFrame.from_dict(data_dict, orient='index')
dframe_filtered = dframe[dframe[feature_1] != "NaN"]
print ("max is %s" % dframe_filtered[feature_1].max())
print ("min is %s" % dframe_filtered[feature_1].min())

### in the "clustering with 3 features" part of the mini-project,
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
for f1, f2, f3 in finance_features:
    plt.scatter( f1, f2, f3 )
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred
from sklearn.cluster import KMeans
clf = KMeans(n_clusters=3, random_state=0)
pred = clf.fit_predict( finance_features )

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters3points.pdf",
         f1_name=feature_1, f2_name=feature_2)
except NameError:
    print "no predictions object named pred found, no clusters to plot"
def test_using_sklearn(label_true, label_true_test, dataset, datatest): X = numpy.array(dataset) kmeans = KMeans(n_clusters=2, random_state=0).fit(X) cluster_train = kmeans.labels_ arr_test = numpy.array(datatest) cluster_test = kmeans.predict(arr_test) # Evaluation for Full Training print( "\n------------------------ SCIKIT LEARN --------------------------------" ) print( "--------------- K-MEANS SCORE USING DATA TRAIN -----------------------" ) print("ARI SCORE: " + str( adjusted_rand_score(numpy.array(label_true), numpy.array( cluster_train)))) print("MUTUAL INFO SCORE: " + str( adjusted_mutual_info_score(numpy.array(label_true), numpy.array(cluster_train)))) print("HOMOGENEITY SCORE: " + str( homogeneity_score(numpy.array(label_true), numpy.array(cluster_train))) ) print("COMPLETENESS SCORE: " + str( completeness_score(numpy.array(label_true), numpy.array( cluster_train)))) print("V MEASURE SCORE: " + str( v_measure_score(numpy.array(label_true), numpy.array(cluster_train)))) print("FOWLKES-MALLOWS SCORE: " + str( fowlkes_mallows_score(numpy.array(label_true), numpy.array(cluster_train)))) # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true), metric="euclidean"))) print("CALINSKI-HARABAZ SCORE: " + str( calinski_harabaz_score(numpy.array(dataset), numpy.array(label_true)))) # Evaluation for Split Validation print( "--------------- K-MEANS SCORE USING DATA TEST -----------------------" ) print("ARI SCORE: " + str( adjusted_rand_score(numpy.array(label_true_test), numpy.array(cluster_test)))) print("MUTUAL INFO SCORE: " + str( adjusted_mutual_info_score(numpy.array(label_true_test), numpy.array(cluster_test)))) print("HOMOGENEITY SCORE: " + str( homogeneity_score(numpy.array(label_true_test), numpy.array(cluster_test)))) print("COMPLETENESS SCORE: " + str( completeness_score(numpy.array(label_true_test), numpy.array(cluster_test)))) print("V MEASURE SCORE: " + str( v_measure_score(numpy.array(label_true_test), numpy.array( cluster_test)))) print("FOWLKES-MALLOWS SCORE: " + str( fowlkes_mallows_score(numpy.array(label_true_test), numpy.array(cluster_test)))) # print("SILHOUETTE SCORE: " + str(silhouette_score(numpy.array(dataset), numpy.array(label_true_test), metric="euclidean"))) print("CALINSKI-HARABAZ SCORE: " + str( calinski_harabaz_score(numpy.array(datatest), numpy.array(label_true_test)))) return None
def _get_masks(self, output, utt_info):
    '''estimate the masks

    Args:
        output: the output of a single utterance of the neural network
            tensor of dimension [Txfeature_dimension*emb_dim]

    Returns:
        the estimated masks'''

    embeddings = output['bin_emb']
    noise_filter = output['noise_filter']
    #only the non-silence bins will be used for the clustering
    mix_to_mask, _ = self.mix_to_mask_reader(self.pos)

    [T, F] = np.shape(mix_to_mask)
    emb_dim = np.shape(embeddings)[1]/F
    N = T*F
    if np.shape(embeddings)[0] != T:
        raise ValueError('Number of frames in usedbins does not match the sequence length')
    if np.shape(noise_filter)[0] != T:
        raise ValueError('Number of frames in usedbins does not match the sequence length')
    if np.shape(noise_filter)[1] != F:
        raise ValueError('Number of noise filter outputs does not match number of frequency bins')

    #reshape the outputs
    emb_vec = embeddings[:T, :]
    emb_vec_resh = np.reshape(emb_vec, [T*F, emb_dim])

    X_hat_clean = np.multiply(mix_to_mask, noise_filter[:T, :])
    maxbin = np.max(X_hat_clean)
    floor = maxbin/self.usedbin_threshold

    #apply floor to get the used bins
    usedbins = np.greater(X_hat_clean, floor)

    noise_filter_reshape = np.reshape(noise_filter[:T, :], [T*F, 1])
    usedbins_resh = np.reshape(usedbins, T*F)

    #Only keep the active bins (above threshold) for clustering
    output_speech_resh = emb_vec_resh[usedbins_resh]  # dim: N' x embdim (N' is the number of bins that are used, N' < N)
    if np.shape(output_speech_resh)[0] < 2:
        print 'insufficient bins with energy'
        return np.zeros([self.nrS, T, F])
    #apply kmeans clustering and assign each bin to a cluster
    kmeans_model = KMeans(n_clusters=self.nrS, init='k-means++', n_init=10, max_iter=100, n_jobs=-1)

    for _ in range(5):
        # Sometimes it fails due to an IndexError and I'm not sure why. Just retry then, max 5 times.
        try:
            kmeans_model.fit(output_speech_resh)
        except IndexError:
            continue
        break

    A = kmeans_model.cluster_centers_  # dim: nrS x embdim

    prod_1 = np.matmul(A, emb_vec_resh.T)  # dim: nrS x N

    numerator = np.exp(prod_1 - np.max(prod_1, axis=0))
    denominator = np.sum(numerator, axis=0)
    M = numerator/denominator
    M_final = np.multiply(M, np.transpose(noise_filter_reshape))

    #reconstruct the masks from the cluster labels
    masks = np.reshape(M_final, [self.nrS, T, F])
    np.save(os.path.join(self.center_store_dir, utt_info['utt_name']), kmeans_model.cluster_centers_)
    return masks
plt.show()
#data_plot(df)

df[df['Grad.Rate'] > 100]
df['Grad.Rate']['Cazenovia College'] = 100
df[df['Grad.Rate'] > 100]

sns.set_style('darkgrid')
g = sns.FacetGrid(df, hue="Private", palette='coolwarm', size=6, aspect=2)
g = g.map(plt.hist, 'Grad.Rate', bins=20, alpha=0.7)

kmeans = KMeans(n_clusters=2)
kmeans.fit(df.drop('Private', axis=1))
kmeans.cluster_centers_

def converter(cluster):
    if cluster == 'Yes':
        return 1
    else:
        return 0

df['Cluster'] = df['Private'].apply(converter)
import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import KMeans from sklearn.datasets import make_blobs plt.figure(figsize=(12, 12)) n_samples = 1500 random_state = 170 X, y = make_blobs(n_samples=n_samples, random_state=random_state) # Incorrect number of clusters y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X) plt.subplot(221) plt.scatter(X[:, 0], X[:, 1], c=y_pred) plt.title("Incorrect Number of Blobs") # Anisotropicly distributed data transformation = [[ 0.60834549, -0.63667341], [-0.40887718, 0.85253229]] X_aniso = np.dot(X, transformation) y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso) plt.subplot(222) plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred) plt.title("Anisotropicly Distributed Blobs") # Different variance X_varied, y_varied = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state) y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
def fit(self, data, n_clusters):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = KMeans(n_clusters=n_clusters)
    clustering = model.fit(data)
    return clustering
def document_clustering(year): """ Cluster the documents of the year given as a parameter. -------------------- Parameter: year: the year of interest Return: None """ #preprocess(year,year) #query_docs didn't work(Memory error) so I wrote quite similar code below #reports = query_docs(2013, 2014) #Create list of reports reports = [] #Create list of year directory's reports companies = os.listdir('cleaned' + os.sep + str(year)) #The command above inserted some "DS.store"-string in the beginning, so I remove it companies.remove(companies[0]) #Create list of selected companies company = [] # amount_of_files = 100 for i in range(amount_of_files): # Open the report with open('cleaned/' + str(year) + '/' + companies[i], 'r') as file: data = file.read().replace('\n', '') # Append report to the list reports.append(data) #Append selected company to another list company.append(companies[i]) #tf-idf vectorizer = CountVectorizer() X = vectorizer.fit_transform(reports) transformer = TfidfTransformer(smooth_idf=False) tfidf = transformer.fit_transform(X) #K-means clustering num_clusters = 5 km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=1) km.fit(tfidf) clusters = km.labels_.tolist() idea = { 'Filename': company, 'Cluster': clusters } #Creating dict having report's filename with the corresponding cluster number. frame = pd.DataFrame(idea, index=[clusters], columns=['Filename', 'Cluster' ]) # Converting it into a dataframe. #Printing the results for i in range(num_clusters): print("Cluster" + str(i + 1) + ":") cluster_i = frame.loc[[i]] fra = cluster_i['Filename'].tolist() for i in fra: print(i)
detectors = {'sift': cv2.xfeatures2d.SIFT_create, 'surf': cv2.xfeatures2d.SURF_create, 'orb': cv2.ORB_create} detector = detectors.get(sys.argv[2]) if sys.argv[1] == 'train': k = int(sys.argv[5]) print('|||||||| First Class ||||||||') im_class0 = ImagesManager(sys.argv[3], detector()) print('|||||||| Second Class ||||||||') im_class1 = ImagesManager(sys.argv[4], detector()) data = [] for d in im_class0.__get_all_descriptors__(): data.append(d) for d in im_class1.__get_all_descriptors__(): data.append(d) print('|||||||| Kmeans ||||||||') kmeans = KMeans(n_clusters=k, random_state=0).fit(data) print('|||||||| First Class ||||||||') im_class0.__compute_bows__(kmeans, k, True) print('|||||||| Second Class ||||||||') im_class1.__compute_bows__(kmeans, k, True, sum(nb_d for nb_d in im_class0.number_descriptors)) logistic = LogisticRegression() labels = [] for i in range(0, len(im_class0.files)): labels.append(0) for i in range(0, len(im_class1.files)): labels.append(1) bows = [] for b in im_class0.__get_bows__(): bows.append(b) for b in im_class1.__get_bows__():
test_standardscal = standard_scaler.fit(train).transform(test) def compute_error(label,predict): label_list = label.transpose().tolist() count = 0 for i in range(len(predict)): if label_list[i] == predict[i]: count += 1 error = 1-((count / len(predict))) return '{:.4f} '.format(error) scaler = MinMaxScaler() train_scal = scaler.fit(train).transform(train) train_log = np.log(train_scal) ##kmeans ================================================== kmeans = KMeans(n_clusters = 2).fit(train_standardscal) pred_kmeans = kmeans.labels_ import collections collections.Counter(pred_kmeans) compute_error(label,pred_kmeans) kmeans_pred = kmeans.predict(test_standardscal) #========================================================== train_scale = scale(train) test_scale = scale(test) label = train_label[:,0] # 'reg_lambda':[1,2,3,4,5], cv_params = {'n_estimators':[1300,1100]} ind_params = { 'seed':0, 'subsample':0.7, 'min_child_weight':3,
import matplotlib.pyplot as plt from sklearn.datasets import make_blobs attributes, clusters = make_blobs(cluster_std=1) plt.scatter(attributes[:, 0], attributes[:, 1], c=clusters) plt.show() from sklearn.cluster import KMeans attributes, clusters = make_blobs() # Better n_init to be large! k_means = KMeans(3, init="random", n_init=10) assigned = k_means.fit_predict(attributes) # Original, generated clusters plt.scatter(attributes[:, 0], attributes[:, 1], c=clusters) plt.show() # Assigned clusters plt.scatter(attributes[:, 0], attributes[:, 1], c=assigned) plt.show() k_means = KMeans(3, init="k-means++") assigned = k_means.fit_predict(attributes) assigned = k_means.fit_predict(attributes) # Original, generated clusters
import numpy as np import matplotlib.pyplot as plt # Though the following import is not directly being used, it is required # for 3D projection to work from mpl_toolkits.mplot3d import Axes3D from sklearn.cluster import KMeans from sklearn import datasets np.random.seed(5) iris = datasets.load_iris() X = iris.data y = iris.target estimators = [('k_means_iris_8', KMeans(n_clusters=8)), ('k_means_iris_3', KMeans(n_clusters=3)), ('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1, init='random'))] fignum = 1 titles = ['8 clusters', '3 clusters', '3 clusters, bad initialization'] for name, est in estimators: fig = plt.figure(fignum, figsize=(4, 3)) ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134) est.fit(X) labels = est.labels_ ax.scatter(X[:, 3], X[:, 0], X[:, 2],
def call_KM(genre1, genre2, genre3): movies = pd.read_csv('mysite/movies.csv') ratings = pd.read_csv('mysite/ratings.csv') # genre1='Adventure' # genre2='Sci-Fi' # genre3='Action' my_clusters = 0 helper.set_Variables(genre1, genre2, genre3) genre_ratings = helper.get_genre_ratings(ratings, movies, [genre1, genre2], [Dict[genre1], Dict[genre2]]) biased_dataset = helper.bias_genre_rating_dataset(genre_ratings, 3.2, 2.5) print("Number of records: ", len(biased_dataset)) biased_dataset.head() helper.draw_scatterplot(biased_dataset[Dict[genre2]], Dict[genre2], biased_dataset[Dict[genre1]], Dict[genre1], 'mysite/static/mysite/Normal.png') # plt.savefig('mysite/static/mysite/Normal.png') # # plt.close('mysite/static/mysite/Normal.png') X = biased_dataset[[Dict[genre2], Dict[genre1]]].values # TODO: Create an instance of KMeans to find two clusters kmeans_1 = KMeans(n_clusters=2, random_state=0) predictions = kmeans_1.fit_predict(X) helper.draw_clusters(biased_dataset, predictions, 'mysite/static/mysite/TwoCluster.png') # plt.savefig('mysite/static/mysite/TwoCluster.png') # plt.close('TwoCluster.png') # TODO: Create an instance of KMeans to find three clusters kmeans_2 = KMeans(n_clusters=3, random_state=1) predictions_2 = kmeans_2.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_2, 'mysite/static/mysite/ThreeCluster.png') # plt.savefig('mysite/static/mysite/ThreeCluster.png') # plt.close('ThreeCluster.png') # TODO: Create an instance of KMeans to find four clusters kmeans_3 = KMeans(n_clusters=4, random_state=3) predictions_3 = kmeans_3.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_3, 'mysite/static/mysite/FourCluster.png') # plt.savefig('mysite/static/mysite/FourCluster.png') # plt.close('FourCluster.png') possible_k_values = range(2, len(X) + 1, 5) errors_per_k = [helper.clustering_errors(k, X) for k in possible_k_values] list(zip(possible_k_values, errors_per_k)) fig, ax = plt.subplots(figsize=(16, 6)) ax.set_xlabel('K - number of clusters') ax.set_ylabel('Silhouette Score (higher is better)') ax.plot(possible_k_values, errors_per_k) fig.savefig('mysite/static/mysite/score.png') plt.close(fig) # Ticks and grid xticks = np.arange(min(possible_k_values), max(possible_k_values) + 1, 5.0) ax.set_xticks(xticks, minor=False) ax.set_xticks(xticks, minor=True) ax.xaxis.grid(True, which='both') yticks = np.arange(round(min(errors_per_k), 2), max(errors_per_k), .05) ax.set_yticks(yticks, minor=False) ax.set_yticks(yticks, minor=True) ax.yaxis.grid(True, which='both') # TODO: Create an instance of KMeans to find seven clusters kmeans_4 = KMeans(n_clusters=7, random_state=6) predictions_4 = kmeans_4.fit_predict(X) helper.draw_clusters(biased_dataset, predictions_4, 'mysite/static/mysite/BestCluster.png', cmap='Accent') # plt.savefig('mysite/static/mysite/BestCluster.png') # plt.close('BestCluster.png') biased_dataset_3_genres = helper.get_genre_ratings( ratings, movies, [genre1, genre2, genre3], [Dict[genre1], Dict[genre2], Dict[genre3]]) biased_dataset_3_genres = helper.bias_genre_rating_dataset( biased_dataset_3_genres, 3.2, 2.5).dropna() print("Number of records: ", len(biased_dataset_3_genres)) X_with_action = biased_dataset_3_genres[[ Dict[genre2], Dict[genre1], Dict[genre3] ]].values # TODO: Create an instance of KMeans to find seven clusters kmeans_5 = KMeans(n_clusters=7) predictions_5 = kmeans_5.fit_predict(X_with_action) helper.draw_clusters_3d(biased_dataset_3_genres, predictions_5, 'mysite/static/mysite/3DCluster.png') # 
plt.savefig('mysite/static/mysite/3DCluster.png') # plt.close('3DCluster.png') #Merge the two tables then pivot so we have Users X Movies dataframe ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId') user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns='title', values='rating') user_movie_ratings.iloc[:6, :10] n_movies = 30 n_users = 18 most_rated_movies_users_selection = helper.sort_by_rating_density( user_movie_ratings, n_movies, n_users) most_rated_movies_users_selection.head() helper.draw_movies_heatmap(most_rated_movies_users_selection, 'mysite/static/mysite/HeatMap.png')
elif classify == 3: cLabel = 'SVM' clf = SVC() elif classify == 4: cLabel = 'Linear Discriminant Analysis' clf = LDA() elif classify == 5: cLabel = 'Random Forest Classifier' clf = RandomForestClassifier(n_estimators=5) #SVR(C = 1.0, epsilon=0.2) elif classify == 6: cLabel = 'K-means clustering' clf = KMeans(n_clusters=512, init='random') t0 = time.time() clf.fit(train_instances, train_labels) t1 = time.time() nd = len(use_idx) # prediction on training and test data accuracyTr, dev_acc_train, predicted_labels_binary_train = deviceErrors( clf, nd, train_instances, train_labels, train_labels_binary) accuracyTs, dev_acc_test, predicted_labels_binary_test = deviceErrors( clf, nd, test_instances, test_labels, test_labels_binary) # prediction of device energy consumption agg_energy_train = train_instances[:, 5] actEnergy_train = actDevEnergy(device_power, device_timer, nd)
from sklearn.cluster import KMeans
import cPickle as pickle
from time import time
import numpy as np

if __name__ == "__main__":
    with open("../lesson10/dataset.pickle", "rb") as f:
        X = np.load(f)
    print "shape of dataset:", X.shape

    km = KMeans(init='k-means++', n_clusters=500, verbose=1)
    t0 = time()
    km.fit(X)
    print "done in %0.3fs" % (time() - t0)

    with open("km.pickle", "wb") as f:
        pickle.dump(km, f, pickle.HIGHEST_PROTOCOL)
import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn import preprocessing from sklearn.cluster import KMeans from sklearn.mixture import GaussianMixture from sklearn.metrics import confusion_matrix #load data and assign headers iris = load_iris() x = pd.DataFrame(iris.data) x.columns = ['Sepal-Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'] #fit kmeans model kmeans = KMeans(n_clusters=3) kmodel = kmeans.fit(x) #fit em model gmm = GaussianMixture(n_components=3) gmm.fit(x) gmm_labels = gmm.predict(x) #print confusion matrices for both classifications print("Kmeans algorithm:\n ", confusion_matrix(iris.target, kmodel.labels_)) print("\nEM algorithm:\n ", confusion_matrix(iris.target, gmm_labels)) #print scatter plots for iris target clusters and kmeans-em classifications colormap = np.array(['red', 'blue', 'green']) plt.subplot(2, 2, 1) plt.scatter(x.Petal_Length, x.Petal_Width, c=colormap[iris.target], s=40)