# Imports needed by this snippet (faiss is only required for the "faiss" path).
import faiss
from sklearn.cluster import KMeans


def cluster_features(features, n_clusters, implementation, faiss_gpu=False,
                     max_iter=3000, random_state=None):
    if implementation == "sklearn":
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state,
                        max_iter=max_iter)
        kmeans.fit(features)
        print(f"Loss: {kmeans.inertia_}")
        return kmeans.cluster_centers_, kmeans.labels_
    elif implementation == "faiss":
        kmeans = faiss.Kmeans(features.shape[1], n_clusters, niter=max_iter,
                              gpu=faiss_gpu)
        kmeans.train(features)
        # Assign each feature vector to its nearest centroid.
        _, I = kmeans.index.search(features, 1)
        return kmeans.centroids, I.reshape(I.shape[0])
    else:
        print(f"No such kmeans implementation {implementation} available.")
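# Usage sketch (not from the original snippet; data and parameter values are
# illustrative). The faiss path expects a contiguous float32 array.
import numpy as np

features = np.random.rand(1000, 128).astype("float32")
centers, labels = cluster_features(features, n_clusters=10,
                                   implementation="sklearn", random_state=0)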
# Assumes module-level imports: import time; from sklearn.cluster import KMeans as SklearnKMeans
def _fit(self, num_iters=10):
    scores = []
    start = time.time()
    for i in range(num_iters):
        print('Starting sklearn KMeans: %d' % i)
        sklearn_kmeans = SklearnKMeans(n_clusters=self.num_clusters,
                                       init='k-means++', max_iter=50,
                                       n_init=1, tol=1e-4,
                                       random_state=i * 42)
        # sklearn estimators are fitted with .fit(), not .train().
        sklearn_kmeans.fit(self.points)
        scores.append(sklearn_kmeans.inertia_)
    self._report(num_iters, start, time.time(), scores)
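# The _fit method above belongs to a benchmarking class that is not shown
# here. A minimal, hypothetical sketch of the pieces it relies on (class and
# report format are assumptions, not from the original snippet):
import time
from sklearn.cluster import KMeans as SklearnKMeans


class KMeansBenchmark:
    def __init__(self, points, num_clusters):
        self.points = points              # (n_samples, n_features) array
        self.num_clusters = num_clusters

    def _report(self, num_iters, start, end, scores):
        print('Ran %d fits in %.2fs, best inertia: %.4f'
              % (num_iters, end - start, min(scores)))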
def standard_spark_kmeans(data, k, max_iter, random_state):
    from time import time
    from math import sqrt
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.clustering import KMeans

    t1 = time()
    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]' % 10)
    sc = SparkContext(conf=conf)
    data = sc.parallelize(data)

    # Build the model (cluster the data). Note: the `runs` argument was
    # removed from KMeans.train in Spark 2.x, so it is dropped here.
    clusters = KMeans.train(data, k, maxIterations=max_iter,
                            initializationMode="random",
                            seed=random_state, epsilon=1e-4)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    def error(point):
        center = clusters.centers[clusters.predict(point)]
        return sqrt(sum([x ** 2 for x in (point - center)]))

    WSSSE = data.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print(time() - t1)
    print(WSSSE)
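# Hypothetical invocation (not from the original snippet); each row of the
# array becomes one point in the RDD.
import numpy as np

sample_points = np.random.rand(500, 8)
standard_spark_kmeans(sample_points, k=4, max_iter=50, random_state=7)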
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

# `tagged_data` is assumed to be a list of TaggedDocument objects built earlier.
max_epochs = 100
vec_size = 20
alpha = 0.025

# `vector_size` and `model.epochs` are the gensim 4.x names for the older
# `size` and `model.iter` used in earlier gensim versions.
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025,
                min_count=1, dm=1)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data, total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

model = Doc2Vec.load("d2v.model")

# to find the vector of a document which is not in training data
test_data = word_tokenize("Sports".lower())
v1 = model.infer_vector(test_data)
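# Follow-up sketch (not in the original snippet): use the inferred vector to
# find the most similar training documents. In gensim 4.x the document
# vectors are exposed as `model.dv`.
similar_docs = model.dv.most_similar([v1], topn=5)
print(similar_docs)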
import numpy as np
from pandas import DataFrame
from sklearn.decomposition import PCA
from pyspark.mllib.clustering import KMeans

# Assumes `scale_data` and `raw_data_total` are DataFrames built earlier and
# `sc` is an existing SparkContext.
scale_data = scale_data.to_numpy()  # DataFrame.as_matrix() was removed in newer pandas

# Use PCA to reduce the dimension
pca = PCA(n_components=20)
pca.fit(scale_data)
transform_data = pca.transform(scale_data)
data_array = transform_data
DataFrame(transform_data).to_csv('temp.csv')

# Use K-means to classify the data (the `runs` argument was removed from
# KMeans.train in Spark 2.x, so it is dropped here)
model = KMeans.train(sc.parallelize(data_array), 6, maxIterations=50,
                     initializationMode="random")

data_cluster = np.array([])
for k in range(0, data_array.shape[0]):
    clusters = model.predict(data_array[k])
    data_cluster = np.append(data_cluster, clusters)
    print('data of row', k, 'is cluster:', clusters)

file_path = raw_data_total['file_path']
cluster_dataframe = DataFrame({
    'file_path': file_path,
    'cluster': data_cluster
})
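# Follow-up sketch (filename is an assumption): persist the per-file cluster
# assignments alongside the PCA output written above.
cluster_dataframe.to_csv('clusters.csv', index=False)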