def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
    print("ClusterBalancing...")
    indexesPicked = []
    obs1 = self.observations[indexesToPick]
    obs = normalize(obs1, axis=0)
    if len(indexesToPick) != 0:
        if kmeansFlag:
            if len(indexesToPick) < self.numClusters:
                cluster = KMeans(init='k-means++', n_clusters=len(obs), n_init=10)
            else:
                cluster = KMeans(init='k-means++', n_clusters=self.numClusters, n_init=10)
        else:
            # SpectralClustering (the estimator form of spectral_clustering) is needed here,
            # since the code below relies on fit() and labels_
            if len(indexesToPick) < self.numClusters:
                cluster = SpectralClustering(n_clusters=len(obs), n_init=10)
            else:
                cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
        cluster.fit(obs)
        labels = cluster.labels_
        whenToStop = max(2, stopCount)
        count = 0
        while count != whenToStop:
            cluster_list = list(range(self.numClusters))
            index = 0
            for j in labels:
                if j in cluster_list:
                    indexesPicked.append(indexesToPick[index])
                    cluster_list.remove(j)
                    count += 1
                    if count == whenToStop:
                        break
                    labels[index] = -1
                    if len(cluster_list) == 0:
                        break
                index += 1
    return indexesPicked
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('loading data')
    data = np.loadtxt(args.data_points)
    if args.root is not None:
        data = np.sqrt(data)
    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)
    log.info('calculate center points')
    kmeans = KMeans(n_clusters=k, init=initial_points, n_init=1, max_iter=args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)
    log.info('storing results')
    if args.model:
        save_object_to_file(kmeans, args.model)
    with utf8_file_open(args.outfile, 'w') as outfile:
        for i in range(predict.shape[0]):
            outfile.write('%d\n' % predict[i])
    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)
    log.info('finished')
def performKmeans(data, n_clusters):
    print("Performing K-Means on data")
    est = KMeans(n_clusters=n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    return est
def start_algorithm(self):
    """
    Start clustering the stored tweets.
    :return: list of clusters containing tweets
    """
    vectors = self.vectorize_data()
    kmeans = KMeans(init='k-means++', n_clusters=self.cluster_amount, n_init=10)
    kmeans.fit(vectors)
    return self.cluster_tweet(kmeans.labels_)
def performKmeans(data, n_clusters):
    print("Performing K-Means on data")
    est = KMeans(n_clusters=n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)  # unused; labels_ is already a NumPy array
    return labels, est
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with the k-means algorithm, then returns a string containing the method name and
    metrics, together with the fitted cluster centers.
    :param data: Points that need to be clustered, as a numpy array
    :param labels: True labels for the given points
    :param nclusters: Total number of clusters
    :param method_name: Name of the method from which the clustering space originates (only used for printing)
    :return: Formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels, kmeans.labels_), kmeans.cluster_centers_
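# A minimal usage sketch for evaluateKMeans, assuming getClusterMetricString is defined elsewhere
# in this module; the blob data and the "raw" method name are illustrative only.
from sklearn.datasets import make_blobs

X, y_true = make_blobs(n_samples=300, centers=4, random_state=0)
metrics_str, centers = evaluateKMeans(X, y_true, nclusters=4, method_name="raw")
print(metrics_str)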
def get_params(self, deep=True):
    """Overrides superclass get_params() to allow superclass hyperparameters to be returned as well.

    This allows for tuning any parameters from the parent KMeans class without having to list each
    parameter in the KMeansMM __init__() function.
    Taken from https://stackoverflow.com/questions/51430484/how-to-subclass-a-vectorizer-in-scikit-learn-without-repeating-all-parameters-in.

    Parameters
    ----------
    deep : boolean, optional
        If True, will return the parameters for this estimator and contained subobjects that are estimators.

    Returns
    -------
    params : mapping of string to any
        Parameter names mapped to their values.
    """
    params = super().get_params(deep)
    cp = copy.copy(self)
    cp.__class__ = KMeans
    params.update(KMeans.get_params(cp, deep))
    return params
def initial_kmeans(k, rand_state, data, reallabels):
    min_clusters, max_clusters = k_range(k)  # derive the range of cluster counts to test from the number of true class labels
    bestAri_arr = []  # best ARI value found for each cluster count
    # bestCr_arr = []  # best CR value found for each cluster count
    kmeans_labels = []  # best partition found for the current cluster count
    kmeans_labels_arr = []  # best partition found for each cluster count
    for clusters in range(min_clusters, max_clusters):
        bestAri = 0  # best ARI value for this cluster count
        # bestCr = -1
        for i in range(ini_generation):
            # note: with a fixed random_state every run of this inner loop yields the same partition
            y_kmeans = KMeans(n_clusters=clusters, random_state=rand_state).fit_predict(data)
            kmeans_ari = adjusted_rand_score(reallabels, y_kmeans)
            # kmeans_cr = corrected_rand(reallabels, y_kmeans)
            if kmeans_ari > bestAri:
                bestAri = kmeans_ari
                kmeans_labels = y_kmeans
            # if kmeans_cr > bestCr:
            #     bestCr = kmeans_cr
        # bestCr_arr.append(bestCr)
        bestAri_arr.append(bestAri)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    # print('best CR values for kmeans: %s' % bestCr_arr)
    return kmeans_labels_arr, bestAri_arr
def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
             precompute_distances='auto', verbose=0, random_state=None, copy_x=True,
             n_jobs=None, algorithm='auto'):
    self._hyperparams = {
        'n_clusters': n_clusters,
        'init': init,
        'n_init': n_init,
        'max_iter': max_iter,
        'tol': tol,
        'precompute_distances': precompute_distances,
        'verbose': verbose,
        'random_state': random_state,
        'copy_x': copy_x,
        'n_jobs': n_jobs,
        'algorithm': algorithm}
    self._wrapped_model = SKLModel(**self._hyperparams)
def fit(self, X, y=None):
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is not None:
        self._sklearn_model.fit(X, y)
    else:
        self._sklearn_model.fit(X)
    return self
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets) ** 2
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {}
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out, syn_in),
                                                       wn.path_similarity(syn_in, syn_out))
            if i % 10000 == 0:
                print(i, 'synsets processed out of', len(global_synsets) ** 2, '(', float(i) / t, '%)')
            i += 1
    tuples = [(item[0], list(item[1].values())) for item in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]
    # Rule of thumb: n = sqrt(number of synsets / 2); KMeans needs an integer cluster count
    n = int(sqrt(len(global_synsets) / 2))
    print("Number of clusters", n)
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
    pprint.pprint(dict(clustering), width=1)
    feature_vector = np.zeros([len(corpus), n])
    for i, comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print(i, 'comments processed')
    print(feature_vector)
def evaluate_k_means_raw(data, true_labels, n_clusters, k_init):
    """
    Clusters data with the K-Means algorithm and then returns clustering accuracy and NMI
    :param data: Points that need to be clustered as a numpy array
    :param true_labels: True labels for the given points
    :param n_clusters: Total number of clusters
    :return: ACC, NMI
    """
    # References:
    # https://github.com/Datamine/MNIST-K-Means-Clustering/blob/master/Kmeans.ipynb
    # http://johnloeber.com/docs/kmeans.html
    # Lloyd's algorithm for K-Means clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init=k_init)
    kmeans.fit(data)
    acc = cluster_acc(true_labels, kmeans.labels_)
    nmi = metrics.normalized_mutual_info_score(true_labels, kmeans.labels_)
    return acc, nmi
def getClustering(method_name="k-mean", param_map={}):
    if method_name == "k-mean":
        from sklearn.cluster import KMeans  # sklearn.cluster.k_means_ is a private path removed in newer scikit-learn releases
        return KMeans(**param_map)
    elif method_name == "dbscan":
        from sklearn.cluster import DBSCAN
        return DBSCAN(**param_map)
    return None
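# A minimal usage sketch for getClustering; the parameter maps below are illustrative only.
km = getClustering("k-mean", {"n_clusters": 3, "n_init": 10})
db = getClustering("dbscan", {"eps": 0.5, "min_samples": 5})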
def getClusters(input_data):
    km = KMeans(n_clusters=10, random_state=0).fit(input_data)
    centers = km.cluster_centers_
    # =========================== ADD BIAS ===========================
    # np.insert returns a new array, so the result must be assigned for the bias column to stick
    centers = np.insert(centers, 0, 1, 1)
    print("Centers : ", centers.shape)
    return centers
def runKMeans(distance_matrix, nClusters, number_of_threads):
    # precompute_distances and n_jobs were deprecated and later removed in newer scikit-learn releases
    km = KMeans(n_clusters=nClusters, max_iter=100, init='k-means++',
                precompute_distances=True, n_jobs=number_of_threads)
    km.fit(distance_matrix)
    labels = km.labels_
    # note: KMeans never assigns the noise label -1, so n_noises will always be 0 here
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)
    print('Number of clusters: ' + str(n_clusters))
    print('Number of noises: ' + str(n_noises))
    return list(labels)
def train_k_means(n_clusters, init_type, x_array, y, eps, n_init):
    DIGIT_COUNT = 10
    inertias = []
    iterations = []
    entropys = []
    for i in range(n_init):
        # fill matrix with zeros (np.int is deprecated in recent NumPy, so plain int is used)
        n_matrix = np.zeros((n_clusters, DIGIT_COUNT), dtype=int)
        if init_type == "random":
            init = "random"
        elif init_type == "k-away":
            init = get_k_away_centers(x_array, n_clusters)
        else:
            raise NotImplementedError
        # n_jobs was removed from KMeans in newer scikit-learn releases
        clf = KMeans(init=init, n_clusters=n_clusters, n_init=1, n_jobs=-1, tol=eps)
        clf.fit(x_array)
        # Q value (within-cluster sum of squares)
        inertias.append(clf.inertia_)
        # number of iterations until convergence
        iterations.append(clf.n_iter_)
        # fill the cluster/digit contingency matrix from the predicted labels
        for j in range(len(y)):
            digit = y[j]
            cluster = clf.labels_[j]
            n_matrix[cluster][digit] += 1
        n = float(len(y))
        # print("n_matrix = ", [v for v in n_matrix])
        # reduce must be imported from functools on Python 3
        Hyz = -reduce(lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
                      [n_matrix[cluster][digit] / n
                       for cluster in range(n_clusters)
                       for digit in range(DIGIT_COUNT)], 0.0)
        Hz = -reduce(lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
                     [sum(n_matrix[cluster], 0.0) / n for cluster in range(n_clusters)], 0.0)
        # print("Hyz = %s" % Hyz)
        # print("Hz = %s" % Hz)
        entropys.append(Hyz - Hz)
    return iterations, inertias, entropys
def trainmodel5(self, filename):
    df = pd.read_csv(filename)
    df.loc[:, 'animal'].replace(['sheep', 'cow'], [1, 2], inplace=True)
    df = df.drop(df[df['scale'] == 0].index)
    x = df.loc[:, 'animal':'age']
    y = df.loc[:, 'weight']
    self.x = x
    self.y = y
    self.data = df
    self.spliter(x, y)
    self.linier.fit(self.x, self.y)
    n_neighbors = 3
    for i, weights in enumerate(['uniform', 'distance']):
        self.knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
        self.knn.fit(self.x, self.y)
    self.kmeans = KMeans(n_clusters=5)
    self.kmeans.fit(self.x)
def K_means_BERT(datasets, pred_vector, labels, opt):
    # datasets: a list, each element is a [3, max_len] array sample
    # pred_vector: a model's function to predict embeddings
    # num_classes: number of classes
    num_classes = len(np.unique(labels))
    feature_embeddings = model_pred_BERT(datasets, pred_vector, labels, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
def create_train_kmeans(data, number_of_clusters=len(codes)):
    # n_jobs is set to -1 to use all available CPU cores. This makes a big difference on an 8-core CPU,
    # especially when the data size gets much bigger. #perfMatters
    # (note: n_jobs was removed from KMeans in newer scikit-learn releases)
    k = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)
    # Let's do some timings to see how long it takes to train.
    start = time.time()
    # Train it up
    k.fit(data)
    # Stop the timing
    end = time.time()
    # And see how long that took
    print("Training took {} seconds".format(end - start))
    return k
def __init__(self, tweet_file_path, no_of_clusters):
    """ The constructor reads the csv file and builds the data matrix. """
    self.np_extractor = ConllExtractor()
    self.pos_tagger = NLTKTagger()
    self.tweet_file_path = tweet_file_path
    self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
    self.vectorizer = DictVectorizer(sparse=True)
    self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
def test_cifar10():
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    X_train = X_train.reshape((50000, 32 * 32 * 3))
    X_test = X_test.reshape((10000, 32 * 32 * 3))
    y_train = y_train.reshape((50000,))
    y_test = y_test.reshape((10000,))
    distortions = []
    X_ = X_test[y_test == 4]
    K = range(1, 30)
    for k in K:
        kmeanModel = KMeans(n_clusters=k, random_state=84)
        kmeanModel.fit(X_)
        distortions.append(sum(np.min(cdist(X_, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_.shape[0])
    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
    # alg = LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84)
    # alg.fit(X_train, y_train)
    # y_pred = alg.predict(X_test)
    # score = accuracy_score(y_test, y_pred)
    # print(score)
    pl = PluralizatorClassifier(
        LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84),
        'k-means',
        {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3, 9: 3},
        random_state=84, n_jobs=-1)
    pl.fit(X_train, y_train)
    y_pred = pl.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(score)
    return
def initial_kmeans(k, rand_state, data):
    min_clusters, max_clusters = k_range(k)  # derive the range of cluster counts to test from the number of true class labels
    kmeans_labels_arr = []  # best partition found for each cluster count
    for clusters in range(min_clusters, max_clusters):
        kmeans_labels = KMeans(n_clusters=clusters, random_state=rand_state).fit_predict(data)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    return kmeans_labels_arr
def main():
    predicted_labelAll = []
    datamat, datalabels = loadDataset("../dataset/iris.data")
    print('data ready')
    nmi_max = -inf
    ari_max = -inf
    for i in range(10):
        clusters = random.randint(2, 11)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(datamat)
        predicted_label = predicted_label.tolist()
        nmi = normalized_mutual_info_score(datalabels, predicted_label)
        ari = adjusted_rand_score(datalabels, predicted_label)
        if nmi > nmi_max:
            nmi_max = nmi
        if ari > ari_max:
            ari_max = ari
    print('NMI value:')
    print(nmi_max)
    print('ARI value:')
    print(ari_max)
def simulation(n, n_clusters, k_range, dim, runs=100):
    all_data = []
    k_low, k_hi = k_range
    for idx in range(runs):
        data, labels = make_blobs(n_samples=n, n_features=dim, centers=n_clusters,
                                  cluster_std=0.1, center_box=(-1.0, 1.0))
        for k in range(k_low, k_hi + 1):
            # Specify a model, fit it to the data, and score the result with the silhouette coefficient
            # (silhouette_score requires at least 2 clusters, so k_low should be >= 2)
            model = KMeans(n_clusters=k, random_state=0)
            labels = model.fit_predict(data)
            avg_score = silhouette_score(data, labels)
            all_data.append([n, n_clusters, k, dim, avg_score])
    df = pd.DataFrame(all_data, columns=['n', 'n_clusters', 'k', 'dim', 'avg_score'])
    return df
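# A minimal usage sketch for simulation, assuming make_blobs, silhouette_score and pandas are
# imported as above; the parameter values are illustrative only.
df = simulation(n=200, n_clusters=3, k_range=(2, 6), dim=2, runs=5)
print(df.groupby('k')['avg_score'].mean())  # average silhouette per candidate k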
def run_kmeans(data, label, k=3, fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, stop_words='english', use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print(label, np.bincount(km.labels_))
    assert len(km.labels_) == len(data)
    f = open(fname + str(int(label)) + ".csv", 'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject, body = data[i]
        subject = " ".join(str(subject).split())
        body = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        f.write(subject + "\t" + body + "\t" + cluster_id + '\n')
    f.close()
def bisection(max_k: int, data: np.ndarray) -> tree_node:
    current_k = 1
    data_centroid = np.mean(data, 0)
    root = tree_node(0, data_centroid)
    root_sse = sum_square_error(data_centroid, data)
    next_split_order = 1
    next_node_id = 1
    queue = PriorityQueue()
    queue.put((-1.0 * root_sse, root, data))
    # print(f"rootsse {root.sse}")
    while current_k < max_k:
        _, leaf_to_split, split_data = queue.get()
        # print(f"leaf_to_split sse {leaf_to_split.sse}")
        leaf_to_split.split_order = next_split_order
        next_split_order += 1
        k = KMeans(2)
        labels = np.array(k.fit_predict(split_data), dtype=np.float32)
        labels = labels.reshape([len(labels), 1])
        left_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 0])
        left_data = split_data[left_idx, :]
        left_child = tree_node(next_node_id, np.mean(left_data, 0))
        next_node_id += 1
        leaf_to_split.left_child = left_child
        queue.put((-1.0 * sum_square_error(left_child.centroid, left_data), left_child, left_data))
        # print(f"left_child sse {left_child.sse}")
        right_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 1])
        right_data = split_data[right_idx, :]
        right_child = tree_node(next_node_id, np.mean(right_data, 0))
        next_node_id += 1
        leaf_to_split.right_child = right_child
        queue.put((-1.0 * sum_square_error(right_child.centroid, right_data), right_child, right_data))
        # print(f"right_child sse {right_child.sse}")
        current_k += 1  # each split adds exactly one leaf node
    _assign_leaf_ids(root)
    return root
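# A minimal usage sketch for the bisecting k-means routine above, assuming tree_node,
# sum_square_error and _assign_leaf_ids are defined alongside it; the blob data is illustrative only.
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.5, random_state=0)
root = bisection(max_k=4, data=X)  # builds a split tree with 4 leaf clusters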
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    # eps = 1e-4
    # eps = 0.1
    # eps = 100.0
    # prev_sample = np.array(clf.cluster_centers_, np.float)
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1, n_jobs=-1, tol=eps, max_iter=1)
    # if isinstance(prev_centers, str):
    #     prev_centers = clf.cluster_centers_
    clf.fit(x_array)
    new_centers = clf.cluster_centers_
    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1, n_jobs=-1, tol=eps,
                     max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    # print("k = %s, len centers = %s" % (n_clusters, len(f_values)))
    return args, values, centers_list
def plot_job_cluster(n_clusters, no_jobs, subset, kmeans=None):
    if not kmeans:
        kmeans = KMeans(n_clusters=n_clusters)
    job_predict = kmeans.fit_predict(subset)
    plot_california_counties()
    for i in range(n_clusters):
        mean_jobs = np.mean([no_jobs[j] for j in range(len(no_jobs)) if job_predict[j] == i])
        plt.scatter(
            [subset[j][0] for j in range(len(subset)) if job_predict[j] == i],
            [subset[j][1] for j in range(len(subset)) if job_predict[j] == i],
            label=f"Mean No. Jobs:{mean_jobs:.0f}",
            s=4.5)
    # city_labels()
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
def test_KMeansConstrained_parity_digits():
    iris = datasets.load_iris()
    X = iris.data
    k = 8
    random_state = 1
    size_min, size_max = None, None  # No restrictions, so this should produce the same result as plain KMeans
    clf_constrained = KMeansConstrained(size_min=size_min, size_max=size_max, n_clusters=k,
                                        random_state=random_state, init='k-means++', n_init=10,
                                        max_iter=300, tol=1e-4)
    y_constrained = clf_constrained.fit_predict(X)
    # TODO: scikit-learn is pinned to v0.19 for testing because of a discrepancy in scikit-learn v0.22:
    # https://github.com/scikit-learn/scikit-learn/issues/16623
    clf_kmeans = KMeans(n_clusters=k, random_state=random_state, init='k-means++', n_init=10,
                        max_iter=300, tol=1e-4)
    y_kmeans = clf_kmeans.fit_predict(X)
    # Each cluster should have the same number of data points assigned to it
    constrained_ndp = pd.Series(y_constrained).value_counts().values
    kmeans_ndp = pd.Series(y_kmeans).value_counts().values
    assert_almost_equal(constrained_ndp, kmeans_ndp)
    # Sort the cluster coordinates (otherwise they are in a random order)
    constrained_cluster_centers = sort_coordinates(clf_constrained.cluster_centers_)
    kmean_cluster_centers = sort_coordinates(clf_kmeans.cluster_centers_)
    assert_almost_equal(constrained_cluster_centers, kmean_cluster_centers)
def K_means(datasets, pred_vector, num_classes, opt):
    '''
    Args:
        datasets: a list, each element is a [3, max_len] array sample
        pred_vector: a model's function to predict embeddings
        num_classes: number of classes
    Returns:
        K-means results -- a tuple (label_list, message, cluster_centers, features)
    '''
    feature_embeddings = model_pred(datasets, pred_vector, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return label_list, create_msg(label_list), kmeans.cluster_centers_, feature_embeddings
def perLabel(label_name, labels, sample_size, n_clusters):
    print(79 * '_')
    print(label_name)
    print('% 9s' % 'feature'
          '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
    # print("number of distinct classes for true labels for ", label_name, len(Counter(labels)))
    estimator = KMeans(n_clusters=n_clusters)
    bench_k_means(labels, sample_size, estimator, "RGB", rgb_data)
    bench_k_means(labels, sample_size, estimator, "LAB", lab_data)
    bench_k_means(labels, sample_size, estimator, "HOG", hog_data)
    bench_k_means(labels, sample_size, estimator, "GIST", gist_data)
    bench_k_means(labels, sample_size, estimator, "SURF", surf_data)
    bench_k_means(labels, sample_size, estimator, "SIFT", sift_data)
    bench_k_means(labels, sample_size, estimator, "ORB", orb_data)
def get_data_for_kl_loss(self, encode_output, label_list, n_clusters):
    """
    Returns centroids for the KL-divergence loss.
    :param encode_output: encoder output
    :param label_list: labels for the encoder output
    :param n_clusters: number of clusters
    :return: centroids
    """
    # if self.use_cuda is False:
    #     data = np.copy(encode_output.data)
    #     label = np.copy(label_list.data)
    # else:
    #     data = np.copy(encode_output.data.cpu())
    #     label = np.copy(label_list.data.cpu())
    data = encode_output
    data_len = len(data)
    if data_len < n_clusters:
        n_clusters = data_len
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=self.k_init)
    # Fit the input data
    kmeans.fit(data)
    # Centroid values
    centroids = kmeans.cluster_centers_
    if self.use_cuda:
        return Variable(torch.from_numpy(centroids).float().cuda())
    return Variable(torch.from_numpy(centroids).float())
# Python 2 / early scikit-learn version of compute_bench; an updated Python 3 version follows below.
def compute_bench(samples_range, features_range):
    it = 0
    iterations = 200
    results = defaultdict(lambda: [])
    chunk = 100
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '=============================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '=============================='
            print ''
            data = nr.random_integers(-50, 50, (n_samples, n_features))
            print 'K-Means'
            tstart = time()
            kmeans = KMeans(init='k-means++', k=10).fit(data)  # `k` was later renamed to `n_clusters`
            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %0.5f" % kmeans.inertia_
            print ''
            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)
            print 'Fast K-Means'
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++', k=10, chunk_size=chunk)  # `chunk_size` was later renamed to `batch_size`
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print "Speed: %0.3fs" % delta
            print "Inertia: %f" % mbkmeans.inertia_
            print ''
            print ''
            results['minibatchkmeans_speed'].append(delta)
            results['minibatchkmeans_quality'].append(mbkmeans.inertia_)
    return results
def compute_bench(samples_range, features_range):
    it = 0
    results = defaultdict(lambda: [])
    chunk = 100
    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('==============================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('==============================')
            print()
            data = nr.randint(-50, 51, (n_samples, n_features))
            print('K-Means')
            tstart = time()
            kmeans = KMeans(init='k-means++', n_clusters=10).fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %0.5f" % kmeans.inertia_)
            print()
            results['kmeans_speed'].append(delta)
            results['kmeans_quality'].append(kmeans.inertia_)
            print('Fast K-Means')
            # let's prepare the data in small chunks
            mbkmeans = MiniBatchKMeans(init='k-means++', n_clusters=10, batch_size=chunk)
            tstart = time()
            mbkmeans.fit(data)
            delta = time() - tstart
            print("Speed: %0.3fs" % delta)
            print("Inertia: %f" % mbkmeans.inertia_)
            print()
            print()
            results['MiniBatchKMeans Speed'].append(delta)
            results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_)
    return results
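# A minimal usage sketch for compute_bench, assuming numpy.random is imported as nr and that time,
# defaultdict, KMeans and MiniBatchKMeans are in scope as above; the ranges are illustrative only.
results = compute_bench(samples_range=[1000, 5000], features_range=[10, 50])
print(results['kmeans_speed'])
print(results['MiniBatchKMeans Speed'])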
def perform():
    # imagesHandler.load_images()
    # colourHandler.extract_colour_distribution_from_all_images("RGB")
    RGB_data = colourHandler.getColourDistForAllImages("RGB")
    RGB_data = np.array(RGB_data, dtype=None)
    RGB_data = np.delete(RGB_data, 0, 1)
    LAB_data = colourHandler.getColourDistForAllImages("LAB")
    LAB_data = np.array(LAB_data, dtype=None)
    LAB_data = np.delete(LAB_data, 0, 1)
    gistVals = util.loadCSV("gistvals")
    gist_data = np.array(gistVals)
    # hogHandler.extract_hog_from_all_images()
    hog_data = hogHandler.getHogValsforAllImages()
    hog_data = np.array(hog_data, dtype=None)
    hog_data = np.delete(hog_data, 0, 1)
    hog_data = np.array(hog_data)
    # surfCodebook.run_codebook(n_clusters, 400, 0.3, cv2.INTER_CUBIC, 0)
    surf_data = surf_cb_handler.get_distributions()
    surf_data = np.array(surf_data, dtype=None)
    surf_data = np.delete(surf_data, 0, 1)
    sift_data = sift_cb_handler.get_distributions()
    sift_data = np.array(sift_data, dtype=None)
    sift_data = np.delete(sift_data, 0, 1)
    orb_data = orb_cb_handler.get_distributions()
    orb_data = np.array(orb_data, dtype=None)
    orb_data = np.delete(orb_data, 0, 1)
    est = KMeans(n_clusters=30)
    print(79 * '_')
    print('% 9s' % 'init'
          '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')
    bench_k_means(est, "colourPerfomanceVmeta", colour_data)
    bench_k_means(est, "hogPerfomanceVmeta", hog_data)
def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    plt.figure(figsize=(6, 8))
    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)
    cluster_sum_jobs, cluster_sum_employ_edu = [], []
    for i in range(n_clusters):
        cluster_sum_employ_edu.append(sum_cluster(empl_predict, i, no_edu) / sum(no_edu))
        cluster_sum_jobs.append(sum_cluster(job_predict, i, no_jobs) / sum(no_jobs))
    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs, cluster_sum_employ_edu)
    city_labels()
    plot_california_counties()
    plot_california()
    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j] == 0:  # NO LINK
                continue
            plt.scatter(jobs_centres[i][0] if i < n_clusters else emp_edu_centres[i - n_clusters][0],
                        jobs_centres[i][1] if i < n_clusters else emp_edu_centres[i - n_clusters][1],
                        edgecolors='b', facecolors='none')
            plt.scatter(jobs_centres[j][0] if j < n_clusters else emp_edu_centres[j - n_clusters][0],
                        jobs_centres[j][1] if j < n_clusters else emp_edu_centres[j - n_clusters][1],
                        edgecolors='b', facecolors='none')
            plt.plot(
                (jobs_centres[i][0] if i < n_clusters else emp_edu_centres[i - n_clusters][0],
                 jobs_centres[j][0] if j < n_clusters else emp_edu_centres[j - n_clusters][0]),
                (jobs_centres[i][1] if i < n_clusters else emp_edu_centres[i - n_clusters][1],
                 jobs_centres[j][1] if j < n_clusters else emp_edu_centres[j - n_clusters][1]),
                'b-')
    plt.show()
def group_by_proximity(self, k=10):
    if len(self.points) == 0:
        return {}
    X = numpy.array([[p.lat, p.lon] for p in self.points])
    # KMeans is run on the rows of the pairwise distance matrix rather than the raw coordinates
    distance_matrix = distance.squareform(distance.pdist(X))
    db = KMeans(n_clusters=k).fit(distance_matrix)
    # re-attach ids (the loop variable is renamed to avoid shadowing the parameter k)
    grouped_points = {}
    for i, label in enumerate(db.labels_):
        logger.debug('idx, label [%s, %s]', i, label)
        if label not in grouped_points:
            grouped_points[label] = []
        point = self.points[i]
        grouped_points[label].append({'id': point.uid, 'lat': point.lat, 'lon': point.lon})
    logger.info('Finished grouping into %d groups.', len(grouped_points))
    return grouped_points
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data.
    It also provides a method for finding the closest cluster center of unseen data.
    """
    ADJECTIVE = 'JJ'

    """ Feature keys used in clustering... """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'

    """ Features not considered for clustering... """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'

    """ Predicted label feature name. """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY,
                             LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]

    def __init__(self, tweet_file_path, no_of_clusters):
        """ The constructor reads the csv file and builds the data matrix. """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from the csv file at "tweet_file_path", extracts features from the tweets and
        returns the list of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns: (key, value) map of all features found.
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor, pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter(ele[0] for ele in set(text_blob.pos_tags) if ele[1] == self.ADJECTIVE))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() + {self.POLARITY_FEATURE_KEY: polarity,
                                             self.SUBJECTIVITY_FEATURE_KEY: subjectivity}.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features (features like user_id which are not relevant for
        building cluster centers) from the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present
        in the file at "tweet_file_path". It returns a list of feature vectors, where each feature
        vector contains only "features_to_include", or all features if "features_to_include" is None.
""" clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix) transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix) self.k_means_estimator.fit(transformed_data_matrix, y=None) return self.__get_predicted_labels(self.data_matrix, features_to_include) @time_it def __get_predicted_labels(self, data_matrix, features_to_include): """ Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The data matrix is modified in place. It returns a new copy of data_matrix with "features_to_include" features. """ feature_names = self.vectorizer.get_feature_names() for feature_vector in data_matrix: row = [0] * len(feature_names) column = range(len(feature_names)) data = map(lambda feature_name:feature_vector[feature_name] if feature_name in feature_vector else 0, feature_names) feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column)))) predicted_label = self.k_means_estimator.predict(feature_csr_matrix) feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0] expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix) if features_to_include: return self.__get_filtered_data_matrix(expanded_data_matrix, features_to_include) else: return expanded_data_matrix @time_it def __get_filtered_data_matrix(self, data_matrix, features_to_include): """ Removes all features except features_to_include """ filtered_data_matrix = [] for feature_vector in data_matrix: filtered_feature_vector = {} for feature_name in features_to_include: filtered_feature_vector[feature_name] = feature_vector[feature_name] filtered_data_matrix.append(filtered_feature_vector) return filtered_data_matrix @time_it def __get_expanded_data_matrix(self, data_matrix): """ Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new modified copy is returned. """ feature_names = self.vectorizer.get_feature_names() expanded_data_matrix = copy.deepcopy(data_matrix) for feature_vector in expanded_data_matrix: for feature_name in feature_names: if feature_name not in feature_vector: feature_vector[feature_name] = 0 return expanded_data_matrix @time_it def predict_labels_for_data(self, file_path, features_to_include=None): """ This function reads the tweets of different users from the file at file_path and assigns the closest cluster center to each user. It returns list of tuples of (user_id,predicted_label,latitude, longitude). """ data_matrix = self.__get_data_matrix_from_file(file_path) return self.__get_predicted_labels(data_matrix, features_to_include)
import numpy
import os
import pickle  # cPickle is Python 2 only; pickle is the Python 3 equivalent
import sys

from sklearn.cluster import KMeans  # sklearn.cluster.k_means_ is a private path removed in newer scikit-learn releases

# Performs K-means clustering and saves the model to a local file
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: {0} sift_file cluster_num output_file".format(sys.argv[0]))
        print("sift_file -- path to the sift file")
        print("cluster_num -- number of clusters")
        print("output_file -- path to save the k-means model")
        exit(1)
    sift_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])
    # Read data
    X = numpy.genfromtxt(sift_file, delimiter=";")
    # Fit model
    estimator = KMeans(n_clusters=cluster_num)
    estimator.fit(X)
    # Dump model
    with open(output_file, "wb") as f:
        pickle.dump(estimator, f)
    print("K-means trained successfully!")
import numpy as np
from sklearn.cluster import KMeans  # formerly sklearn.cluster.k_means_
from sklearn.datasets import make_blobs  # formerly sklearn.datasets.samples_generator

np.random.seed(0)
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.4)

# Draw random initial centroids
indxs = np.arange(1000)
np.random.shuffle(indxs)
centroids = X[indxs[:3]]

# The old `k` keyword is now `n_clusters`; n_init=1 since an explicit init array is given
k_means = KMeans(n_clusters=3, max_iter=1, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels1 = k_means.labels_
k_means_cluster_centers1 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=2, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels2 = k_means.labels_
k_means_cluster_centers2 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=3, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels3 = k_means.labels_
k_means_cluster_centers3 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=4, init=centroids, n_init=1)
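# A small follow-up sketch (not part of the original script): since all runs above start from the
# same centroid array, their cluster indices line up, so the stored labels and centers can be
# compared directly to see how much the solution still moves after one Lloyd iteration.
n_changed = int(np.sum(k_means_labels1 != k_means_labels2))
print("labels changed between iteration 1 and 2:", n_changed)
print("center shift:", np.linalg.norm(k_means_cluster_centers2 - k_means_cluster_centers1))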
def main():
    """CONFIGURATION"""
    num_clusters = 5  # Number of clusters
    random = False  # If True, randomly assign clusters to the states with equal probability. If False, actually compute the clusters.
    working_dir = "/home/jmaxk/proj/geoloc/cluster/fb1/"  # The input working_dir, which has one file per class. Each file contains the results of the linguistic ethnography tool.
    """END CONFIGURATION"""
    if random:
        saveFiles = getSaveFiles(working_dir + 'results/random')
    else:
        saveFiles = getSaveFiles(working_dir + 'results/real')
    clusterFile = saveFiles[0]
    mapFile = saveFiles[1]
    featureIndeces = dict()
    classIndeces = []
    counter = 0
    vecs = []
    # Turn each file into a vector to be clustered.
    for root, dirs, files in os.walk(working_dir):
        for f in files:
            fullpath = os.path.join(root, f)
            if os.path.splitext(fullpath)[1] == '.txt':
                with open(fullpath) as fp:
                    lines = fp.readlines()
                    vec = [0.0] * (len(lines) + 1)
                    for line in lines:
                        featVals = line.split(' ')
                        key = featVals[0]
                        val = featVals[1]
                        if key not in featureIndeces:  # dict.has_key() was removed in Python 3
                            featureIndeces[key] = counter
                            counter = counter + 1
                        index = featureIndeces.get(key)
                        vec[index] = float(val)
                    vecs.append(vec)
                    abbr = os.path.basename(fullpath).split(".")[0]
                    # we only want to save actual states
                    if us.states.lookup(abbr) is not None:
                        st = str(us.states.lookup(abbr).name)
                        classIndeces.append(st)
    # transform data into a numpy array
    mylist = []
    for item in vecs:
        mylist.append(numpy.array(item))
    data = numpy.array(mylist)
    # cluster with kmeans, and save the clusters
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=10, verbose=False)
    raw_results = km.fit_predict(data)
    results = dict(zip(classIndeces, raw_results))
    saveClusters(data, km, clusterFile)  # this doesn't work with random
    # save the map
    if random:
        random_results = dict()
        for key in results:
            random_results[key] = randint(0, 5)
        colors = genColors(random_results)
        saveMap(random_results, colors, mapFile)
    else:
        colors = genColors(results)
        saveMap(results, colors, mapFile)