def answer(test_path):
    import warnings
    warnings.filterwarnings("ignore")
    import time
    t0 = time.time()

    from learning import process_test_data, training_data, training_answers
    # Import from the public sklearn paths (the private k_means_/logistic
    # modules were removed in newer releases).
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression

    test_data = process_test_data(test_path)
    km = KMeans()
    km.fit(training_data, training_answers)
    myNum = km.predict(test_data).item()

    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = ['a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a',
            'a', 'o', 'a', 'o', 'o', 'o', 'a', 'a', 'o', 'a']
    letters = list(letX)  # 1-D targets; a column vector only triggers a warning
    lr = LogisticRegression()
    lr.fit(numbers, letters)
    ans = lr.predict([[myNum]]).item()  # predict expects a 2-D array, not a scalar

    t1 = time.time()
    return [ans, t1 - t0]
def k_way_spectral_clustering():
    x = np.load('q2data.npy')
    A = np.load('AMatrix.npy')
    WeightMatrix = np.zeros((16, 16))
    for i in range(16):
        for j in range(16):
            if A[i][j] == 1:
                WeightMatrix[i][j] = np.exp(-1 * (np.linalg.norm(x[i] - x[j]) ** 2))
            else:
                WeightMatrix[i][j] = 0
    DegreeMatrix = np.sum(WeightMatrix, axis=1)
    # The unnormalized Laplacian is L = D - W with D as a diagonal matrix;
    # subtracting W from the degree *vector* broadcasts incorrectly.
    L = np.diag(DegreeMatrix) - WeightMatrix
    DSquareRoot = np.diag(1.0 / (DegreeMatrix ** 0.5))
    Lnorm = np.dot(np.dot(DSquareRoot, L), DSquareRoot)
    eigvals, eigvecs = np.linalg.eig(Lnorm)
    eigvecs = np.array(eigvecs, dtype=np.float64)
    sortedinds = eigvals.argsort()
    eigvec1, eigvec2, eigvec3, eigvec4 = (eigvecs[:, 10], eigvecs[:, 11],
                                          eigvecs[:, 13], eigvecs[:, 14])
    kmeans = KMeans(n_clusters=3, init='random')
    kmeans.fit(eigvecs)  # note: clusters on the full eigenvector matrix
    components = kmeans.labels_
    return components
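# A minimal sketch (not part of the original function) of the standard k-way
# spectral embedding step: cluster on the eigenvectors belonging to the k
# smallest eigenvalues of the normalized Laplacian, which the function above
# computes (sortedinds) but never applies. Assumes numpy and sklearn are
# available; Lnorm and the cluster count k are placeholders.
import numpy as np
from sklearn.cluster import KMeans

def spectral_embedding_labels(Lnorm, k):
    eigvals, eigvecs = np.linalg.eigh(Lnorm)         # eigh: Lnorm is symmetric
    embedding = eigvecs[:, np.argsort(eigvals)[:k]]  # k smallest eigenvectors
    # Row-normalize the embedding before k-means (Ng-Jordan-Weiss style).
    norms = np.linalg.norm(embedding, axis=1, keepdims=True)
    embedding = embedding / np.where(norms == 0, 1, norms)
    return KMeans(n_clusters=k, n_init=10).fit_predict(embedding)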
def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
    print "ClusterBalancing..."
    indexesPicked = []
    obs1 = self.observations[indexesToPick]
    obs = normalize(obs1, axis=0)
    if len(indexesToPick) != 0:
        if kmeansFlag:
            if len(indexesToPick) < self.numClusters:
                cluster = KMeans(init='k-means++', n_clusters=len(obs), n_init=10)
            else:
                cluster = KMeans(init='k-means++', n_clusters=self.numClusters, n_init=10)
        else:
            # SpectralClustering is the estimator class; the lowercase
            # spectral_clustering function has no fit()/labels_ interface.
            if len(indexesToPick) < self.numClusters:
                cluster = SpectralClustering(n_clusters=len(obs), n_init=10)
            else:
                cluster = SpectralClustering(n_clusters=self.numClusters, n_init=10)
        cluster.fit(obs)
        labels = cluster.labels_
        whenToStop = max(2, stopCount)
        count = 0
        while count != whenToStop:
            cluster_list = range(self.numClusters)
            index = 0
            for j in labels:
                if j in cluster_list:
                    indexesPicked.append(indexesToPick[index])
                    cluster_list.remove(j)
                    count += 1
                    if count == whenToStop:
                        break
                    labels[index] = -1
                if len(cluster_list) == 0:
                    break
                index += 1
    return indexesPicked
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    # eps = 1e-4
    # eps = 0.1
    # eps = 100.0
    # prev_sample = np.array(clf.cluster_centers_, np.float)
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1,
                 n_jobs=-1, tol=eps, max_iter=1)
    # if isinstance(prev_centers, str):
    #     prev_centers = clf.cluster_centers_
    clf.fit(x_array)
    new_centers = clf.cluster_centers_
    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1,
                     n_jobs=-1, tol=eps, max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    # print "k = %s, len centers = %s" % (n_clusters, len(f_values))
    return args, values, centers_list
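# get_distance() is used above but not defined in this snippet. A hedged,
# hypothetical implementation consistent with the convergence test (stop once
# no centroid moves by more than eps) could look like this:
import numpy as np

def get_distance(prev_centers, new_centers):
    """Largest per-centroid displacement between two sets of cluster centers."""
    prev_centers = np.asarray(prev_centers, dtype=float)
    new_centers = np.asarray(new_centers, dtype=float)
    return np.max(np.linalg.norm(new_centers - prev_centers, axis=1))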
def performKmeans(data, n_clusters):
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    return est
def performKmeans(data, n_clusters):
    print "Performing K-Means on data"
    est = KMeans(n_clusters)
    est.fit(data)
    labels = est.labels_
    labels_np = np.array(labels)
    return labels, est
def start_algorithm(self):
    """
    Start clustering the stored tweets.
    :return: list of clusters containing tweets
    """
    vectors = self.vectorize_data()
    kmeans = KMeans(init='k-means++', n_clusters=self.cluster_amount, n_init=10)
    kmeans.fit(vectors)
    return self.cluster_tweet(kmeans.labels_)
def evaluate_kmeans_unsupervised(data, nclusters, k_init=20):
    """
    Clusters data with the k-means algorithm and returns the cluster centroids.
    :param data: points that need to be clustered, as a numpy array
    :param nclusters: total number of clusters
    :param k_init: number of k-means restarts (passed as n_init)
    :return: cluster centers
    """
    kmeans = KMeans(n_clusters=nclusters, n_init=k_init)
    kmeans.fit(data)
    return kmeans.cluster_centers_
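# Hedged usage sketch for evaluate_kmeans_unsupervised above (assumes the
# function is in scope): synthetic blobs stand in for real data, and the
# return value is one centroid per row.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
demo_centers = evaluate_kmeans_unsupervised(X_demo, nclusters=4)
print(demo_centers.shape)  # (4, 2): four centroids in two dimensions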
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with the k-means algorithm and returns a string containing the
    method name and metrics, together with the evaluated cluster centers.
    :param data: points that need to be clustered, as a numpy array
    :param labels: true labels for the given points
    :param nclusters: total number of clusters
    :param method_name: name of the method from which the clustering space originates (only used for printing)
    :return: formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return getClusterMetricString(method_name, labels, kmeans.labels_), kmeans.cluster_centers_
def _centroids(n_clusters: int, points: List[List[float]]) -> List[List[float]]:
    """ Return, for each of the n_clusters k-means centroids, the input point
    closest to it (i.e. cluster representatives drawn from `points`). """
    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)
    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_, points)
    return list(map(list, np.array(points)[closest.tolist()]))
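# Hedged usage sketch for _centroids above (assumes the function and its
# imports -- KMeans, pairwise_distances_argmin_min, numpy -- are in scope).
# Note it returns actual input points nearest to each k-means centroid,
# not the raw centroid coordinates.
demo_points = [[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.1, 4.9]]
print(_centroids(2, demo_points))  # one representative point per cluster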
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets) ** 2
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {}
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(wn.path_similarity(syn_out, syn_in),
                                                       wn.path_similarity(syn_in, syn_out))
            if i % 10000 == 0:
                print i, 'synsets processed out of', len(global_synsets) ** 2, '(', float(i) / t, '%)'
            i += 1

    tuples = [(item[0], item[1].values()) for item in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]

    # Rule of thumb: n = sqrt(len(global_synsets) / 2); KMeans needs an int
    n = int(np.sqrt(len(global_synsets) / 2))
    print "Number of clusters", n
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)

    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
    pprint.pprint(dict(clustering), width=1)

    feature_vector = np.zeros([len(corpus), n])
    for i, comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print i, 'comments processed'
    print feature_vector
def runKMeans(distance_matrix, nClusters, number_of_threads):
    km = KMeans(n_clusters=nClusters, max_iter=100, init='k-means++',
                precompute_distances=True, n_jobs=number_of_threads)
    km.fit(distance_matrix)
    labels = km.labels_
    # k-means never assigns a noise label (-1); this bookkeeping follows the
    # DBSCAN convention, so n_noises will always be 0 here.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)
    print('Number of clusters: ' + str(n_clusters))
    print('Number of noises: ' + str(n_noises))
    return list(labels)
def evaluate_k_means_raw(data, true_labels, n_clusters, k_init):
    """
    Clusters data with the K-Means algorithm and returns clustering accuracy and NMI.
    :param data: points that need to be clustered, as a numpy array
    :param true_labels: true labels for the given points
    :param n_clusters: total number of clusters
    :return: ACC, NMI
    """
    # https://github.com/Datamine/MNIST-K-Means-Clustering/blob/master/Kmeans.ipynb
    # http://johnloeber.com/docs/kmeans.html
    # Lloyd's Algorithm for K-Means Clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init=k_init)
    kmeans.fit(data)
    acc = cluster_acc(true_labels, kmeans.labels_)
    nmi = metrics.normalized_mutual_info_score(true_labels, kmeans.labels_)
    return acc, nmi
def train_k_means(n_clusters, init_type, x_array, y, eps, n_init):
    DIGIT_COUNT = 10
    inertias = []
    iterations = []
    entropys = []
    for i in range(n_init):
        # fill matrix by zero
        n_matrix = np.zeros((n_clusters, DIGIT_COUNT), dtype=np.int)
        if init_type == "random":
            init = "random"
        elif init_type == "k-away":
            init = get_k_away_centers(x_array, n_clusters)
        else:
            raise NotImplementedError
        clf = KMeans(init=init, n_clusters=n_clusters, n_init=1, n_jobs=-1, tol=eps)
        clf.fit(x_array)
        # Q value
        inertias.append(clf.inertia_)
        # iterations number
        iterations.append(clf.n_iter_)
        # labels
        for j in range(len(y)):
            digit = y[j]
            cluster = clf.labels_[j]
            n_matrix[cluster][digit] += 1
        n = float(len(y))
        # print "n_matrix = ", [v for v in n_matrix]
        Hyz = -reduce(
            lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
            [n_matrix[cluster][digit] / n
             for cluster in range(n_clusters)
             for digit in range(DIGIT_COUNT)],
            0.0)
        Hz = -reduce(
            lambda s, p: s + (p * math.log(p, 2) if p > 0 else 0),
            [sum(n_matrix[cluster], 0.0) / n for cluster in range(n_clusters)],
            0.0)
        # print("Hyz = %s" % Hyz)
        # print("Hz = %s" % Hz)
        entropys.append(Hyz - Hz)
    return iterations, inertias, entropys
def create_train_kmeans(data, number_of_clusters=len(codes)):
    # n_jobs is set to -1 to use all available CPU cores. This makes a big
    # difference on an 8-core CPU, especially when the data size gets much
    # bigger. #perfMatters
    k = KMeans(n_clusters=number_of_clusters, n_jobs=-1, random_state=728)

    # Let's do some timings to see how long it takes to train.
    start = time.time()

    # Train it up
    k.fit(data)

    # Stop the timing
    end = time.time()

    # And see how long that took
    print("Training took {} seconds".format(end - start))

    return k
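# Hedged usage sketch for create_train_kmeans above: `codes` is a global the
# default argument relies on, so here the cluster count is passed explicitly;
# `demo_data` is a placeholder 2-D feature matrix.
import numpy as np

demo_data = np.random.rand(500, 64)                # placeholder features
model = create_train_kmeans(demo_data, number_of_clusters=10)
print(model.cluster_centers_.shape)                # (10, 64)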
def test_cifar10():
    (X_train, y_train), (X_test, y_test) = cifar10.load_data()
    X_train = X_train.reshape((50000, 32 * 32 * 3))
    X_test = X_test.reshape((10000, 32 * 32 * 3))
    y_train = y_train.reshape((50000))
    y_test = y_test.reshape((10000))

    distortions = []
    X_ = X_test[y_test == 4]
    K = range(1, 30)
    for k in K:
        kmeanModel = KMeans(n_clusters=k, random_state=84)
        kmeanModel.fit(X_)
        distortions.append(
            sum(np.min(cdist(X_, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_.shape[0])

    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

    #alg = LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84)
    #alg.fit(X_train, y_train)
    #y_pred = alg.predict(X_test)
    #score = accuracy_score(y_test, y_pred)
    #print(score)

    pl = PluralizatorClassifier(
        LogisticRegressionCV(Cs=[1], multi_class='ovr', n_jobs=-1, random_state=84),
        'k-means',
        {0: 3, 1: 3, 2: 3, 3: 3, 4: 3, 5: 3, 6: 3, 7: 3, 8: 3, 9: 3},
        random_state=84, n_jobs=-1)
    pl.fit(X_train, y_train)
    y_pred = pl.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(score)
    return
def run_kmeans(data, label, k=3, fname="../results/kmeans"):
    if len(data) < k:
        return
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 stop_words='english', use_idf=True)
    clean_data = get_clean_data(data)
    X = vectorizer.fit_transform(clean_data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
    km.fit(X)
    print label, np.bincount(km.labels_)
    assert len(km.labels_) == len(data)
    f = open(fname + str(int(label)) + ".csv", 'w')
    f.write("subject\tbody\tcluster_id\n")
    for i in range(len(data)):
        subject, body = data[i]
        subject = " ".join(str(subject).split())
        body = " ".join(str(body).split())
        cluster_id = str(km.labels_[i])
        row = data[i]
        f.write(subject + "\t" + body + "\t" + cluster_id + '\n')
    f.close()
class KMeansImpl():
    def __init__(self, n_clusters=8, init='k-means++', n_init=10, max_iter=300,
                 tol=0.0001, precompute_distances='auto', verbose=0,
                 random_state=None, copy_x=True, n_jobs=None, algorithm='auto'):
        self._hyperparams = {
            'n_clusters': n_clusters,
            'init': init,
            'n_init': n_init,
            'max_iter': max_iter,
            'tol': tol,
            'precompute_distances': precompute_distances,
            'verbose': verbose,
            'random_state': random_state,
            'copy_x': copy_x,
            'n_jobs': n_jobs,
            'algorithm': algorithm
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)

    def predict(self, X):
        return self._wrapped_model.predict(X)
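# Hedged usage sketch for the KMeansImpl wrapper above. SKLModel is assumed to
# be sklearn.cluster.KMeans, as the constructor arguments suggest.
from sklearn.datasets import make_blobs

X_wrap, _ = make_blobs(n_samples=200, centers=3, random_state=0)
impl = KMeansImpl(n_clusters=3).fit(X_wrap)
print(impl.predict(X_wrap)[:10])      # cluster index for the first ten samples
print(impl.transform(X_wrap).shape)   # (200, 3): distances to each center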
def get_data_for_kl_loss(self, encode_output, label_list, n_clusters):
    """
    Returns centroids for the KL-divergence loss.
    :param encode_output: encoder output
    :param label_list: labels for the encoder output
    :param n_clusters: number of clusters
    :return: centroids
    """
    # if self.use_cuda is False:
    #     data = np.copy(encode_output.data)
    #     label = np.copy(label_list.data)
    # else:
    #     data = np.copy(encode_output.data.cpu())
    #     label = np.copy(label_list.data.cpu())
    data = encode_output
    data_len = len(data)
    if data_len < n_clusters:
        n_clusters = data_len
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=self.k_init)
    # Fitting the input data
    kmeans.fit(data)
    # Centroid values
    centroids = kmeans.cluster_centers_
    if self.use_cuda:
        return Variable(torch.from_numpy(centroids).float().cuda())
    return Variable(torch.from_numpy(centroids).float())
def train(self, X):
    clf = KMeans(self.n_cluters)
    s = clf.fit(X)
    return clf, s
                                 y_train, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

print('***********KNN**************')
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

print('***********PCA**************')
pca = PCA()
X_train1 = pca.fit_transform(x_train)
X_validation1 = pca.transform(x_test)
#print(pca.explained_variance_ratio_)  # shows the variance value for each component
print('************************')
print("\n\n", X_train1)

print('*******KMEANS*************')
kmeans = KMeans(n_clusters=5)
kmeans.fit(X, Y)
#print(kmeans.cluster_centers_)
print(pd.crosstab(Y, kmeans.labels_))
        else:
            tweet_topic.append(lda_model(text))
            users_cluster.append(users)
            users = []
            count = count + 1
            # for new users
            users.append(cluster[2])
            text = cluster[1]
    tweet_topic.append(lda_model(text))
    return cluster_data, tweet_topic, users_cluster


print('getting dataframe')
data = get_dataframe()
user_handle = list(data['user_handle'])

print('getting final matrix')
final_matrix, count, tweet_data = get_vector(data)
joblib.dump(final_matrix, home + '/../../data/final_matrix.txt')

print('Applying clustering')
sse = dict()
for k in range(1, 25):
    kmeans = KMeans(n_clusters=k, max_iter=100, random_state=0)
    clus = kmeans.fit(final_matrix)
    sse[k] = clus.inertia_
print('20 clusters are ready')

cluster_data, tweet_topic, users_cluster = get_cluster_kmeans(count)
joblib.dump(tweet_topic, home + '/../../data/tweet_topic.txt')
joblib.dump(users_cluster, home + '/../../data/users_cluster.txt')
joblib.dump(sse, home + '/../../data/sse.txt')
import numpy
import os
from sklearn.cluster import KMeans  # public import path
import cPickle
import sys

# Performs K-means clustering and saves the model to a local file
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: {0} sift_file cluster_num output_file".format(sys.argv[0])
        print "sift_file -- path to the sift file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    sift_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    # Read data
    X = numpy.genfromtxt(sift_file, delimiter=";")

    # Fit model
    estimator = KMeans(n_clusters=cluster_num)
    estimator.fit(X)

    # Dump model
    with open(output_file, "wb") as f:
        cPickle.dump(estimator, f)
    print "K-means trained successfully!"
#!/bin/python

import numpy
import os
from sklearn.cluster import KMeans  # public import path
import cPickle
import sys
import pandas as pd

# Performs K-means clustering and saves the model to a local file
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    df = pd.read_csv(mfcc_csv_file, delimiter=';', header=None)
    kmeans = KMeans(n_clusters=cluster_num, max_iter=300, verbose=1, n_jobs=-1)
    model = kmeans.fit(df)
    cPickle.dump(model, open(output_file, "wb"))
    print "K-means trained successfully!"
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs  # samples_generator is the old private path

np.random.seed(0)
batch_size = 45
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std=0.4)

# Draw random initial centroids from the data
indxs = np.arange(1000)
np.random.shuffle(indxs)
centroids = X[indxs[:3]]

# The KMeans keyword is n_clusters (the old `k` argument was removed);
# n_init=1 because the initial centroids are fixed.
k_means = KMeans(n_clusters=3, max_iter=1, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels1 = k_means.labels_
k_means_cluster_centers1 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=2, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels2 = k_means.labels_
k_means_cluster_centers2 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=3, init=centroids, n_init=1)
k_means.fit(X)
k_means_labels3 = k_means.labels_
k_means_cluster_centers3 = k_means.cluster_centers_

k_means = KMeans(n_clusters=3, max_iter=4, init=centroids, n_init=1)
k_means.fit(X)
import pandas
from sklearn.cluster import KMeans  # public import path
import _pickle as cPickle
import sys

# Performs K-means clustering and saves the model to a local file
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of cluster")
        print("output_file -- path to save the k-means model")
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    #data = numpy.genfromtxt(mfcc_csv_file, delimiter=";")
    #data = numpy.loadtxt(mfcc_csv_file, delimiter=";")
    #data = pandas.io.parsers.read_csv(mfcc_csv_file, sep=";")
    data = pandas.read_csv(mfcc_csv_file, sep=';')
    #data = numpy.genfromtxt(mfcc_csv_file, dtype=numpy.float64, delimiter=";")

    model = KMeans(n_clusters=cluster_num, n_jobs=5)
    model.fit(data)

    cPickle.dump(model, open(output_file, 'wb'))
    print("K-means trained successfully!")
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0]))
        print("mfcc_csv_file -- path to the mfcc csv file")
        print("cluster_num -- number of cluster")
        print("output_file -- path to save the k-means model")
        exit(1)

    # read cmd line args
    mfcc_csv_file = sys.argv[1]
    cluster_num = int(sys.argv[2])
    output_file = sys.argv[3]

    # load mfcc features
    mfcc_features = numpy.genfromtxt(mfcc_csv_file, dtype=numpy.float32,
                                     delimiter=";")

    # create and execute k-means clustering
    km_model = KMeans(n_clusters=cluster_num, n_jobs=2)
    km_model.fit(mfcc_features)
    print("K-means trained successfully!")

    # save model
    out_fd = open(output_file, "wb")
    cPickle.dump(km_model, out_fd)  # cPickle.HIGHEST_PROTOCOL needed?
    out_fd.close()
    print("K-means saved successfully!")
# Performs K-means clustering and saves the model to a local file
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    # kmeans code here =====
    #mfcc_array = np.loadtxt(mfcc_csv_file, delimiter=';', dtype='float64')
    mfcc_array = pd.read_csv(mfcc_csv_file, sep=';', header=None, dtype='float64')
    kmeans = KMeans(n_clusters=cluster_num)
    kmeans.fit(mfcc_array)

    # with open(output_file, 'wb') as fp:
    #     cPickle.dump(kmeans, fp)
    cPickle.dump(kmeans, open(output_file, "wb"))
    print "K-means trained successfully!"
class KMeansEstimator:
    """ This class reads the tweets of users from a file and builds cluster
    centers on that data. It also provides a method for finding the closest
    cluster center for unseen data. """

    ADJECTIVE = 'JJ'

    """ Feature keys used in clustering... """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'

    """ Features not considered for clustering... """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'

    """ Predicted label feature name. """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY,
                             LONGITUDE_FEATURE_KEY, LABEL_FEATURE_KEY]

    def __init__(self, tweet_file_path, no_of_clusters):
        """ The constructor reads the csv file and builds the data matrix. """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """ Reads tweets from the csv file at "tweet_file_path", extracts features
        from the tweets and returns the list of all feature vectors. """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s", row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """ This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet.
          Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns: (key, value) map of all features found.
        """
        text_blob = TextBlob(tweet_text, np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(Counter(ele[0] for ele in set(text_blob.pos_tags)
                                     if ele[1] == self.ADJECTIVE))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(adjective_map.items() +
                    {self.POLARITY_FEATURE_KEY: polarity,
                     self.SUBJECTIVITY_FEATURE_KEY: subjectivity}.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """ This method removes unnecessary features (features like user_id which
        are not relevant for building cluster centers) from the data matrix and
        returns a copy of the data matrix. """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """ This function performs k-means clustering with "no_of_clusters" clusters
        of the data present in the file at "tweet_file_path". It returns a list of
        feature vectors, where each feature vector contains only
        "features_to_include", or all features if "features_to_include" is None. """
        clustering_data_matrix = self.__get_clustering_data_matrix(self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(clustering_data_matrix)
        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix, features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """ Finds the nearest cluster for all data points and adds a new label
        feature to all feature vectors of the data matrix. The data matrix is
        modified in place. It returns a new copy of data_matrix with
        "features_to_include" features. """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(lambda feature_name: feature_vector[feature_name]
                       if feature_name in feature_vector else 0,
                       feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]
        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """ Removes all features except features_to_include. """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """ Adds new keys for missing features to all feature vectors of
        data_matrix. The data matrix is not modified; a new modified copy is
        returned. """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """ This function reads the tweets of different users from the file at
        file_path and assigns the closest cluster center to each user. It returns
        a list of tuples of (user_id, predicted_label, latitude, longitude). """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
    v0, v1,
    arrowprops=arrowprops,
)
plt.text(v1[0], v1[1], "PC2")

# It is important to show the axes with equal scale!
# Otherwise you cannot tell from the figure that the PCs are orthogonal to each other.
plt.axis("equal")

plt.figure()
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.2)
# since PC1 is on the first axis
plt.scatter(X_new[:, 0], np.zeros(X_new[:, 0].shape), alpha=0.1, color='r')
# Let's also draw the projection lines
for i in range(len(X_new[:, 0])):
    plt.plot([X_new[i, 0], X_new[i, 0]], [X_new[i, 1], 0],
             color="deeppink", alpha=0.1)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.axis("equal")

from sklearn.cluster import KMeans  # public import path

kmeans = KMeans(n_clusters=3)
kmeans.fit(x, y)
print(kmeans.cluster_centers_)
print(pd.crosstab(y, kmeans.labels_))
import numpy as np
import os
from sklearn.cluster import KMeans  # public import path
import cPickle
import sys
import pickle

# Performs K-means clustering and saves the model to a local file
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print "Usage: {0} mfcc_csv_file cluster_num output_file".format(
            sys.argv[0])
        print "mfcc_csv_file -- path to the mfcc csv file"
        print "cluster_num -- number of cluster"
        print "output_file -- path to save the k-means model"
        exit(1)

    mfcc_csv_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    mfcc_vectors = np.genfromtxt(mfcc_csv_file, delimiter=";")
    kmeans_model = KMeans(n_clusters=cluster_num, init='k-means++',
                          n_init=10, verbose=1)
    kmeans_model.fit(mfcc_vectors)

    pickle.dump(kmeans_model, open(output_file + '.pickle', 'wb'))
    print "K-means trained successfully!"