def spectral_clustering(A, nb_clusters, laplacian_normalization=None, algo=None):
    """Compute the cluster assignment from the spectral clustering algorithm.

    Steps:
    * Compute the Laplacian
    * Compute the k smallest eigenvalues and their associated eigenvectors
    * Train a k-means on these vectors
    * Apply this k-means to the Laplacian
    """
    if algo not in ['sph', None]:
        raise ValueError('Algorithm {} unknown'.format(algo))
    L = get_laplacian(A, laplacian_normalization)
    L = scipy.sparse.csr_matrix(L, dtype=np.float64)
    v, w = eigsh(L, nb_clusters, which='SM')
    if algo is None:
        km = KMeans(n_clusters=nb_clusters)
        km.fit(np.transpose(w))
        clusters = km.predict(L)
    elif algo == 'sph':
        # Spherical variant: cluster on the unit sphere with cosine distance.
        clusterer = KMeansClusterer(nb_clusters,
                                    distance=nltk.cluster.util.cosine_distance,
                                    repeats=25)
        clusterer.cluster(np.transpose(w), True)
        vectors = [np.transpose(L[i, :].toarray()[0]) for i in range(0, L.shape[1])]
        clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
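# A minimal usage sketch (not from the original source): it assumes get_laplacian
# and the imports used above are in scope, and builds a toy adjacency matrix with
# two obvious components that should land in separate clusters.
import numpy as np

toy_A = np.array([[0, 1, 1, 0, 0, 0],
                  [1, 0, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0, 0],
                  [0, 0, 0, 0, 1, 1],
                  [0, 0, 0, 1, 0, 1],
                  [0, 0, 0, 1, 1, 0]], dtype=np.float64)
labels = spectral_clustering(toy_A, nb_clusters=2, algo='sph')
print(labels)  # e.g. [0, 0, 0, 1, 1, 1] (cluster ids may be permuted)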
def nltk_clustering(n, filename):
    global vectors
    global names
    global repeats
    # Clustering
    print("Begin clustering, n = {:d}...".format(n))
    clusterer = KMeansClusterer(n, cosine_distance, repeats=repeats)
    clustered = clusterer.cluster(vectors, assign_clusters=True, trace=False)
    clustered = np.array(clustered)
    index = sorted(clustered)
    # print(clustered.argsort())
    names = list(names[clustered.argsort()])
    # Write the result to a file, one block per cluster.
    print("Saving result to file...")
    output = filename[:-4] + "_" + str(n) + "_clustered.txt"
    with open(output, "w") as f:
        current_idx = None
        for itr, idx in zip(names, index):
            if current_idx != idx:
                current_idx = idx
                f.write("\nCluster {:d} (description: )\n".format(current_idx))
            f.write(itr + "\n")
    # print("Clustered result saved in {0}".format(output))
def clusterer_nltk_kmeans(X, n_clusters):
    # "_args": [{"type": "numpy.ndarray", "dtype": "float32"}],
    # "_return": [{"type": "numpy.ndarray", "dtype": "int32"}]
    # In this case we want to try different numbers of clusters, so it is a parameter.
    import nltk
    import numpy as np
    from nltk.cluster.kmeans import KMeansClusterer
    print('clusterer_nltk_kmeans')
    clusterAlgLabelAssignmentsNK = None
    # X = XY[0]
    cmtVectors = X  # XY[1]
    if type(cmtVectors) is np.ndarray and len(cmtVectors) > 0:
        # dt = np.dtype(cmtVectors)
        dt = cmtVectors.dtype
        if dt.type is np.float32 or dt.type is np.float64:
            clusterAlgNK = KMeansClusterer(
                n_clusters,
                distance=nltk.cluster.util.cosine_distance,
                repeats=25,
                avoid_empty_clusters=True)
            clusterAlgLabelAssignmentsNK = clusterAlgNK.cluster(
                cmtVectors, assign_clusters=True)
    XY = (X, clusterAlgLabelAssignmentsNK)
    return XY
def nltk_manhattan_kmeans(encoded_img):
    from scipy.spatial.distance import cityblock
    from nltk.cluster.kmeans import KMeansClusterer
    kclusterer = KMeansClusterer(2, distance=cityblock, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)
    print_labels(assigned_clusters)
def nltk_euclidean_kmeans(encoded_img):
    from nltk.cluster.util import euclidean_distance
    from nltk.cluster.kmeans import KMeansClusterer
    kclusterer = KMeansClusterer(2, distance=euclidean_distance, repeats=10)
    assigned_clusters = kclusterer.cluster(encoded_img, assign_clusters=True)
    print_labels(assigned_clusters)
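# A small self-contained sketch (not from the original source) of the contract the
# two helpers above rely on: KMeansClusterer accepts any callable mapping two 1-D
# vectors to a non-negative scalar. print_labels is replaced by a plain print here.
import numpy as np
from nltk.cluster.kmeans import KMeansClusterer

def chebyshev_distance(a, b):
    # L-infinity distance between two vectors.
    return float(np.max(np.abs(a - b)))

points = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.9]])
kclusterer = KMeansClusterer(2, distance=chebyshev_distance, repeats=10)
print(kclusterer.cluster(points, assign_clusters=True))  # e.g. [0, 0, 1, 1]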
def spherical_clustering_from_adjency(A, nb_clusters):
    """Spectral clustering with spherical k-means."""
    A = scipy.sparse.csr_matrix(A, dtype=np.float64)
    v, w = eigsh(A, nb_clusters, which='LM')
    clusterer = KMeansClusterer(nb_clusters,
                                distance=nltk.cluster.util.cosine_distance,
                                repeats=25)
    clusterer.cluster(np.transpose(w), True)
    vectors = [np.transpose(A[i, :].toarray()[0]) for i in range(0, A.shape[1])]
    clusters = [clusterer.classify(vector) for vector in vectors]
    return clusters
def new_cluster(filepath):
    NUM_CLUSTERS = 4
    data = get_data(filepath)
    # Chebyshev (L-infinity) distance between joint vectors; np.abs keeps the
    # value non-negative, as a distance function must be.
    kclusterer = KMeansClusterer(NUM_CLUSTERS,
                                 distance=lambda a, b: np.max(np.abs(a - b)),
                                 repeats=1000)
    labels = kclusterer.cluster(data, assign_clusters=True)
    print("Showing the cluster results")
    for cluster_id in range(NUM_CLUSTERS):
        for i in range(len(data)):
            if labels[i] == cluster_id:
                print("Joint:", i + 1, " Joint values:", data[i], " Cluster id:", cluster_id)
def Kmeans(self, vocabulary, vectors, n_cluster):
    """K-means clustering based on the cosine distance between word2vec vectors."""
    kclusterer = KMeansClusterer(
        n_cluster,
        distance=nltk.cluster.util.cosine_distance,
        repeats=10,
        avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    # Group words by their assigned cluster id.
    dic = defaultdict(list)
    for c, w in zip(assigned_clusters, vocabulary):
        dic[c].append(w)
    return assigned_clusters, dic
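# A hedged usage sketch (not from the original source): obj stands in for an
# instance of whatever class owns Kmeans, and toy 2-D "word vectors" stand in for
# real word2vec output so the grouped dict is easy to eyeball.
import numpy as np

words = ['cat', 'dog', 'car', 'bus']
vecs = np.array([[1.0, 0.1], [0.9, 0.2], [0.1, 1.0], [0.2, 0.9]])
assigned, groups = obj.Kmeans(words, vecs, n_cluster=2)  # obj: hypothetical instance
print(groups)  # e.g. {0: ['cat', 'dog'], 1: ['car', 'bus']}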
def ClusterItems(data_file, items_bias_file, index_file, clusters_file, centroids_file):
    data = np.genfromtxt(data_file)
    popular_items = np.genfromtxt(index_file).astype('int')
    data = data[popular_items]
    items_bias = np.genfromtxt(items_bias_file)
    # Keep items whose bias magnitude is below 0.2.
    important_items = np.where(np.abs(items_bias[popular_items]) < 0.2)[0]
    kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=cosine_distance)
    print(NUM_CLUSTERS, important_items.shape)
    print("end", data.shape)
    clusters = kclusterer.cluster(data[important_items], assign_clusters=True)
    np.savetxt(centroids_file, kclusterer.means())
    np.savetxt(clusters_file, clusters)
def main():
    getFiles()
    tf_idf()
    num_clusters = int(sys.argv[2])
    kclusterer = KMeansClusterer(num_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=25)
    assigned_clusters = kclusterer.cluster(wordvec, assign_clusters=True)
    # Group file names by their assigned cluster id.
    clustersDict = {}
    for i in range(num_clusters):
        clustersDict[i] = []
    for i in range(len(assigned_clusters)):
        clustersDict[assigned_clusters[i]].append(fileList[i])
    printClustersInFormat(clustersDict)
def cluster(clusterType, vectors, y):
    if clusterType == "KMeans":
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif clusterType == "GMM":
        GMM = GaussianMixture(n_components=NUM_CLUSTERS)
        assigned_clusters = GMM.fit_predict(vectors)
    elif clusterType == "SVM":
        classifier = SVC(kernel='rbf', gamma='auto', random_state=0)
        # cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
    elif clusterType == "T2VH":
        ret = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        children = ret[0]
        n_leaves = ret[2]
        assigned_clusters = hierarchical._hc_cut(NUM_CLUSTERS, children, n_leaves)
    elif clusterType == "RandomForest":
        classifier = RandomForestClassifier()
        # cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters = classifier.predict(vectors)
    elif clusterType == "DecisionTree":
        classifier = DecisionTreeClassifier()
        # cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters = classifier.predict(vectors)
    elif clusterType == "LogisticRegression":
        classifier = sklearn.linear_model.LogisticRegression()
        # cross-validation
        assigned_clusters = cross_validation(classifier, vectors, y)
        # classifier.fit(vectors, y)
        # assigned_clusters = classifier.predict(vectors)
    else:
        print(clusterType, " is not a predefined cluster type.")
        return
    return assigned_clusters
def cluster(folderName, vectorsize, clusterType):
    corpus = loadXES.get_doc_XES_tagged(folderName + '.xes')
    print('Data Loading finished, ', str(len(corpus)), ' traces found.')
    model = gensim.models.Doc2Vec.load('output/' + folderName + 'T2VVS' + str(vectorsize) + '.model')

    vectors = []
    NUM_CLUSTERS = 5
    print("inferring vectors")
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        vectors.append(inferred_vector)
    print("done")

    if clusterType == "KMeans":
        kclusterer = KMeansClusterer(
            NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
        assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    elif clusterType == "HierWard":
        ward = AgglomerativeClustering(n_clusters=NUM_CLUSTERS, linkage='ward').fit(vectors)
        assigned_clusters = ward.labels_
    elif clusterType == "OCSVM":
        ocsvm = OneClassSVM()
        assigned_clusters = ocsvm.fit_predict(vectors)
    else:
        print(clusterType,
              " is not a predefined cluster type. Please use 'KMeans' or 'HierWard', or create a definition for ",
              clusterType)
        return

    trace_list = loadXES.get_trace_names(folderName + ".xes")
    clusterResult = {}
    for doc_id in range(len(corpus)):
        clusterResult[trace_list[doc_id]] = assigned_clusters[doc_id]

    resultFile = open('output/' + folderName + 'T2VVS' + str(vectorsize) + clusterType + '.csv', 'w')
    for doc_id in range(len(corpus)):
        resultFile.write(trace_list[doc_id] + ',' + str(assigned_clusters[doc_id]) + "\n")
    resultFile.close()
    print("done with ", clusterType, " on event log ", folderName)
class ClusteringPairwise():

    def __init__(self, users_vecs_train_file, centroid_file, clustering_file,
                 num_clusters, n_iteration, mode):
        self.mode = mode
        self.num_clusters = num_clusters
        self.users = np.genfromtxt(users_vecs_train_file)
        self.tree = LasyTree(np.arange(self.users.shape[0]))
        self.centroids = np.genfromtxt(centroid_file)
        clusters_ = np.genfromtxt(clustering_file).astype('int')
        # Invert the flat assignment array into {cluster id: [item indices]}.
        self.clusters = {}
        for i in range(num_clusters):
            self.clusters[i] = []
        for i in range(len(clusters_)):
            self.clusters[clusters_[i]].append(i)
        self.n_iteration = n_iteration
        self.kclusterer = KMeansClusterer(num_clusters, distance=cosine_distance,
                                          initial_means=list(self.centroids))

    def RecieveQuestions(self, item_vecs, user, user_estim, n_points, item_bias, ratings):
        clusters_ = [self.kclusterer.classify(item) for item in item_vecs]
        clusters = {}
        for i in range(self.num_clusters):
            clusters[i] = []
        for i in range(len(clusters_)):
            clusters[clusters_[i]].append(i)
        return AllAlgorithm(self.users, self.n_iteration, self.centroids, item_vecs,
                            item_bias, user, clusters, self.tree, self.mode, ratings)
def clustering(dataframe, repeats, myStopwords):
    num_clusters = 5
    # Define vectorizer parameters.
    tfidf_vectorizer = TfidfVectorizer(stop_words=myStopwords)
    # Only process the content, not the title.
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataframe["Content"])
    # Convert it to an array.
    tfidf_matrix_array = tfidf_matrix.toarray()
    # Run k-means with cosine distance as the metric.
    kclusterer = KMeansClusterer(num_clusters, distance=cosine_distance, repeats=repeats)
    assigned_clusters = kclusterer.cluster(tfidf_matrix_array, assign_clusters=True)
    # cluster_size counts how many elements each cluster contains.
    cluster_size = [0, 0, 0, 0, 0]
    # Create a 5x5 array and fill it with zeros.
    matrix = [[0 for x in range(5)] for y in range(5)]
    # For every category
    for category in categories:
        # For every article
        for row in range(0, len(assigned_clusters)):
            # Compare the cluster number with the category number.
            if assigned_clusters[row] == categories.index(category):
                ind = categories.index(dataframe.iloc[row, 4])  # positional access (.ix is deprecated)
                matrix[categories.index(category)][ind] += 1
    # Count how many elements each cluster contains.
    for row in range(0, len(assigned_clusters)):
        cluster_size[assigned_clusters[row]] += 1
    for x in range(5):
        for y in range(5):
            # Calculate the frequency.
            matrix[x][y] /= cluster_size[x]
            # Only keep the first 2 decimal digits.
            matrix[x][y] = format(matrix[x][y], '.2f')
    # Output to a .csv file.
    out_file = open("output/clustering_KMeans.csv", 'w')
    wr = csv.writer(out_file, delimiter="\t")
    newCategories = list(categories)  # copy, so the shared categories list is not mutated
    newCategories.insert(0, "\t")
    wr.writerow(newCategories)
    for x in range(5):
        newMatrix = matrix[x]
        clusterName = "Cluster " + str(x + 1)
        newMatrix.insert(0, clusterName)
        wr.writerow(matrix[x])
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.array([False for i in range(len(lbls))])
    for c in clusters:
        mask |= lbls == c
    subpipe, results = data_pipeline(df[mask])
    # Use cosine similarity! NLTK clustering implementation.
    # KMeans cluster object as carrier for consistency.
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters,
                                 distance=nltk.cluster.util.cosine_distance,
                                 repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    # Assign new cluster labels and cluster centroids.
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    return subpipe, subcl, results, df[mask]
def cluster_docs(self):
    vectors = []
    used_lines = []
    for doc, id in self.es_docs():
        tokens = text_cleaner.clean_tokens(doc)
        if tokens != 'NC' and len(tokens) > 200:
            used_lines.append(tokens)
            vectors.append(self.model.infer_vector(tokens))
    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
    assigned_clusters = kclusterer.cluster(vectors, assign_clusters=True)
    print("done")
class kmeans_cosine(object):

    def __init__(self, k):
        self.k = k
        self.model = KMeansClusterer(k,
                                     distance=nltk.cluster.util.cosine_distance,
                                     repeats=25)

    def build(self, X, p):
        """Cluster X and report which rows share a cluster with point p."""
        data = scipy.sparse.csr_matrix(X).toarray()
        kclusters = np.array(self.model.cluster(data, assign_clusters=True))
        prediction = self.model.classify(p)
        cluster_id = kclusters == prediction
        return cluster_id, prediction

    def save(self, filename="model2.pkl"):
        """Pickle the fitted clusterer; pickle requires a binary file handle."""
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
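# A minimal sketch of how kmeans_cosine might be driven (not from the original
# source); the data and query point are made up, and numpy/scipy/nltk/pickle are
# assumed imported at module level as the class requires.
import numpy as np

km = kmeans_cosine(k=2)
X = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
mask, label = km.build(X, np.array([0.95, 0.05]))
print(label)  # cluster id assigned to the query point
print(mask)   # boolean mask of rows of X in the same cluster as the query point
km.save('model2.pkl')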
def get_cluster(tfidf_arr, k):
    """K-means clustering.

    :param tfidf_arr: array of TF-IDF vectors, one per document
    :param k: number of clusters
    :return: None; writes one "index<TAB>cluster" line per document
    """
    # Split into k clusters, using cosine distance.
    kmeans = KMeansClusterer(num_means=k, distance=cosine_distance,
                             avoid_empty_clusters=True)
    kmeans.cluster(tfidf_arr)
    # Get the cluster assignment for each vector.
    kinds = pd.Series([kmeans.classify(i) for i in tfidf_arr])
    with open('/you_filed_algos/prod_kudu_data/ClusterText.txt', 'a+', encoding='utf-8') as fw:
        for i, v in kinds.items():
            fw.write(str(i) + '\t' + str(v) + '\n')
def __init__(self, model):
    """
    @param model: (type=Word2Vec model)
    """
    self.model = model  # store the Word2Vec model object in case of future use
    self.word_to_vec = {word: model.wv[word] for word in model.wv.vocab}  # mapping from word strings to vectors
    self.vectors = [model.wv[word] for word in model.wv.vocab]
    # The object that will cluster our vectors; num_means will eventually be parameterized.
    clusterer = KMeansClusterer(num_means=5, distance=cosine_distance)
    clusterer.cluster_vectorspace(self.vectors)
    self.central_words = []
    # Find the closest word to each centroid.
    for centroid in clusterer._means:
        closest = None
        for word in self.word_to_vec:
            vector = self.word_to_vec[word]
            if not closest or cosine_distance(vector, centroid) < cosine_distance(closest[1], centroid):
                closest = (word, vector)
        self.central_words.append(closest)
    self.centroids = clusterer._means
class KMeansClusters(BaseEstimator, TransformerMixin):

    def __init__(self, k=7) -> None:
        self.k = k
        self.distance = nltk.cluster.cosine_distance
        self.model = KMeansClusterer(self.k, self.distance, avoid_empty_clusters=True)

    def fit(self, data, labels=None):
        return self

    def transform(self, data):
        return self.model.cluster(data, assign_clusters=True)
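# A short usage sketch (not from the original source): the transformer follows the
# scikit-learn fit/transform convention and returns one cluster id per input
# vector; the toy document vectors below are made up for illustration.
import numpy as np

docs = np.array([[1.0, 0.0, 0.0], [0.9, 0.1, 0.0], [0.0, 0.0, 1.0],
                 [0.0, 0.1, 0.9], [0.5, 0.5, 0.0], [0.4, 0.6, 0.0],
                 [1.0, 0.1, 0.1]])
km = KMeansClusters(k=2)
print(km.fit(docs).transform(docs))  # e.g. [0, 0, 1, 1, 0, 0, 0]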
def bbox_iou(x1, y1, w1, h1, x2, y2, w2, h2):
    # Boxes are given as centre + width/height; compute the intersection corners.
    x_a = torch.max(x1 - w1 / 2.0, x2 - w2 / 2.0)
    y_a = torch.max(y1 - h1 / 2.0, y2 - h2 / 2.0)
    x_b = torch.min(x1 + w1 / 2.0, x2 + w2 / 2.0)
    y_b = torch.min(y1 + h1 / 2.0, y2 + h2 / 2.0)
    intersection = torch.clamp(x_b - x_a, min=0) * torch.clamp(y_b - y_a, min=0)
    union = w1 * h1 + w2 * h2 - intersection
    return intersection / (union + 1e-6)


kclusterer = KMeansClusterer(args.num_bbox,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(data, assign_clusters=True)
kmeans_wh = KMeans(n_clusters=args.num_bbox)
kmeans_wh.fit(train_wh)
bbox_priors = kmeans_wh.cluster_centers_
np.save('priors.npy', bbox_priors)
bbox_priors = torch.from_numpy(bbox_priors).cuda()

# Set up the network
features = DenseNet(growth_rate=8, block_config=(4, 8, 16, 32),
                    activation=nn.LeakyReLU(inplace=True), input_channels=3)
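# A quick sanity check for bbox_iou (not from the original source): identical
# boxes should give an IoU of ~1, disjoint boxes an IoU of 0.
import torch

x = torch.tensor([0.5]); y = torch.tensor([0.5])
w = torch.tensor([0.2]); h = torch.tensor([0.2])
print(bbox_iou(x, y, w, h, x, y, w, h))              # ~1.0
print(bbox_iou(x, y, w, h, x + 1.0, y + 1.0, w, h))  # 0.0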
# (Snippet starts inside the loop iterating over scored (score, phrase) pairs.)
    phrase = ii[1]
    if score < 0.7:
        break
    try:
        arr = numpy.append(arr, numpy.reshape(model.wv.word_vec(phrase), (1, 100)), axis=0)
    except KeyError:
        pass
    else:
        embedded_phrases.append(phrase)

print('number of sample points:', len(embedded_phrases))
kmeans = KMeansClusterer(6, nltk.cluster.util.cosine_distance)
clusters = kmeans.cluster(arr, assign_clusters=True)
centers = kmeans.means()
result = {0: [], 1: [], 2: [], 3: [], 4: [], 5: []}
for i in range(len(clusters)):
    result[clusters[i]].append([
        nltk.cluster.util.cosine_distance(centers[clusters[i]], arr[i]),
        embedded_phrases[i]
    ])
# Within each cluster, sort phrases by distance from the centroid and write one file per cluster.
for k in result:
    sorted_result = sorted(result[k], reverse=True)
    final_result = '\n'.join(['%.10f' % x[0] + '\t' + x[1] for x in sorted_result])
    with open('cluster' + str(k) + '.txt', 'w+') as f:  # with-block ensures the file is closed
        f.write(final_result)
lines = open(datacfg).readlines()
images = []
for line in lines:
    if line.split(' ')[0] == 'train':
        valid_path = line.strip().split(' ')[-1]
        if valid_path[0] != '/':
            valid_path = workspace + valid_path
lists = open(valid_path).readlines()
images = [x.strip() for x in lists]

# Collect the (width, height) pair of every ground-truth box.
bboxes = []
for image in images:
    label = image.replace('.jpg', '.txt')
    lines = open(label).readlines()
    for line in lines:
        splitline = line.split(' ')
        # bboxes.append([float(x)*13. for x in splitline[-2:]])
        bboxes.append([float(splitline[-2]) * 1., float(splitline[-1]) * 1.])
print(len(bboxes))
# samples = random.sample(bboxes, 15000)
# print(len(samples))
bboxes = np.array(bboxes)
# samples = np.array(samples)
# print(samples.shape)
kclusterer = KMeansClusterer(5, negIoU, repeats=1)
# clusters = kclusterer.cluster(samples, True)
clusters = kclusterer.cluster(bboxes, True)
centroids = kclusterer.means()
print(np.array(centroids) / np.array((1., 1.)))
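# negIoU is not defined in the snippet above; a plausible sketch (an assumption,
# not the original definition) is the 1 - IoU distance commonly used for
# YOLO-style anchor clustering, where boxes are (w, h) pairs sharing one centre.
def negIoU(box_a, box_b):
    w_a, h_a = box_a
    w_b, h_b = box_b
    intersection = min(w_a, w_b) * min(h_a, h_b)
    union = w_a * h_a + w_b * h_b - intersection
    return 1.0 - intersection / (union + 1e-10)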
# Create counter and IDF vectors.
count_vect = TfidfVectorizer(stop_words=stop_words)
count_vect.fit(df['Content'])  # 12266
X_train_counts = count_vect.transform(df['Content'])

# Reduce the size of the vectors with LSI.
svd = TruncatedSVD(n_components=5)
X_train_counts = svd.fit_transform(X_train_counts)

# Clustering
kclusterer = KMeansClusterer(num_means=5, distance=cosine_distance,
                             repeats=25, avoid_empty_clusters=True)
clusters = kclusterer.cluster(X_train_counts, assign_clusters=True)
# print "Clusters:\n", clusters
# print "Means", kclusterer.means()

# Prepare the results matrix.
categories_map = {
    'Politics': 0,
    'Business': 1,
    'Film': 2,
    'Technology': 3,
    'Football': 4
}
def main(argv):
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='input file', required=True)
    parser.add_argument('-s', '--step', help='step', required=True)
    parser.add_argument('-ik', '--init_k', help='K initial', required=True)
    parser.add_argument('-fk', '--final_k', help='K final', required=True)
    parser.add_argument('-od', '--distortion_out_file', help='elbow distortion graph file', required=True)
    parser.add_argument('-os', '--silhouette_out_file', help='elbow silhouette graph', required=True)
    parser.add_argument('-pca', '--pca', help='with pca', action='store_true')
    parser.add_argument('-k_pca', '--k_pca', help='k pca')
    ARGS = parser.parse_args()

    descriptors = load_dataset(ARGS.input_file)
    if ARGS.pca:
        print("With pca")
        pca = PCA(n_components=int(ARGS.k_pca))
        descriptors = pca.fit_transform(descriptors)

    ks = []
    distortions = []
    silhouettes = []
    for k in range(int(ARGS.init_k), int(ARGS.final_k), int(ARGS.step)):
        # kmeanModel = KMeans(n_clusters=k, init='k-means++')
        # kmeanModel.fit(descriptors)
        # predictions = kmeanModel.predict(descriptors)
        # cluster_centers_ = kmeanModel.cluster_centers_
        kclusterer = KMeansClusterer(k, distance=nltk.cluster.util.cosine_distance)
        predictions = kclusterer.cluster(descriptors, assign_clusters=True)
        predictions = np.array(predictions)
        cluster_centers_ = np.array(kclusterer.means())
        distortion = sum(
            np.min(distance.cdist(descriptors, cluster_centers_, 'cosine'),
                   axis=1)) / descriptors.shape[0]
        silhouette_score = metrics.silhouette_score(descriptors, predictions, metric='cosine')
        distortions.append(distortion)
        silhouettes.append(silhouette_score)
        ks.append(k)
        print("k:", k, "distortion:", distortion, "Silhouette Coefficient", silhouette_score)

    # Plot the elbow with distortion
    fig = plt.figure()
    plt.plot(ks, distortions, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method')
    fig.savefig(ARGS.distortion_out_file)

    # Plot the silhouette scores
    fig = plt.figure()
    plt.plot(ks, silhouettes, 'bx-')
    plt.grid()
    plt.xlabel('k')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score analysis')
    fig.savefig(ARGS.silhouette_out_file)
def cluster(self, docs_repr):
    kclusterer = KM(self.n_clusters, distance=cosine_distance,
                    repeats=25, avoid_empty_clusters=True)
    assigned_clusters = kclusterer.cluster(docs_repr, assign_clusters=True)
    return assigned_clusters
# print(kmeans)
# # Plot the clusters obtained using k-means
# fig = plt.figure()
# ax = fig.add_subplot(111)
# scatter = ax.scatter(big_data_copy['Accounting'], big_data_copy['3D Printing'],
#                      c=kmeans[0], s=50)
# plt.colorbar(scatter)

# This one is not working out... the dataframe might not be in the correct format.
NUM_CLUSTERS = 10
kclusterer = KMeansClusterer(NUM_CLUSTERS,
                             distance=nltk.cluster.util.cosine_distance,
                             repeats=25)
assigned_clusters = kclusterer.cluster(big_data_copy, assign_clusters=True)

'''NEW PLAN, thanks Evan
ONE-HOT ENCODING, BUT WITH ADDED-UP VECTORS
ex:      Math 1  Art 2  Math 3  CS 50
Joe      1       0       0       0
Bob      0       0       1       0
Smith    1       0       0       0
Bob      0       1       0       0
Smith    0       0       0       1
groupByIndividual
alphabetical is fine probably, just want rows with the same name next to each other