import numpy as np
import pandas as pd
from sklearn.metrics import fbeta_score, normalized_mutual_info_score
from spherecluster import SphericalKMeans


def cluster_test(self, test_file, clusters=(10,)):
    """Cluster word vectors per time slice and score the clustering against
    the section labels in `test_file` (columns: word, section, year).

    `clusters` must be an iterable of cluster counts; a bare int default
    would break the loop below.
    """
    df_test1 = pd.read_csv(test_file)
    output = {}
    for K in clusters:
        vectors = []
        y_true = []
        sections = {}
        idx = 0
        for word, section, y in df_test1.values:
            sliceIdx = self.yearDict[str(y)]
            if word in self.vocabularies[sliceIdx]:
                # Assign each section name a stable integer label.
                if section not in sections:
                    sections[section] = idx
                    idx += 1
                y_true.append(sections[section])
                vectors.append(
                    self.matrices_norm[sliceIdx][self.vocabularies[sliceIdx][word]])
        skm = SphericalKMeans(n_clusters=K, max_iter=100000)
        skm.fit(np.array(vectors))
        y_pred = skm.predict(np.array(vectors))
        # NMI between predicted clusters and section labels.
        metric = normalized_mutual_info_score(
            y_pred, y_true, average_method='arithmetic')
        # Pairwise co-membership indicators (O(n^2) pairs) for the F_beta score.
        y_true_bool = [(a == b) for b in y_true for a in y_true]
        y_pred_bool = [(a == b) for b in y_pred for a in y_pred]
        metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)
        output[f'NMI({K})'] = metric
        output[f'F_beta-score({K})'] = metric2
    return output
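# Example usage (a sketch): `model` and 'test_words.csv' are illustrative
# names, not from the original code; the CSV must contain (word, section, year)
# rows as read above.
# scores = model.cluster_test('test_words.csv', clusters=range(2, 11))
# print(scores)  # {'NMI(2)': ..., 'F_beta-score(2)': ..., ...}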
import numpy as np
from spherecluster import SphericalKMeans


def SphericalKMeans_model(vocab_embeddings, vocab, topics, rerank, rand, weights):
    """Cluster vocabulary embeddings into `topics` clusters and return the
    cluster assignments plus the words closest to each cluster center."""
    spkmeans = SphericalKMeans(n_clusters=topics, random_state=rand).fit(
        vocab_embeddings, sample_weight=weights)
    m_clusters = spkmeans.predict(vocab_embeddings, sample_weight=weights)
    centers = np.array(spkmeans.cluster_centers_)
    indices = []
    for i in range(topics):
        # Rank the words assigned to cluster i by cosine similarity to its center.
        topk_vals = sort_closest_cossine_center(
            centers[i], m_clusters, vocab_embeddings, i)
        # With reranking enabled, keep a larger candidate pool (100 words);
        # otherwise keep only the top 10.
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))
    return m_clusters, indices
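# `sort_closest_cossine_center` and `find_top_k_words` are not defined in this
# snippet. A minimal sketch of what they plausibly do, assuming L2-normalized
# embeddings (so the dot product equals cosine similarity); the bodies below
# are guesses, not the original implementations.
def sort_closest_cossine_center(center, m_clusters, vocab_embeddings, i):
    # Cosine similarity of every embedding to this cluster's center.
    sims = vocab_embeddings @ center
    # Keep only the words assigned to cluster i, sorted by descending similarity.
    member_idx = np.where(m_clusters == i)[0]
    return member_idx[np.argsort(-sims[member_idx])]


def find_top_k_words(k, topk_vals, vocab):
    # Map the k best vocabulary indices back to word strings.
    return [vocab[j] for j in topk_vals[:k]]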
import scikitplot as skplt
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from spherecluster import SphericalKMeans


def analyse(methode, preproc, true_label, nb_clusters=3, normalizer=True,
            scikit=True):
    """Fit Spherical K-means and standard K-means on the (optionally
    transformed and L2-normalized) data, plot elbow curves, and print
    clustering scores for both models."""
    if scikit:
        data = methode.fit_transform(preproc)
    else:
        data = preproc
    if normalizer:
        data = Normalizer(norm='l2', copy=False).fit_transform(data)
    skplt.cluster.plot_elbow_curve(
        SphericalKMeans(random_state=42, n_jobs=-1), data,
        title="Elbow Curve with Spherical K-means",
        cluster_ranges=range(1, 15))
    skplt.cluster.plot_elbow_curve(
        KMeans(random_state=42, n_jobs=-1, precompute_distances=True), data,
        title="Elbow Curve with K-means",
        cluster_ranges=range(1, 15))
    print("Fitting Spherical K-means for", nb_clusters, "clusters ...")
    skmeans = SphericalKMeans(n_clusters=nb_clusters, random_state=42,
                              n_jobs=-1).fit(data)
    print("Fitting K-means for", nb_clusters, "clusters ...")
    kmeans = KMeans(n_clusters=nb_clusters, random_state=42, n_jobs=-1,
                    precompute_distances=True).fit(data)
    y_pred_skmeans = skmeans.predict(data)
    y_pred_kmeans = kmeans.predict(data)
    print("Results from Spherical K-means")
    scoring_cluster(skmeans, true_label, y_pred_skmeans)
    print("Results from K-means")
    scoring_cluster(kmeans, true_label, y_pred_kmeans)
    return methode, skmeans, kmeans, data
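# Example usage (a sketch): `methode` can be any scikit-learn transformer with
# fit_transform, e.g. TruncatedSVD over a TF-IDF matrix. `tfidf_matrix` and
# `labels` are illustrative names, and `scoring_cluster` must be defined
# elsewhere in the project.
# from sklearn.decomposition import TruncatedSVD
# methode, skmeans, kmeans, data = analyse(
#     TruncatedSVD(n_components=100, random_state=42),
#     tfidf_matrix, labels, nb_clusters=3)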
# Embed the summed adjacency matrix, visualize the latent positions, then
# sweep over cluster counts with Spherical K-means.
lse_latent = lse(sum_adj, 4, regularizer=None)
latent = lse_latent
pairplot(latent, labels=simple_class_labels, title=embed)

for k in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (sum), PTR, raw"
    print(run_name)
    print()

    # Cluster
    # gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    # gmm.fit(latent)
    skmeans = SphericalKMeans(n_clusters=k, **skmeans_params)
    skmeans.fit(latent)
    pred_labels = skmeans.predict(latent)

    # ARI
    base_dict = {
        "K": k,
        "Cluster": cluster,
        "Embed": embed,
        "Method": f"{cluster} o {embed}",
        "Score": skmeans.inertia_,
    }
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    mb_ari_dict = base_dict.copy()
    mb_ari_dict["ARI"] = mb_ari
    mb_ari_dict["Metric"] = "MB ARI"
    out_dicts.append(mb_ari_dict)
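# `sub_ari` is not defined in this snippet. A plausible minimal sketch,
# assuming it computes the adjusted Rand index restricted to the nodes whose
# true labels are known; this is a guess at the original helper.
from sklearn.metrics import adjusted_rand_score


def sub_ari(known_inds, true_labels, pred_labels):
    # Compare true vs. predicted labels only on the known subset
    # (assumes numpy arrays so that fancy indexing works).
    return adjusted_rand_score(true_labels[known_inds], pred_labels[known_inds])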
# Fragment: the enclosing per-face loop and the opening of the blob
# construction call (presumably cv2.dnn.blobFromImage) are elided above.
                                 swapRB=True, crop=False)
    embedder.setInput(faceBlob)
    vec = embedder.forward()  # 128-D face embedding
    vectors.append(vec.flatten())
    identities.append(int(identity[1:]))  # drop the leading character of the identity tag
    frames.append(int(frame))

identities = np.array(identities)
frames = np.array(frames)
vectors = np.array(vectors)
print(vectors.shape, frames.shape, identities.shape, len(imagePaths))

# One DataFrame column per embedding dimension, plus frame and identity columns.
df = pd.DataFrame(vectors, columns=[str(k) for k in range(0, 128)])
df['frames'] = frames
df['identities'] = identities
X = df.loc[:, '0':'127']
Y = df['identities']

print("[INFO] Finding cluster centroids ...")
skm = SphericalKMeans(n_clusters=100, verbose=1, n_jobs=-2, random_state=1)
skm.fit(X)
labels = skm.predict(X)
df['labels'] = labels
df = df.sort_values(by='identities')
df.to_csv(tracks_path + "/embedding.csv", index=False)
print("[INFO] embeddings complete ...")
print("[INFO] Linking ...")
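# A possible next step (a sketch, not from the original script): check how the
# 100 spherical clusters align with the ground-truth identities by taking the
# majority identity per cluster. Column names match the CSV written above.
# df = pd.read_csv(tracks_path + "/embedding.csv")
# majority = df.groupby('labels')['identities'].agg(lambda s: s.mode().iloc[0])
# print(majority.head())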