def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data using Gaussian Mixture Models.

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param max_n_clusters: maximal number of clusters for automatic selection of the
        number of clusters. If None, the number of mixture components is taken from
        ``n_components`` (or, as a fallback, ``n_clusters``) in ``kwargs``
    :param use_csi: if True, then cluster stability index will be calculated
        (may take a lot of time) and stored under the ``'csi'`` key of the metrics
    :param random_state: random state for GaussianMixture clusterer
    :param kwargs: keyword arguments for sklearn.mixture.GaussianMixture; keys that are
        not GaussianMixture parameters are ignored when building the model, but all of
        them are forwarded to ``find_best_n_clusters`` / ``cluster_stability_index``
    :return: tuple ``(cl, metrics)`` where ``cl`` is the array of cluster labels
        returned by ``fit_predict`` and ``metrics`` is the result of
        ``calc_all_metrics`` (plus ``'csi'`` when ``use_csi`` is True)
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters, random_state, **kwargs)
    else:
        # get_params() is an instance method: calling it on the class itself fails on
        # recent scikit-learn releases, so query a default-constructed estimator.
        valid_params = GaussianMixture().get_params()
        kmargs = {i: j for i, j in kwargs.items() if i in valid_params}
        # GaussianMixture names its size parameter ``n_components``; honour the
        # documented ``n_clusters`` argument instead of silently dropping it.
        if 'n_clusters' in kwargs and 'n_components' not in kmargs:
            kmargs['n_components'] = kwargs['n_clusters']
    kmargs.update({'random_state': random_state})

    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    # GaussianMixture does not expose labels_ itself; attach it for the metric helpers.
    km.labels_ = cl

    # One-hot cluster membership matrix aligned with the input index.
    bs = pd.get_dummies(cl)
    bs.index = data.index

    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
def em(X, **kwargs):
    """
    Fit a Gaussian mixture (EM algorithm) on ``X`` and return the fitted estimator.

    This thin wrapper exists because ``GaussianMixture.fit`` does not create a
    ``labels_`` attribute; the labels returned by ``fit_predict`` are attached
    to the estimator under that name before it is returned.

    :param X: data passed to ``GaussianMixture.fit_predict``
    :param kwargs: keyword arguments forwarded unchanged to ``GaussianMixture``
    :return: the fitted ``GaussianMixture`` instance with ``labels_`` set
    """
    estimator = GaussianMixture(**kwargs)
    estimator.labels_ = estimator.fit_predict(X)
    return estimator
def GMM(data, max_n_clusters=None, use_csi=True, random_state=0, **kwargs):
    """
    Finds cluster of users in data using Gaussian Mixture Models.

    Parameters
    --------
    data: pd.DataFrame
        Dataframe with features for clustering indexed as in ``retention_config.index_col``
    max_n_clusters: int, optional
        Maximal number of clusters for automatic selection of the number of clusters.
        If ``None``, the number of mixture components is taken from ``n_components``
        (or, as a fallback, ``n_clusters``) in ``kwargs``. Default: ``None``
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated and stored under
        the ``'csi'`` key of the metrics. IMPORTANT: it may take a lot of time.
        Default: ``True``
    random_state: int, optional
        Random state for GaussianMixture clusterer. Default: ``0``
    kwargs: optional
        Parameters for ``sklearn.mixture.GaussianMixture``. Keys that are not
        GaussianMixture parameters are ignored when building the model, but all of
        them are forwarded to ``find_best_n_clusters`` / ``cluster_stability_index``.

    Returns
    --------
    Tuple ``(cl, metrics)``: the array of cluster labels returned by ``fit_predict``
    and the result of ``calc_all_metrics`` (plus ``'csi'`` when ``use_csi`` is ``True``).

    Return type
    --------
    tuple
    """
    if max_n_clusters is not None:
        kmargs = find_best_n_clusters(data, GaussianMixture, max_n_clusters, random_state, **kwargs)
    else:
        # get_params() is an instance method: calling it on the class itself fails on
        # recent scikit-learn releases, so query a default-constructed estimator.
        valid_params = GaussianMixture().get_params()
        kmargs = {i: j for i, j in kwargs.items() if i in valid_params}
        # GaussianMixture names its size parameter ``n_components``; honour the
        # documented ``n_clusters`` argument instead of silently dropping it.
        if 'n_clusters' in kwargs and 'n_components' not in kmargs:
            kmargs['n_components'] = kwargs['n_clusters']
    kmargs.update({'random_state': random_state})

    km = GaussianMixture(**kmargs)
    cl = km.fit_predict(data.values)
    # GaussianMixture does not expose labels_ itself; attach it for the metric helpers.
    km.labels_ = cl

    # One-hot cluster membership matrix aligned with the input index.
    bs = pd.get_dummies(cl)
    bs.index = data.index

    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    return cl, metrics
# Script section: fits KMeans and a Gaussian mixture on `data` and scores the
# resulting cluster labels against the reference `labels`.
# NOTE(review): n_digits, n_samples, n_features, data, labels and sample_size are
# not defined in this section — presumably set earlier in the file; confirm.
print("Part 1: Clustering")
print("n_digits: %d, \t n_samples %d, \t n_features %d" % (n_digits, n_samples, n_features))
print(79 * '_')
# Column header; presumably matches the nine fields printed by bench_k_means below.
print('% 9s' % 'init\ttime\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette\tAccuracy')
kmeans = KMeans(n_clusters=2, random_state=0).fit(data)
# The next three expressions are evaluated but their results are neither stored
# nor printed (they only display a value in an interactive session).
float(sum(kmeans.labels_ == labels))/float(len(labels))
metrics.homogeneity_score(labels,kmeans.labels_)
metrics.completeness_score(labels, kmeans.labels_)
EMax = GaussianMixture(n_components=20,random_state=0).fit(data)
# EMax = GMM(n_components=2,random_state=0).fit(data)
# labels_ is assigned by hand from predict() so EMax can be scored like kmeans above.
EMax.labels_ = EMax.predict(data)
# As above: computed, then discarded.
float(sum(EMax.labels_ == labels))/float(len(labels))
metrics.homogeneity_score(labels,EMax.labels_)
metrics.completeness_score(labels, EMax.labels_)
# Fits `estimator` on `data`, times the fit, and prints one tab-separated row of
# clustering scores computed against the module-level `labels`.
# NOTE(review): this definition is cut off at the end of the visible source; the
# remaining print arguments (and anything after them) are not shown here.
def bench_k_means(estimator, name, data):
    t0 = time()
    estimator.fit(data)
    # Elapsed time is measured after fit(); predict() is re-run for every metric.
    print('% 9s\t%.2fs\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, (time() - t0),
             metrics.homogeneity_score(labels, estimator.predict(data)),
             metrics.completeness_score(labels, estimator.predict(data)),
             metrics.v_measure_score(labels, estimator.predict(data)),
             metrics.adjusted_rand_score(labels, estimator.predict(data)),
             metrics.adjusted_mutual_info_score(labels, estimator.predict(data)),
             metrics.silhouette_score(data, estimator.predict(data),metric='euclidean',sample_size=sample_size),
def gaussian_mixture(
    X,
    n_clusters=5,
    covariance_type="full",
    best_model=False,
    max_clusters=10,
    random_state=None,
    **kwargs,
):
    """Clustering with Gaussian Mixture Model.

    Parameters
    ----------
    X : array-like
        n x k attribute data
    n_clusters : int, optional, default: 5
        The number of clusters to form. Ignored when `best_model` is True.
    covariance_type : str, optional, default: "full"
        The covariance parameter passed to scikit-learn's GaussianMixture
        algorithm. Ignored when `best_model` is True (all four covariance
        types are searched instead).
    best_model : bool, optional, default: False
        Option for finding endogenous K according to Bayesian Information
        Criterion
    max_clusters : int, optional, default: 10
        The max number of clusters to test if using `best_model` option
    random_state : int, optional, default: None
        The seed used to generate replicable results
    kwargs
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    fitted cluster instance : sklearn.mixture.GaussianMixture
        The fitted model, with an extra ``labels_`` attribute holding
        ``model.predict(X)``.

    Raises
    ------
    ValueError
        If `best_model` is True and `max_clusters` is smaller than 1, or if
        no candidate produced a finite BIC to compare.
    """
    if random_state is None:
        warn(
            "Note: Gaussian Mixture Clustering is probabilistic--"
            "cluster labels may be different for different runs. If you need consistency, "
            "you should set the `random_state` parameter")

    if best_model is True:
        if max_clusters < 1:
            raise ValueError(
                "`max_clusters` must be at least 1 when `best_model` is True"
            )

        # selection routine from
        # https://plot.ly/scikit-learn/plot-gmm-selection/
        # np.inf rather than np.infty: the latter alias was removed in NumPy 2.0.
        lowest_bic = np.inf
        best_gmm = None
        for cv_type in ("spherical", "tied", "diag", "full"):
            for n_components in range(1, max_clusters + 1):
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(
                    n_components=n_components,
                    random_state=random_state,
                    covariance_type=cv_type,
                )
                gmm.fit(X)
                candidate_bic = gmm.bic(X)
                if candidate_bic < lowest_bic:
                    lowest_bic = candidate_bic
                    best_gmm = gmm

        if best_gmm is None:
            raise ValueError(
                "no Gaussian mixture candidate produced a comparable BIC"
            )
        # The winner is already fitted. Refitting it would, with an unseeded
        # random_state, return a different fit than the one BIC selected.
        model = best_gmm
    else:
        model = GaussianMixture(
            n_components=n_clusters,
            random_state=random_state,
            covariance_type=covariance_type,
        )
        model.fit(X)

    # GaussianMixture has no labels_ attribute of its own; expose one so the
    # model can be used like other scikit-learn clusterers.
    model.labels_ = model.predict(X)
    return model
def identify_subnetworks_sub(self, geneset_obj):
    """Split one gene set into subnetworks and write them to ``self.output_file``.

    Steps:
      1. Index the genes of ``geneset_obj`` (1-based) in ``self.gene_dic`` and
         write every gene pair found in ``self.ppi_set`` to an edge file.
      2. Run the external ``deepwalk`` command on that edge file to produce
         8-dimensional node embeddings.
      3. Fit Gaussian mixtures with 2 .. int(sqrt(n_embedded_genes))
         components and keep the one with the lowest BIC.
      4. Group genes by predicted component, keep groups with more than two
         genes, and write one line per group, largest group first.

    Parameters
    ----------
    geneset_obj
        Object providing ``id`` and ``gene_id_list``.

    Returns
    -------
    None
        Results are written to files under ``self.subnetworks_dir`` and to
        ``self.output_file``. Nothing is written to ``self.output_file`` when
        too few genes were embedded to fit any mixture (fewer than 4).

    Notes
    -----
    ``ss``, ``tt``, ``nn``, ``cc`` and ``make_line`` come from the enclosing
    module; presumably they are the path / tab / newline / comma separators
    and a join helper — confirm against their definitions.
    """
    # Function-scope import so the block does not depend on the module's import list.
    import subprocess

    #[1] Paths of the per-gene-set working files
    edge_path = self.subnetworks_dir + ss + "%s_edges.txt" % geneset_obj.id
    embedding_path = self.subnetworks_dir + ss + "%s_embeddings.txt" % geneset_obj.id
    log_path = self.subnetworks_dir + ss + "%s_log.txt" % geneset_obj.id

    #[2] Index genes; gene_dic is keyed by both gene id and 1-based index
    # NOTE(review): the two key spaces share one dict, so integer gene ids
    # could collide with indices — confirm gene ids are never small integers.
    self.gene_dic = {}
    for i, gene_id in enumerate(geneset_obj.gene_id_list):
        gene_obj = Gene()
        gene_obj.id = gene_id
        gene_obj.index = i + 1
        self.gene_dic[gene_id] = gene_obj
        self.gene_dic[gene_obj.index] = gene_obj
    ##End for

    #[2] Write the edges of the induced PPI subgraph (gene indices, not ids)
    with open(edge_path, 'w') as edge_file:
        for a, b in itertools.combinations(geneset_obj.gene_id_list, 2):
            #[2-1]
            if (a, b) not in self.ppi_set:
                continue
            ##End if

            #[2-2]
            edge_line = make_line([self.gene_dic[a].index, self.gene_dic[b].index], tt)
            edge_file.write(edge_line + nn)
        ##End for

    #[3] Run deepwalk; an argument list (no shell) keeps paths containing
    # spaces or shell metacharacters intact. stdout goes to the log file.
    # The exit status is not checked, as before.
    with open(log_path, 'w') as log_file:
        subprocess.run(
            [
                "nohup", "deepwalk",
                "--input", edge_path,
                "--output", embedding_path,
                "--representation-size", "8",
                "--seed", "0",
            ],
            stdout=log_file,
        )

    #[4] Read the embeddings back
    gene_id_list = []
    gene_embedding_arr = []
    with open(embedding_path, 'r') as embedding_file:
        # The first line is skipped (presumably a "<count> <dim>" header — confirm).
        embedding_file.readline()
        for embedding_line in embedding_file:
            fields = embedding_line.split()
            gene_id_list.append(self.gene_dic[int(fields[0])].id)
            gene_embedding_arr.append([float(x) for x in fields[1:]])
        ##End for

    #[5] Fit candidate mixtures
    gene_embedding_arr = np.array(gene_embedding_arr)
    clusterer_list = []
    max_clusters = int(len(gene_id_list)**0.5) + 1

    for n_clusters in range(2, max_clusters):
        clusterer_obj = GaussianMixture(n_components=n_clusters, random_state=0)
        clusterer_obj.fit(gene_embedding_arr)
        # Negated BIC so that a higher score means a better model.
        clusterer_obj.score_ = -clusterer_obj.bic(gene_embedding_arr)
        clusterer_obj.n_clusters_ = n_clusters
        clusterer_obj.labels_ = clusterer_obj.predict(gene_embedding_arr)
        clusterer_list.append(clusterer_obj)
    ##End for

    # With fewer than 4 embedded genes the range above is empty: there is no
    # model to choose from, hence no subnetworks to report.
    if not clusterer_list:
        return
    ##End if

    #[6] Best model (first one with the highest score) and its gene groups
    clusterer_obj = max(clusterer_list, key=lambda x: x.score_)
    subnet_dic = {}

    for gene_id, label in zip(gene_id_list, clusterer_obj.labels_):
        #[6-1]
        if label not in subnet_dic:
            subnet_obj = Subnet()
            subnet_obj.gene_id_list = []
            subnet_dic[label] = subnet_obj
        ##End if

        #[6-2]
        subnet_dic[label].gene_id_list.append(gene_id)
    ##End for

    #[7] Largest subnetworks first; drop those with two genes or fewer
    subnet_list = sorted(subnet_dic.values(), key=lambda x: len(x.gene_id_list), reverse=True)
    subnet_list = [x for x in subnet_list if len(x.gene_id_list) > 2]

    for i, subnet_obj in enumerate(subnet_list, start=1):
        subnet_obj.id = "%s_%s" % (geneset_obj.id, i)
        gene_id_line = make_line(sorted(subnet_obj.gene_id_list), cc)
        subnet_line = make_line([subnet_obj.id, gene_id_line], tt)
        self.output_file.write(subnet_line + nn)
    ##End for