def test_classification():
    """Cross-validate ``MLPClassifier`` on the digits data set.

    For each of three folds the classifier is trained on the training split
    and scored on the held-out split with normalized mutual information
    (NMI). The staged predictions recorded in ``mlp.oo_score`` are scored the
    same way, and the curve of the first fold is plotted.

    Note that the printed "accuracy" values are NMI scores computed on the
    held-out samples.
    """
    from sklearn.datasets import load_digits
    from sklearn.cross_validation import KFold
    from sklearn.metrics import normalized_mutual_info_score

    digits = load_digits()
    X, y = digits.data, digits.target
    folds = 3
    cv = KFold(y.shape[0], folds)

    total = 0.0
    oo_score_bag = []
    for train_idx, test_idx in cv:
        held_out = y[test_idx]

        mlp = MLPClassifier(use_dropout=True, n_hidden=200, lr=1.)
        print(mlp)
        mlp.fit(X[train_idx], y[train_idx], max_epochs=100,
                staged_sample=X[test_idx])

        fold_nmi = normalized_mutual_info_score(mlp.predict(X[test_idx]),
                                                held_out)
        print("Fold training accuracy: %f" % fold_nmi)
        total += fold_nmi

        # One NMI value per staged prediction stored by the classifier.
        oo_score_bag.append(
            [normalized_mutual_info_score(staged, held_out)
             for staged in mlp.oo_score])

    from matplotlib import pyplot as plt
    plt.plot(oo_score_bag[0])
    plt.show()
    print("training accuracy: %f" % (total / float(folds)))
def evaluate_label(A, H, W, corr, K):
    """Score a factorisation ``A ~ W * H.T`` against reference labels.

    Two labelings are derived from ``H``: the arg-max over axis 1 and a
    K-means clustering with ``K`` clusters. Both are compared with ``corr``
    via normalized mutual information, and the squared reconstruction error
    ``sum((A - W * H.T) ** 2)`` is computed with matrix multiplication.

    Returns
    -------
    tuple
        ``(nmi_argmax, nmi_kmeans, loss)``; all three are also printed.
    """
    argmax_labels = H.argmax(axis=1)
    kmeans_labels = KMeans(K).fit_predict(H)

    nmi = normalized_mutual_info_score(argmax_labels, corr)
    nmi2 = normalized_mutual_info_score(kmeans_labels, corr)
    print("NMI by argmax: " + str(nmi))
    print("NMI by kmeans: " + str(nmi2))

    # np.matrix makes ``*`` a matrix product rather than an element-wise one.
    residual = np.matrix(A) - np.matrix(W) * np.matrix(H).T
    loss = np.power(residual, 2).sum()
    print(loss)
    return nmi, nmi2, loss
def loss_augmented_fit(self, feat, y, loss_mult):
    """Greedily pick medoids that maximise a loss-augmented facility score.

    In each of ``self.n_clusters`` rounds every remaining sample is tried as
    an additional medoid. A candidate's score is the negative sum of each
    sample's distance to its nearest medoid plus ``loss_mult`` times
    ``1 - NMI(y, induced assignment)``; the best-scoring candidate is kept.

    Sets ``labels_``, ``cluster_centers_``, ``center_ics_``, ``score_``,
    ``score_aug_`` and ``subset_`` on ``self`` and returns ``self``.
    """
    self._check_init_args()
    # Validate the input (and convert it to an array where possible).
    feat = self._check_array(feat)
    # Pairwise distance matrix between all samples.
    pdists = pairwise_distance_np(feat)

    remaining = list(range(feat.shape[0]))
    subset = []
    for _ in range(self.n_clusters):
        round_scores = []
        for candidate in remaining:
            trial = subset + [candidate]
            facility = -1.0 * np.sum(np.min(pdists[:, trial], axis=1))
            nmi_loss = 1.0 - metrics.normalized_mutual_info_score(
                y, self._get_cluster_ics(pdists, trial))
            round_scores.append(facility + loss_mult * nmi_loss)

        # Keep the best candidate and take it out of the pool.
        winner = remaining[np.argmax(round_scores)]
        subset.append(winner)
        remaining.remove(winner)

    # Assignment of every training sample to one of the chosen medoids.
    self.labels_ = self._get_cluster_ics(pdists, subset)
    # The medoids themselves and their row indices.
    self.cluster_centers_ = feat.take(subset, axis=0)
    self.center_ics_ = subset
    # score = -\sum_{i \in V} min_{j \in S} || x_i - x_j ||
    self.score_ = np.float32(-1.0) * self._get_facility_distance(
        pdists, subset)
    final_nmi = metrics.normalized_mutual_info_score(
        y, self._get_cluster_ics(pdists, subset))
    self.score_aug_ = self.score_ + loss_mult * (1.0 - final_nmi)
    self.score_aug_ = self.score_aug_.astype(np.float32)
    # Indices of the chosen medoids (same list object as center_ics_).
    self.subset_ = subset
    return self
def get_normalized_mutual_info(standard_file, prediction_file, isjson=False, isint=False):
    """Get normalized mutual information (NMI) [Strehl2002]_.

    Parameters
    ----------
    standard_file : str
        The ground truth or standard filename.
    prediction_file : str
        The analyzed or predicted filename.
    isjson : bool
        The flag for standard_file. When set, the file is read as JSON and
        the values of the resulting mapping are used as labels.
    isint : bool
        The flag for value in prediction_file.

    Returns
    -------
    normalized_mutual_info : float
        Normalized mutual information score.

    References
    ----------
    .. [Strehl2002] Alexander Strehl and Joydeep Ghosh. Cluster ensembles -
                    a knowledge reuse framework for combining multiple
                    partitions. Journal of Machine Learning Research,
                    3(Dec):583-617, 2002.
    """
    if isjson:
        standard_data = AbstractionUtility.read_json(standard_file)
        # ``dict.values()`` is a view object on Python 3, which the scorer
        # cannot turn into a 1-D label array; materialise it as a list.
        # NOTE(review): this relies on the mapping's iteration order matching
        # the order of the prediction labels - confirm against the caller.
        standard_labels = list(standard_data.values())
    else:
        standard_labels = ExternalEvaluation.get_evaluated(standard_file)

    prediction_labels = ExternalEvaluation.get_evaluated(prediction_file, isint=isint)
    return metrics.normalized_mutual_info_score(standard_labels, prediction_labels)
def evaluate(self):
    """Print ARI, AMI and NMI of ``self.pred`` against ``self.labels``.

    Each score is rounded to four decimals and printed on its own line.
    """
    truth, predicted = self.labels, self.pred
    report = (
        ("Adjusted Rand index:",
         round(metrics.adjusted_rand_score(truth, predicted), 4)),
        ("Adjusted Mutual Information:",
         round(metrics.adjusted_mutual_info_score(truth, predicted), 4)),
        ("Normalized Mutual Information:",
         round(metrics.normalized_mutual_info_score(truth, predicted), 4)),
    )
    for caption, value in report:
        print(caption, "%.4f" % value)
def pam_augmented_fit(self, feat, y, loss_mult):
    """Refine the loss-augmented medoids with a fixed number of PAM sweeps.

    The medoids are initialised by ``loss_augmented_fit``. Each of the five
    sweeps recomputes the cluster assignment, lets
    ``_augmented_update_medoid_ics_in_place`` move the medoids, and refreshes
    ``score_`` / ``score_aug_``. Progress is printed after initialisation and
    after every sweep.

    ``labels_`` is set to the assignment computed at the start of the last
    sweep, i.e. before that sweep's medoid update. Returns ``self``.
    """
    pam_max_iter = 5

    self._check_init_args()
    feat = self._check_array(feat)
    pdists = pairwise_distance_np(feat)

    # Start from the loss-augmented facility-location solution.
    self.loss_augmented_fit(feat, y, loss_mult)
    print('PAM -1 (before PAM): score: %f, score_aug: %f' % (
        self.score_, self.score_aug_))
    subset = self.center_ics_

    for sweep in range(pam_max_iter):
        # Assignment step.
        cluster_ics = self._get_cluster_ics(pdists, subset)
        # Medoid update step (modifies ``subset`` in place).
        self._augmented_update_medoid_ics_in_place(
            pdists, y, cluster_ics, subset, loss_mult)

        self.score_ = np.float32(-1.0) * self._get_facility_distance(
            pdists, subset)
        sweep_nmi = metrics.normalized_mutual_info_score(
            y, self._get_cluster_ics(pdists, subset))
        self.score_aug_ = self.score_ + loss_mult * (1.0 - sweep_nmi)
        self.score_aug_ = self.score_aug_.astype(np.float32)
        print('PAM iter: %d, score: %f, score_aug: %f' % (
            sweep, self.score_, self.score_aug_))

    self.center_ics_ = subset
    self.labels_ = cluster_ics
    return self
def compare(method1, method2, fig=False):
    """Compare Ward clusterings of two 2-D embeddings for 2 to 5 clusters.

    The embeddings are loaded from ``<species>_<method>_X_2d.npy`` (``species``
    is read from module scope). For every cluster count a tab-separated row
    with homogeneity, completeness, NMI and ARI between both labelings is
    printed. With ``fig`` set, the 3-cluster labels of each method are also
    plotted on the other method's embedding.

    The ``print`` calls take a single argument so the function behaves the
    same on Python 2 and Python 3.
    """
    X1 = np.load('{0}_{1}_X_2d.npy'.format(species, method1))
    X2 = np.load('{0}_{1}_X_2d.npy'.format(species, method2))
    print('n_cluster\tHomo\tCompl\tNMI\tARI')
    for i in range(2, 6):
        clust1 = Clustering(species, method1, X1, None, n_clusters=i)
        clust2 = Clustering(species, method2, X2, None, n_clusters=i)
        clust1.agglomerative(linkage='ward')
        clust2.agglomerative(linkage='ward')
        label1 = clust1.pred_labels('ward')
        label2 = clust2.pred_labels('ward')

        if i == 3 and fig:
            # Cross-plot: labels of one method on the other's coordinates.
            names = np.unique(label1)
            figName = '{0}_{1}_on_{2}'.format(species, method1, method2)
            plot2d(X2, label1, names, figName, figName)
            names = np.unique(label2)
            figName = '{0}_{1}_on_{2}'.format(species, method2, method1)
            plot2d(X1, label2, names, figName, figName)

        print('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
            i,
            metrics.homogeneity_score(label1, label2),
            metrics.completeness_score(label1, label2),
            metrics.normalized_mutual_info_score(label1, label2),
            metrics.adjusted_rand_score(label1, label2)))
def test_diffusion_embedding_two_components_no_diffusion_time(seed=36):
    """Test the diffusion-maps embedding on two weakly connected components.

    A block-diagonal affinity matrix with two dense blocks joined by a single
    weak link is embedded in one dimension (geometric Laplacian, ARPACK
    solver, ``diffusion_maps=True``). Thresholding the coordinate at zero
    must recover the two blocks exactly (NMI of 1.0).
    """
    rng = np.random.RandomState(seed)
    n_sample = 100
    n_total = 2 * n_sample

    affinity = np.zeros(shape=[n_total, n_total])
    # first component
    affinity[:n_sample, :n_sample] = np.abs(
        rng.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample:, n_sample:] = np.abs(
        rng.randn(n_sample, n_sample)) + 2
    # single weak connection between the two components
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    # zero the diagonal, then symmetrise
    affinity.flat[::n_total + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=n_total)
    true_label[:n_sample] = 1

    geom_params = {'laplacian_method':'geometric'}
    se_precomp = SpectralEmbedding(
        n_components=1,
        random_state=np.random.RandomState(seed),
        eigen_solver='arpack',
        diffusion_maps=True,
        geom=geom_params)
    embedded_coordinate = se_precomp.fit_transform(affinity,
                                                   input_type='affinity')

    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
def plotMI(dat, lab, width = 0.35, signed = 0):
    '''
    Draw a bar chart of the normalized MI between each X and Y.

    :param dat: dataframe holding the feature columns and the target column
    :param lab: name of the target column
    :param width: bar width handed to the plotting helper
    :param signed: 0 draws plain bars (makeBar); any other value draws
        signed bars (makeBarSigned)

    The frame handed to the plotting helper has the columns 'MI', 'Lab'
    and 'corr', one row per feature. 'corr' is the feature's correlation
    with the target as reported by ``dat.corr()``; features missing from
    that matrix get NaN.
    '''
    X = dat.drop(lab, axis=1)
    Y = dat[[lab]].values
    cols = X.columns.values

    # Start by getting MI
    mis = [skm.normalized_mutual_info_score(Y.ravel(), X[[c]].values.ravel())
           for c in cols]

    # Get signs by correlation. Re-indexing by the feature names drops the
    # target's own entry and keeps every correlation on the row of the
    # feature it belongs to (the previous filter expression discarded its
    # result, so the values could be shifted by one row).
    corrs = dat.corr()[lab].reindex(cols)

    df = pd.DataFrame({'MI': mis, 'Lab': cols, 'corr': corrs.values},
                      columns=['MI', 'Lab', 'corr'])

    if signed == 0:
        makeBar(df, 'MI', 'Lab', width)
    else:
        makeBarSigned(df, 'MI', 'Lab', width)
def compare_direct_undir():
    """Compare Infomap communities of the directed and undirected tag graph.

    Loads ``ed_tag.graphml`` and ``ed_tag_undir.graphml``, runs Infomap once
    on the first and twice on the second, then prints ARI and NMI for

    * first graph vs. first run on the second graph, and
    * the two runs on the second graph against each other.

    The ``print`` calls take a single argument so the function behaves the
    same on Python 2 and Python 3.
    """
    from sklearn import metrics

    g = gt.Graph.Read_GraphML('ed_tag.graphml')
    gt.net_stat(g)
    gu = gt.Graph.Read_GraphML('ed_tag_undir.graphml')
    gt.net_stat(gu)

    com = g.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu1 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')
    comu2 = gu.community_infomap(edge_weights='weight', vertex_weights='weight')

    mem = com.membership
    memu1 = comu1.membership
    memu2 = comu2.membership

    print(metrics.adjusted_rand_score(mem, memu1))
    print(metrics.normalized_mutual_info_score(mem, memu1))
    print(metrics.adjusted_rand_score(memu2, memu1))
    print(metrics.normalized_mutual_info_score(memu2, memu1))
def test_spectral_embedding_two_components(seed=36):
    """Test spectral embedding on two weakly connected components.

    A block-diagonal affinity matrix with two dense blocks joined by a single
    weak link is embedded in one dimension using a precomputed affinity.
    Thresholding the coordinate at zero must recover the two blocks exactly
    (NMI of 1.0). The embedding is fitted on the float64 matrix first and
    then again on a float32 copy; only the second result is checked.
    """
    rng = np.random.RandomState(seed)
    n_sample = 100
    n_total = 2 * n_sample

    affinity = np.zeros(shape=[n_total, n_total])
    # first component
    affinity[:n_sample, :n_sample] = np.abs(
        rng.randn(n_sample, n_sample)) + 2
    # second component
    affinity[n_sample:, n_sample:] = np.abs(
        rng.randn(n_sample, n_sample)) + 2
    # single weak connection between the two components
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    # zero the diagonal, then symmetrise
    affinity.flat[::n_total + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=n_total)
    true_label[:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed))
    embedded_coordinate = se_precomp.fit_transform(affinity)
    # Some numpy versions are touchy with types
    affinity32 = affinity.astype(np.float32)
    embedded_coordinate = se_precomp.fit_transform(affinity32)

    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
    assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
def calculate_nmi_kmeans(H, correct_label):
    """Return the NMI between a K-means clustering of ``H`` and the labels.

    The number of clusters equals the number of columns of ``H``. If the
    clustering fails, every row is put in a single cluster (best effort)
    before scoring.
    """
    km = KMeans(H.shape[1])
    try:
        com = km.fit_predict(H)
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (instead of a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate.
        com = [0] * H.shape[0]
    return normalized_mutual_info_score(com, correct_label)
def cluster_metrics(labels_1, labels_2):
    """Print NMI, ARI, homogeneity and completeness of two labelings.

    The four lines are emitted with a single ``print`` call.
    """
    report = (
        ("Normalized Mutual Information: %f", normalized_mutual_info_score),
        ("Adjusted Rand Score: %f", adjusted_rand_score),
        ("Homogeneity: %f", homogeneity_score),
        ("Completeness: %f", completeness_score),
    )
    lines = [template % scorer(labels_1, labels_2)
             for template, scorer in report]
    print("\n".join(lines))
def normalized_mutual_info_score_scorefunc(X, y):
    """Score every column of ``X`` by its NMI with ``y``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(scores, pvals)``, one entry per column of ``X``. The p-values are
        placeholders that are always 1.
    """
    n_columns = X.shape[1]
    scores = [normalized_mutual_info_score(X[:, col], y)
              for col in range(n_columns)]
    pvals = [1] * n_columns
    return np.array(scores), np.array(pvals)
def model_metrics(model, X, y, batch_size):
    """Evaluate ``model`` on ``(X, y)``.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, nmi)`` where ``accuracy`` is the second entry
        of ``model.evaluate``'s result, ``roc_auc`` is computed from
        ``predict_proba`` and ``nmi`` from the flattened ``predict_classes``.
    """
    evaluation = model.evaluate(X, y, batch_size=batch_size)
    class_predictions = model.predict_classes(X, batch_size=batch_size)
    probabilities = model.predict_proba(X, batch_size=batch_size)

    # NOTE(review): index 1 presumably holds accuracy because the model was
    # compiled with it as the first metric - confirm against the caller.
    accuracy = evaluation[1]
    roc_auc = roc_auc_score(y, probabilities)
    nmi = normalized_mutual_info_score(y, class_predictions.flatten())
    return accuracy, roc_auc, nmi
def calc(gr_truth, predicted):
    """Return the normalized mutual information of two labelings."""
    nmi = normalized_mutual_info_score(gr_truth, predicted)
    return nmi
def observe_correlations(nrows=1000000, training_file="data/train.csv"):
    """Summarise every feature column of the training CSV.

    Columns from the third one onwards are treated as features. For each of
    them the number of distinct levels, the entropy, the NMI with the
    ``click`` column and the cluster id from ``cluster_var`` are collected.

    Parameters
    ----------
    nrows : int
        Number of rows to read from the file.
    training_file : str
        Path of the CSV file (defaults to the previously hard-coded path).

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by the ``clustering`` column.
    """
    dataframe = pd.read_csv(training_file, nrows=nrows, header=0)
    features = dataframe.iloc[:, 2:]

    info_variable = pd.DataFrame(index=dataframe.columns[2:])
    info_variable["levels"] = features.apply(lambda t: np.unique(t).shape[0])
    info_variable["Entropy"] = features.apply(lambda t: entropy(t))
    info_variable["MutualInformation"] = features.apply(
        lambda t: metrics.normalized_mutual_info_score(dataframe["click"], t))
    info_variable["clustering"] = cluster_var(dataframe)

    # ``DataFrame.sort`` no longer exists in pandas; ``sort_values`` is the
    # equivalent call.
    return info_variable.sort_values("clustering")
def mRMR(dfin, target, adjusted = False, n_features_to_select = 10):
    '''
    Greedy minimum-redundancy / maximum-relevance feature selection.

    In every round the candidate X_j maximising

        MI(C, X_j) - (1 / |S|) * SUM_{X_i in S} NMI(X_i, X_j)

    is added to the selection, where C is the target and S the set of
    features selected so far (the redundancy term is 0 while S is empty).

    :param dfin: dataframe of features (not modified)
    :param target: target (0,1)
    :param adjusted: use adjusted MI instead of normalized MI for the
        relevance term; redundancy always uses normalized MI
    :param n_features_to_select: number of features to pick; capped at the
        number of columns
    :return: (selected feature names in selection order, their mRMR scores)
    '''
    n_features_to_select = min(n_features_to_select, dfin.shape[1])
    if n_features_to_select <= 0:
        return [], []

    relevance_fn = (adjusted_mutual_info_score if adjusted
                    else normalized_mutual_info_score)
    columns = dfin.columns.tolist()

    # Relevance does not depend on the selection, so compute it once.
    relevance = {c: relevance_fn(dfin[c], target) for c in columns}
    # Running sum of NMI between each candidate and the selected features.
    redundancy_sum = {c: 0.0 for c in columns}

    final_features = []
    importance = []
    while len(final_features) < n_features_to_select:
        candidates = [c for c in columns if c not in final_features]
        n_selected = len(final_features)

        mrmr = np.array([
            relevance[c] - (redundancy_sum[c] / n_selected if n_selected else 0.0)
            for c in candidates
        ])
        best = int(np.argmax(mrmr))  # first maximum wins ties
        chosen = candidates[best]
        final_features.append(chosen)
        importance.append(mrmr[best])

        # Update the redundancy of the remaining candidates incrementally.
        for c in candidates:
            if c != chosen:
                redundancy_sum[c] += normalized_mutual_info_score(
                    dfin[chosen], dfin[c])

    return final_features, importance
def getClusterMetricString(method_name, labels_true, labels_pred):
    '''
    Build one printable report line for a clustering method.

    :param method_name: Name of the clustering method (just for printing)
    :param labels_true: True label for each sample
    :param labels_pred: Predicted label for each sample
    :return: The method name left-aligned in 50 characters, followed by the
        clustering accuracy and the NMI, each with three decimals
    '''
    row = (
        method_name,
        cluster_acc(labels_true, labels_pred),
        metrics.normalized_mutual_info_score(labels_true, labels_pred),
    )
    return '%-50s %8.3f %8.3f' % row
def compare_clusters(nn_clusters, tf_clusters):
    """Print AMI, NMI and ARI between the ``labels_`` of two clusterings."""
    first = nn_clusters.labels_
    second = tf_clusters.labels_

    ami = metrics.adjusted_mutual_info_score(first, second)
    print('~~Adjusted mutual information: {}'.format(ami))

    nmi = metrics.normalized_mutual_info_score(first, second)
    print('~~Normalized mutual information: {}'.format(nmi))

    ari = metrics.adjusted_rand_score(first, second)
    print('~~Adjusted Rand Index: {}'.format(ari))
def test_pipeline_spectral_clustering(seed=36):
    """Check that spectral embedding followed by K-means recovers the labels.

    Both an RBF and a nearest-neighbours affinity are tried on the
    module-level data ``S``; the NMI with ``true_labels`` must equal 1.0 to
    two decimals. All estimators share a single ``RandomState``.
    """
    random_state = np.random.RandomState(seed)
    embedders = [
        SpectralEmbedding(n_components=n_clusters, affinity="rbf",
                          random_state=random_state),
        SpectralEmbedding(n_components=n_clusters,
                          affinity="nearest_neighbors", n_neighbors=5,
                          random_state=random_state),
    ]
    for embedder in embedders:
        km = KMeans(n_clusters=n_clusters, random_state=random_state)
        km.fit(embedder.fit_transform(S))
        score = normalized_mutual_info_score(km.labels_, true_labels)
        assert_array_almost_equal(score, 1.0, 2)
def _augmented_update_medoid_ics_in_place(self, pdists, y_gt, cluster_ics, medoid_ics, loss_mult):
    """Move each cluster's medoid to its best loss-augmented member.

    For every non-empty cluster each member is tried as the replacement
    medoid. A member's score is the negative sum of its distances to the
    other members plus ``loss_mult * (1 - NMI)`` of the assignment that the
    modified medoid list would induce. ``medoid_ics`` is updated in place
    when the best member beats the current medoid's score.

    The assignment ``cluster_ics`` is deliberately kept fixed while all
    clusters are processed; re-assignment is the caller's job.
    """
    y_pred = cluster_ics
    for cluster_idx in range(self.n_clusters):
        in_cluster = y_pred == cluster_idx
        if in_cluster.sum() == 0:
            # Cluster is empty.
            continue

        curr_score = (
            -1.0 * np.sum(pdists[medoid_ics[cluster_idx], in_cluster])
            + loss_mult * (1.0 - metrics.normalized_mutual_info_score(
                y_gt, y_pred)))

        member_ids = np.where(in_cluster)[0]

        # Facility term: distances restricted to the cluster's members.
        pdist_in = pdists[in_cluster, :][:, in_cluster]
        all_scores_fac = np.sum(-1.0 * pdist_in, axis=1)

        # Loss term: swap the current medoid for each member in turn.
        other_medoids = (medoid_ics[:cluster_idx]
                         + medoid_ics[cluster_idx + 1:])
        all_scores_loss = []
        for member in member_ids:
            trial_medoids = other_medoids + [int(member)]
            y_pred_i = self._get_cluster_ics(pdists, trial_medoids)
            all_scores_loss.append(loss_mult * (
                1.0 - metrics.normalized_mutual_info_score(y_gt, y_pred_i)))

        all_scores = all_scores_fac + all_scores_loss
        best = np.argmax(all_scores)
        if all_scores[best] > curr_score:
            medoid_ics[cluster_idx] = member_ids[best]
def print_cluster_measures(som, test_data, test_labels):
    """Print accuracy, ARI, AMI and NMI of ``som``'s predictions."""
    pred_labels = som.predict(test_data)
    measures = (
        (' Accuracy:', metrics.accuracy_score(test_labels, pred_labels)),
        (' Adjusted rand score:',
         metrics.adjusted_rand_score(test_labels, pred_labels)),
        (' Adjusted mutual info:',
         metrics.adjusted_mutual_info_score(test_labels, pred_labels)),
        (' Normalized mutual info:',
         metrics.normalized_mutual_info_score(test_labels, pred_labels)),
    )
    for caption, value in measures:
        print(caption, value)
def compute_metrics(graph):
    """Cluster ``graph`` with four algorithms and score each result.

    The clusterings come from the ``cluster_*`` helpers and are compared with
    the module-level ``groundTruth`` labels.

    Returns
    -------
    list of tuple
        ``(name, nmi, ari)`` for 'kmeans', 'agglo', 'spectral' and
        'affinity', in that order.
    """
    # Use the argument that was passed in; the previous code ignored it and
    # read a module-level ``G`` instead.
    clusterers = (
        ('kmeans', cluster_kmeans),
        ('agglo', cluster_agglomerative),
        ('spectral', cluster_spectral),
        ('affinity', cluster_affinity),
    )
    results = [(name, clusterer(graph)) for name, clusterer in clusterers]

    metric_results = [
        (name,
         metrics.normalized_mutual_info_score(groundTruth, x),
         metrics.adjusted_rand_score(groundTruth, x))
        for name, x in results
    ]
    return metric_results
def ceEvalMutual(cluster_runs, cluster_ensemble = None, verbose = False):
    """Compute a weighted average of the mutual information with the known labels,
       the weights being proportional to the fraction of known labels.

    Parameters
    ----------
    cluster_runs : array of shape (n_partitions, n_samples)
        Each row of this matrix is such that the i-th entry corresponds to the
        cluster ID to which the i-th sample of the data-set has been classified
        by this particular clustering. Samples not selected for clustering
        in a given round are tagged by an NaN.

    cluster_ensemble : array of shape (n_samples,), optional (default = None)
        The identity of the cluster to which each sample of the whole data-set
        belongs according to consensus clustering.

    verbose : Boolean, optional (default = False)
        Specifies if status messages will be displayed
        on the standard output.

    Returns
    -------
    unnamed variable : float
        The weighted average of the mutual information between
        the consensus clustering and the many runs from the ensemble
        of independent clusterings on subsamples of the data-set.
        0.0 is returned when no consensus clustering is given or when
        no run contains a labelled sample.
    """
    if cluster_ensemble is None:
        return 0.0

    # A one-dimensional (or single row/column) input is one partition.
    if cluster_runs.size == max(cluster_runs.shape):
        cluster_runs = cluster_runs.reshape(1, -1)

    weighted_average_mutual_information = 0
    N_labelled_indices = 0

    for i in range(cluster_runs.shape[0]):
        labelled_indices = np.where(np.isfinite(cluster_runs[i]))[0]
        N = labelled_indices.size
        if N == 0:
            # Nothing was clustered in this run; its weight would be zero.
            continue

        x = np.reshape(checkcl(cluster_ensemble[labelled_indices], verbose), N)
        y = np.reshape(checkcl(np.rint(cluster_runs[i, labelled_indices]), verbose), N)

        q = normalized_mutual_info_score(x, y)

        weighted_average_mutual_information += q * N
        N_labelled_indices += N

    if N_labelled_indices == 0:
        # Avoid a division by zero when every entry is NaN.
        return 0.0

    return float(weighted_average_mutual_information) / N_labelled_indices
def maxNMI(labels,clust):
    """Return the best NMI between ``labels`` and any partition in ``clust``.

    ``clust[i]`` is read for ``i`` below ``clust.num_leaves`` and is expected
    to yield one collection of sample indices per cluster. The result is
    never below 0.0.
    """
    best = 0.0
    for level in range(clust.num_leaves):
        # Flatten the partition into one label per sample.
        flat_labels = np.zeros(len(labels))
        for cluster_id, members in enumerate(clust[level]):
            flat_labels[members] = cluster_id

        candidate = metrics.normalized_mutual_info_score(labels, flat_labels)
        best = max(best, candidate)
    return best
def explore(a, b, ax, xlabel, ylabel):
    """Scatter ``a`` against ``b`` and annotate the plot with NMI and Pearson r.

    The scatter plot and axis labels are drawn through ``plt``; the
    annotation is placed in the lower-right corner using the coordinate
    system of ``ax``.
    """
    plt.scatter(a, b)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    mi = metrics.normalized_mutual_info_score(a, b)
    pearson_r = stats.pearsonr(a, b)[0]
    caption = "".join([
        "MI = {0:.6}".format(mi),
        ", Cor = {0:.6}".format(pearson_r),
    ])

    plt.text(0.95, 0.01, caption,
             verticalalignment='bottom',
             horizontalalignment='right',
             transform=ax.transAxes,
             fontsize=15)
def mutual_information(x, y, nbins=32, normalized=False):
    """
    Compute mutual information

    :param x: 1D numpy.array : flatten data from an image
    :param y: 1D numpy.array : flatten data from an image
    :param nbins: number of bins to compute the contingency matrix (only used if normalized=False)
    :param normalized: if True, return the normalized mutual information of
        the raw values instead of the histogram-based mutual information
    :return: float non negative value : mutual information
    """
    from sklearn.metrics import normalized_mutual_info_score, mutual_info_score

    if normalized:
        return normalized_mutual_info_score(x, y)

    # Joint histogram of both signals serves as the contingency table.
    joint_histogram = np.histogram2d(x, y, nbins)[0]
    return mutual_info_score(None, None, contingency=joint_histogram)
def evaluate(self, labels):
    """Score every labeling in ``labels`` and return the scores as one tuple.

    When ``self.n_clusters`` is set, each labeling contributes four values
    computed against ``self.y``: AMI, NMI, V-measure and ARI (in that order).
    Otherwise each labeling contributes its silhouette score on ``self.X``.
    """
    # NOTE(review): the branch is chosen by ``n_clusters`` although the
    # supervised scores need ``self.y`` - confirm this is the intended test.
    scores = []
    if self.n_clusters is not None:
        for label in labels:
            ami = metrics.adjusted_mutual_info_score(self.y, label)
            nmi = metrics.normalized_mutual_info_score(self.y, label)
            vmes = metrics.v_measure_score(self.y, label)
            ari = metrics.adjusted_rand_score(self.y, label)
            scores.extend((ami, nmi, vmes, ari))
    else:
        for label in labels:
            scores.append(metrics.silhouette_score(self.X, label))
    return tuple(scores)
def cluster_var(dataframe):
    """Group the feature columns into 10 clusters by their pairwise NMI.

    The columns ``click``, ``id``, ``device_id`` and ``device_ip`` are left
    out. A symmetric NMI matrix with a unit diagonal is built and its rows
    are clustered with K-means.

    Returns
    -------
    pandas.Series
        Cluster id per column name.

    Side effect: sets the pandas display option ``expand_frame_repr`` to
    ``False``.
    """
    pd.set_option('expand_frame_repr', False)
    excluded = ["click", "id", "device_id", "device_ip"]
    names = [x for x in dataframe.columns if x not in excluded]
    m = len(names)

    similarity = np.zeros(shape=(m, m))
    for i in range(m):
        similarity[i, i] = 1
        for j in range(i + 1, m):
            value = metrics.normalized_mutual_info_score(
                dataframe.loc[:, names[i]], dataframe.loc[:, names[j]])
            # NMI is symmetric, so fill both triangles at once.
            similarity[i, j] = value
            similarity[j, i] = value

    MI = pd.DataFrame(similarity, index=names, columns=names)
    assignment = KMeans(n_clusters=10).fit_predict(MI)
    return pd.Series(assignment, index=names)
def myMechanism(n, e):
    """Run the noisy trajectory-count pipeline and score it.

    Steps, in order:

    1. Cluster each of the 20 time steps into ``n`` K-means clusters and
       encode every one of the 958 trajectories as a string of labels.
    2. Count identical strings and pad the dictionary with random label
       strings (count 0) until it holds 958 entries.
    3. Pad the counts with zeros, run them through ``buildHaarTreeList``,
       add the output of ``addNoise(len, e)`` and rebuild the counts with
       ``rebuildHaarTreeList``.
    4. Order the true counts by the noisy ones and fit a non-increasing
       isotonic regression to them.

    Parameters
    ----------
    n : number of K-means clusters per time step
    e : second argument of ``addNoise`` (presumably the privacy budget -
        confirm against that helper)

    Returns
    -------
    tuple
        ``(NMI, hau_dis, mapeyy, maeyy)`` comparing the isotonic fit with the
        true counts.
    """
    a = readData()
    # One cluster label per (trajectory, time step). Assigning ``dtype``
    # reinterprets the float64 zero buffer as int64 in place.
    labels = np.zeros(shape=(958, 20))
    labels.dtype = 'int64'
    # Two centre coordinates per time step, stored side by side.
    cluster_centers = np.zeros(shape=(n, 40))
    for i in range(20):
        # print(i)
        clf = KMeans(n_clusters=n, random_state=9)
        # Columns 2:4 of step i are clustered (presumably the x/y position -
        # TODO confirm against readData).
        y_pred = clf.fit_predict(a[:, i, 2:4])
        labels[:, i] = clf.labels_
        cluster_centers[:, 2 * i:(2 * i + 2)] = clf.cluster_centers_
    # Encode each trajectory as "L<label>" repeated for its 20 steps.
    newpaths = []
    for i in range(958):
        newpath = ""
        for j in range(20):
            string = "L" + str(labels[i, j])
            newpath += string
        newpaths.append(newpath)
    result = Counter(newpaths)
    newpathsdict = dict(result)
    # Randomly generate trajectories to make up the required number
    while (len(newpathsdict) < 958):
        key = ""
        for i in range(20):
            string = "L" + str(random.randint(0, n - 1))
            key += string
        newpathsdict.setdefault(key, 0)
    # Pad to a power of two for the Haar transform and reconstruction
    values = list(newpathsdict.values())
    # NOTE: ``a`` is reused here and no longer refers to the loaded data.
    a = int(math.log(len(values), 2))
    for index in range(int(math.pow(2, a + 1)) - len(values)):
        values.append(0)
    temp = buildHaarTreeList(values)
    # print(len(temp))
    noise = addNoise(len(temp), e)
    c = [temp[i] + noise[i] for i in range(len(temp))]
    noisecounts = rebuildHaarTreeList(c)
    i = 0
    # Keep the true counts before they are overwritten with noisy ones.
    newpathsdict2 = copy.deepcopy(newpathsdict);
    for key, value in newpathsdict.items():
        newpathsdict[key] = noisecounts[i]
        i = i + 1
    valueslist = sorted(list(newpathsdict.values()), reverse=True)
    # presumably the keys ordered by their noisy count - verify against
    # list_sort_by_value
    newpathslist = list_sort_by_value(newpathsdict)
    truecounts = []
    for item in newpathslist:
        truecounts.append(newpathsdict2.get(item))
    # Isotonic regression of truecounts, following the consistency constraint
    x = np.arange(958)
    y = np.array(truecounts)
    y_ = IsotonicRegression(increasing=False).fit_transform(x, y)
    # print(y.shape)
    # The regression output and the counts are passed to the NMI scorer as
    # if they were labels.
    NMI = metrics.normalized_mutual_info_score(y_, y)
    mapeyy = mape(y_, y)
    maeyy = metrics.mean_absolute_error(y, y_)
    # hausdorff_distance
    # Both arrays are reshaped in place to a single row of 958 values.
    y.resize(1,958)
    y_.resize(1,958)
    # print(y.shape)
    hau_dis = hausdorff_distance(y, y_, distance="euclidean")
    return NMI, hau_dis,mapeyy,maeyy
## Expanding the encodings and the cluster centers. encodingsExpand = encodings.unsqueeze(1).expand( encodings.size(0), 10, 32).cuda() clusterCentersExpand = clusterCenters.clone().unsqueeze(0).expand( encodings.size(0), 10, 32).cuda() ## Computing the distances of the encodings from the cluster centers. distMat = torch.pow(encodingsExpand - clusterCentersExpand, 2).sum(2) ## Computing the cluster center label having the minimum distance from the particular image encoding. _, predClus = torch.min(distMat, dim=1) ## Adding the predictions to the global list. yPredAll.extend(list(predClus.data.cpu().numpy())) ## Adding the true labels to the global list. yTrueAll.extend(list(labelBatchV.data.cpu().numpy())) ## Computing the test NMI and Purity. testPurity = purityScore(yPredAll, yTrueAll) testNMI = normalized_mutual_info_score(yPredAll, yTrueAll) print(testPurity) print(testNMI) # ## Writing data. data = [str(int(argList.percentLabData * 100)), str(testPurity), str(testNMI)] with open('./TestingResults/' + str(argList.dataSet) + '/Test.csv', 'a') as f: writer = csv.writer(f) writer.writerow(data)
import numpy as np
from sklearn import metrics

# Load the reference labels and the labels under test (one value per point).
file1 = file_name_ground_truth
labels_true = np.loadtxt(file1)
file2 = file_name_to_check
labels = np.loadtxt(file2)

n = 70000  # number of points

# NMI with arithmetic-mean normalisation; noise points (label -1) take part
# in the score like any other label.
NMI = metrics.normalized_mutual_info_score(
    labels, labels_true, average_method='arithmetic')

# Points with a non-negative label count as clustered.
clustered = labels >= 0
ratio_points_clustered = clustered.sum() / n

# Label -1 marks noise and is not counted as a cluster.
n_noise = list(labels).count(-1)
n_clusters = len(set(labels)) - (1 if n_noise else 0)

print("number of clusters: %d" % n_clusters)
print("number of noise: %d" % n_noise)
print("ratio of points clustered: %0.3f" % ratio_points_clustered)
print("NMI value: %0.3f" % NMI)
def f_mutual_info_score(var1, var2, var1type, var2type):
    """Return the NMI of two variables after passing them through ``bin_variables``."""
    binned = bin_variables(var1, var1type, var2, var2type)
    binned_first, binned_second = binned
    return metrics.normalized_mutual_info_score(binned_first, binned_second)
def norm_mut_info(ground_truth, predicted):
    """Return the normalized mutual information of two labelings."""
    score = metrics.normalized_mutual_info_score(ground_truth, predicted)
    return score
# NOTE(review): fragment of a larger script; names such as gt_labels, ts_len,
# ratio_n_shapelets, data, model_class, nkiter, model_path, dataset and
# oar_job_id are defined outside the visible code.

# One cluster per ground-truth class.
n_clusters = n_classes(gt_labels)
# Map shapelet length -> number of shapelets, for lengths of 15%, 30% and 45%
# of the series length.
shapelet_lengths = {}
for sz in [int(p * ts_len) for p in [.15, .3, .45]]:
    n_shapelets = int(numpy.log10(ts_len - sz) * ratio_n_shapelets)  # 2, 5, 8, 10
    shapelet_lengths[sz] = n_shapelets
print(dataset, shapelet_lengths, model_class.__name__)
m = model_class(shapelet_lengths, d=data.shape[2], print_loss_every=1000, ada_grad=True, niter=1000,
                print_approx_loss=True)
m.fit(data)
# Continue training in nkiter further steps and dump a checkpoint after each
# one; the file name carries the step number and the job id.
for ikiter in range(nkiter):
    m.partial_fit(data, 1000, (ikiter + 1) * 1000)
    model_fname = "%s%s_%dkiter_%s_%s.model" % (model_path, m.__class__.__name__, ikiter + 1, dataset, oar_job_id)
    m.dump_without_dists(model_fname)
# Final model dump.
model_fname = "%s%s_final_%s.model" % (model_path, m.__class__.__name__, dataset)
m.dump_without_dists(model_fname)
print("Saved model %s with approximate loss: %f (beta=%f)" % (model_fname, m._loss(data), m.beta))
# Shapelet-transform every series: one feature per shapelet.
data_shtr = numpy.empty((data.shape[0], sum(shapelet_lengths.values())))
for i in range(data.shape[0]):
    data_shtr[i] = m._shapelet_transform(data[i])
# Cluster the transformed data and compare with the ground truth.
km = KMeans(n_clusters=n_clusters)
pred_labels = km.fit_predict(data_shtr)
print("Model=%s, dataset=%s, NMI(kmeans)=%f" % (m.__class__.__name__, dataset,
                                                 normalized_mutual_info_score(gt_labels, pred_labels)))
def calc_nmi(a, b):
    """Return the NMI of ``a`` and ``b`` after rounding both to two decimals."""
    rounded_a, rounded_b = (np.around(values, decimals=2) for values in (a, b))
    return metrics.normalized_mutual_info_score(rounded_a, rounded_b)
def bench_desom(X_train, y_train, dataset, map_size, encoder_dims, ae_weights=None):
    """Benchmark DESOM on one data set over ``n_runs`` independent runs.

    For every run the model is re-initialised, optionally loaded with
    pretrained autoencoder weights, trained, and scored on the training data
    (purity, NMI, ARI, accuracy, wall-clock duration). When the map's first
    dimension is 8, the prototypes are additionally clustered with K-means
    and the same four scores are computed for that post-clustering.

    Means and standard deviations are written to the module-level ``results``
    frame under the row ``'<dataset> (K=<map cells>)'`` and that row is
    printed. Hyper-parameters (``gamma``, ``optimizer``, ``n_runs``,
    ``iterations`` ...) and the per-run score arrays (``pur``, ``nmi`` ...)
    are read from module scope.

    Parameters
    ----------
    X_train, y_train : training samples and their labels
    dataset : data set name, used in messages, paths and the result row
    map_size : pair giving the SOM dimensions
    encoder_dims : encoder layer sizes handed to ``DESOM``
    ae_weights : optional pretrained autoencoder weights to load per run
    """
    import os

    print('*** {} - desom with {} map and {} autoencoder (gamma={})***'.format(
        dataset, map_size, encoder_dims, gamma))

    desom = DESOM(encoder_dims=encoder_dims, map_size=map_size)
    save_dir = 'results/benchmark/desom-gamma{}_{}_{}_{}x{}'.format(
        gamma, dataset, optimizer, map_size[0], map_size[1])
    # Portable replacement for shelling out to ``mkdir -p``; unlike the
    # subprocess call it raises if the directory cannot be created.
    os.makedirs(save_dir, exist_ok=True)

    post_clustering = map_size[0] == 8

    for run in range(n_runs):
        desom.initialize()
        desom.compile(gamma=gamma, optimizer=optimizer)
        if ae_weights is not None:
            desom.load_ae_weights(ae_weights)
        # Weights initialization by randomly sampling training points
        desom.init_som_weights(X_train)

        t0 = time.time()
        desom.fit(X_train, y_train, None, None, iterations, som_iterations,
                  eval_interval, save_epochs, batch_size, Tmax, Tmin, decay,
                  save_dir)
        dt = time.time() - t0
        print('Run {}/{} (took {:f} seconds)'.format(run + 1, n_runs, dt))

        y_pred = desom.predict(X_train)
        pur[run] = cluster_purity(y_train, y_pred)
        nmi[run] = normalized_mutual_info_score(y_train, y_pred)
        ari[run] = adjusted_rand_score(y_train, y_pred)
        acc[run] = cluster_acc(y_train, y_pred)
        duration[run] = dt

        if post_clustering:
            # Post clustering in latent space
            print('Post-clustering in latent space...')
            prototypes = desom.prototypes
            # NOTE(review): the cluster count is the largest label value;
            # with 0-based labels that is one fewer than the class count -
            # confirm the label encoding.
            km_desom = KMeans(n_clusters=np.max(y_train),
                              n_jobs=-1).fit(prototypes)
            km_desom_pred = km_desom.predict(desom.encode(X_train))
            pur_clust[run] = cluster_purity(y_train, km_desom_pred)
            nmi_clust[run] = normalized_mutual_info_score(
                y_train, km_desom_pred)
            ari_clust[run] = adjusted_rand_score(y_train, km_desom_pred)
            acc_clust[run] = cluster_acc(y_train, km_desom_pred)

    name = '{} (K={})'.format(dataset, map_size[0] * map_size[1])

    # Mean and standard deviation of every per-run score.
    for column, values in (('pur', pur), ('nmi', nmi),
                           ('ari', ari), ('acc', acc)):
        results.at[name, column] = values.mean()
        results.at[name, column + '_std'] = values.std()
    results.at[name, 'duration'] = duration.mean()

    if post_clustering:
        for column, values in (('pur_clust', pur_clust),
                               ('nmi_clust', nmi_clust),
                               ('ari_clust', ari_clust),
                               ('acc_clust', acc_clust)):
            results.at[name, column] = values.mean()
            results.at[name, column + '_std'] = values.std()

    print(results.loc[name])
def fitmodel(X, pheno, estimator, parameters, modelname, method, performance, times):
    """Evaluate ``estimator`` with 5-fold cross-validation.

    For every fold the estimator is fitted, its predictions are binarised at
    0.5, a set of classification, clustering and regression metrics is
    computed, a calibration plot is saved as SVG, the fitted estimator is
    dumped with ``joblib`` and one comma-separated row is written to the
    module-level ``Eval_file``. ``algo`` and ``model`` are also read from
    module scope.

    Parameters
    ----------
    X : feature frame (indexed with ``.iloc``)
    pheno : target values, indexable by the fold index arrays
    estimator : model that is fitted and evaluated in each fold
    parameters : unused; kept for interface compatibility
    modelname : label appended to ``method`` once per fold
    method, performance, times : arrays to which the model name, the balanced
        accuracy and the CPU time of each fold are appended

    Returns
    -------
    tuple
        ``(gs_clf, method, performance, times)``; ``gs_clf`` is the estimator
        as fitted on the last fold.
    """
    kfold = KFold(n_splits=5)
    for train_index, test_index in kfold.split(X, pheno):
        # time how long it takes to train each model type
        start = time.process_time()

        # split data into train/test sets
        X_train = X.iloc[train_index]
        y_train = pheno[train_index]
        X_test = X.iloc[test_index]
        y_test = pheno[test_index]

        # fit the estimator on the training split
        train_time = time.time()
        gs_clf = estimator
        gs_clf.fit(X_train, y_train)
        train_end = time.time() - train_time

        # predict resistance in test set
        predict_time = time.time()
        y_pred = gs_clf.predict(X_test)
        # Binarise; ``>=`` makes sure a value of exactly 0.5 is mapped too.
        y_pred[y_pred < 0.5] = 0
        y_pred[y_pred >= 0.5] = 1
        predict_end = time.time() - predict_time

        eval_time = time.time()
        score = balanced_accuracy_score(y_test, y_pred)
        balanced_accuracy_scoreval = score
        average_precision_scoreval = average_precision_score(y_test, y_pred)
        brier_score_lossval = brier_score_loss(y_test, y_pred)
        f1_scoreval = f1_score(y_test, y_pred)
        precision_scoreval = precision_score(y_test, y_pred)
        recall_scoreval = recall_score(y_test, y_pred)
        jaccard_scoreval = jaccard_score(y_test, y_pred)
        roc_auc_scoreval = roc_auc_score(y_test, y_pred)
        adjusted_mutual_info_scoreval = adjusted_mutual_info_score(y_test, y_pred)
        adjusted_rand_scoreval = adjusted_rand_score(y_test, y_pred)
        completeness_scoreval = completeness_score(y_test, y_pred)
        fowlkes_mallows_scoreval = fowlkes_mallows_score(y_test, y_pred)
        # NOTE(review): this "homogeneity" value is computed with the
        # normalized mutual information scorer - confirm which is intended.
        homogeneity_scoreval = normalized_mutual_info_score(y_test, y_pred)
        v_measure_scoreval = v_measure_score(y_test, y_pred)
        explained_variance_scoreval = explained_variance_score(y_test, y_pred)
        max_errorval = max_error(y_test, y_pred)
        mean_absolute_errorval = mean_absolute_error(y_test, y_pred)
        mean_squared_errorval = mean_squared_error(y_test, y_pred)
        mean_squared_log_errorval = mean_squared_log_error(y_test, y_pred)
        median_absolute_errorval = median_absolute_error(y_test, y_pred)
        r2_scoreval = r2_score(y_test, y_pred)
        eval_end = time.time() - eval_time

        # Classifier used for the calibration plot (module-level ``model``).
        gnb = model
        name = '%s' % algo

        # Plot calibration plots
        plt.figure(figsize=(10, 10))
        ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
        ax2 = plt.subplot2grid((3, 1), (2, 0))
        ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

        gnb.fit(X_train, y_train)
        if hasattr(gnb, "predict_proba"):
            prob_pos = gnb.predict_proba(X_test)[:, 1]
        else:
            # use decision function, rescaled to [0, 1]
            prob_pos = gnb.decision_function(X_test)
            prob_pos = \
                (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        fraction_of_positives, mean_predicted_value = \
            calibration_curve(y_test, prob_pos, n_bins=10)
        ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                 label="%s" % (name,))
        ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
                 histtype="step", lw=2)

        ax1.set_ylabel("Fraction of positives")
        ax1.set_ylim([-0.05, 1.05])
        ax1.legend(loc="lower right")
        ax1.set_title('Calibration plots (%s_%s)' % (algo, score))
        ax2.set_xlabel("Mean predicted value")
        ax2.set_ylabel("Count")
        ax2.legend(loc="upper center", ncol=2)
        plt.tight_layout()
        plt.savefig('%s_%s_Calibration_plots.svg' % (algo, score))

        performance = np.append(performance, score)
        method = np.append(method, modelname)
        times = np.append(times, (time.process_time() - start))

        print('###################################################')
        filename = '%s_Acc: %s.sav' % (algo, score)
        joblib.dump(gs_clf, filename)
        print(filename)

        # One CSV row per fold. Joining the values guarantees a plain comma
        # between every field (the old format string contained stray
        # backticks before the last one).
        row = (
            balanced_accuracy_scoreval, average_precision_scoreval,
            brier_score_lossval, f1_scoreval, precision_scoreval,
            recall_scoreval, jaccard_scoreval, roc_auc_scoreval,
            adjusted_mutual_info_scoreval, adjusted_rand_scoreval,
            completeness_scoreval, fowlkes_mallows_scoreval,
            homogeneity_scoreval, v_measure_scoreval,
            explained_variance_scoreval, max_errorval,
            mean_absolute_errorval, mean_squared_errorval,
            mean_squared_log_errorval, median_absolute_errorval,
            r2_scoreval, algo, train_end, predict_end, eval_end,
        )
        to_write = ",".join("%s" % value for value in row) + "\n"
        Eval_file.write(to_write)

    return gs_clf, method, performance, times
def _plot_kmeans_split(k_means, centers, points, assignments, colormap, step,
                       x_label, y_label, title, out_path):
    """Draw one K-Means scatter plot on the current figure and save it.

    Optionally shades the decision regions of ``k_means`` over the first two
    columns of ``points``, scatters the points coloured by ``assignments``,
    marks every row of ``centers`` with an orange cross, then writes the
    figure to ``out_path``.
    """
    colors = ['yellow', 'cyan']
    if colormap:
        cmap_light = ListedColormap(['#FF3EFA', '#AAFFAA'])
        x_min, x_max = points[:, 0].min() - 1, points[:, 0].max() + 1
        y_min, y_max = points[:, 1].min() - 1, points[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                             np.arange(y_min, y_max, step))
        Z = k_means.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    for i, color in enumerate(colors):
        px = points[:, 0][assignments == i]
        py = points[:, 1][assignments == i]
        plt.scatter(px, py, c=color)
    for row in range(centers.shape[0]):
        plt.scatter(centers[row, 0:1], centers[row, 1:2], s=100,
                    linewidths=4, c='orange', marker='x')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # plt.show()
    plt.savefig(out_path)


def k_means_results(name, A, B, x_label, y_label, colormap):
    """Fit a 2-cluster K-Means model, print metrics and save scatter plots.

    Args:
        name: Title prefix; also used (spaces replaced by underscores) in the
            output file names under ``figures/``.
        A: Pair ``(X, y)`` used for fitting and for the "Train" report.
        B: Pair ``(X_test, y_test)`` used for the "Test" report.
        x_label: Label for the x axis of both plots.
        y_label: Label for the y axis of both plots.
        colormap: When truthy, shade the model's decision regions behind the
            points.

    Only the first two feature columns are plotted. When ``colormap`` is
    truthy the model is asked to predict on a 2-D mesh, so that path
    presumably requires exactly two features -- TODO confirm with callers.

    All output uses single-argument ``print(...)`` calls, which behave the
    same on Python 2 and Python 3.
    """
    X, y = A[0], A[1]
    X_test, y_test = B[0], B[1]
    h = .02
    n_clusters = 2
    base_path = 'figures/' + name.replace(' ', '_')

    k_means = KMeans(n_clusters=n_clusters)
    start = time.time()
    fit_results = k_means.fit(X)
    end = time.time()
    print('Fit Time: ' + str(end - start))

    # ---- training split -------------------------------------------------
    Y_kmeans = k_means.predict(X)
    y_true = y
    y_pred = Y_kmeans
    print('Train Accuracy Score Default')
    print(metrics.accuracy_score(y_true, y_pred))
    # Cluster ids are arbitrary, so also score the relabelled assignment.
    # A list (not a lazy ``map`` object) keeps this valid on Python 3.
    y_pred = [flip(label) for label in Y_kmeans]
    print('Train Accuracy Score Flip Labels')
    print(metrics.accuracy_score(y_true, y_pred))
    print('Classification Report')
    print(metrics.classification_report(y_true, y_pred))
    print('Confusion Matrix')
    print(metrics.confusion_matrix(y_true, y_pred))
    print('Completeness Score')
    print(metrics.completeness_score(y_true, y_pred))
    print('Homogeneity Score')
    print(metrics.homogeneity_score(y_true, y_pred))
    print('Homogeneity Completeness V Measured')
    print(metrics.homogeneity_completeness_v_measure(y_true, y_pred))
    print('Mutual Information Score')
    print(metrics.mutual_info_score(y_true, y_pred))
    print('Normalized Mutual Info Score')
    print(metrics.normalized_mutual_info_score(y_true, y_pred))
    print('Silhouette Score')
    print(metrics.silhouette_score(X, fit_results.labels_))
    print('Silhouette Samples')
    print(metrics.silhouette_samples(X, fit_results.labels_))
    print('V Measure Score')
    print(metrics.v_measure_score(y_true, y_pred))

    figure_identifier = plt.figure()
    _plot_kmeans_split(k_means, fit_results.cluster_centers_, X, Y_kmeans,
                       colormap, h, x_label, y_label,
                       name + ' Train Results',
                       base_path + '_Training_results.png')
    figure_identifier.clf()
    plt.close(figure_identifier)
    print_confusion_matrix('Train', Y_kmeans, y)

    # ---- test split -----------------------------------------------------
    figure_identifier = plt.figure()
    Y_kmeans = k_means.predict(X_test)
    y_true = y_test
    y_pred = Y_kmeans
    print('Test Accuracy Score Default')
    print(metrics.accuracy_score(y_true, y_pred))
    y_pred = [flip(label) for label in Y_kmeans]
    print('Test Accuracy Score Flip Labels')
    print(metrics.accuracy_score(y_true, y_pred))

    _plot_kmeans_split(k_means, fit_results.cluster_centers_, X_test,
                       Y_kmeans, colormap, h, x_label, y_label,
                       name + ' Test Results',
                       base_path + '_Test_results.png')
    # The test figure is still open while the confusion matrix is reported,
    # matching the original statement order.
    print_confusion_matrix('Test', Y_kmeans, y_test)
    figure_identifier.clf()
    plt.close(figure_identifier)
sess.run(init_op) # Training cycle # Fit training with Backpropagation using batch data. for epoch in range(n_layers): miniData, _ = trainSet_cosine.next_batch(n_batch) _, new_cost = sess.run([train_optimizer, cost], feed_dict={ x: miniData, mode_train: True }) #------------------------- End of the Optimization ------------------------------ # Save the results after per 10 epochs. # Getting embedded codes and running K-Means. ae_codes_cos = sess.run(code, feed_dict={x: data_cos, mode_train: False}) idx_cos = k_means_(ae_codes_cos, n_clusters) ae_nmi_cos = normalized_mutual_info_score(labels_cos, idx_cos) ae_nmi_cos = ae_nmi_cos * 100 results_cos.append(ae_nmi_cos) steps_cos.append(epoch) loss_cost_cos.append(new_cost) print( "NMI Score for AE is: {:0.2f} and new cost is: {:0.2f} in {:d} step. ". format(ae_nmi_cos, new_cost, epoch)) warnings.filterwarnings('ignore') plt.figure(figsize=(12, 3.5)) plt.subplot(1, 2, 1) plt.ylim(-0.5, 100) plt.plot(steps_cos, loss_cost_cos, label='Cost Trianing for Cosine Distance ', marker='o')
def btnConvert_click(self):
    """Run agglomerative clustering with the dialog's settings and save it.

    Reads the options from the module-level ``ui`` object, loads the input
    ``.mat`` file, optionally removes the filtered class ids and scales the
    data, fits ``AgglomerativeClustering`` and writes the predictions plus
    the selected scores to the output ``.mat`` file.

    Returns:
        ``False`` when a validation dialog was shown, ``None`` otherwise
        (including the failures that are only printed and the success path).
    """
    msgBox = QMessageBox()

    def show_message(text, icon=QMessageBox.Critical):
        # Every dialog raised by this handler is a modal box with one OK button.
        msgBox.setText(text)
        msgBox.setIcon(icon)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()

    # Linkage
    Linkage = ui.cbLinkage.currentData()
    # Affinity
    Affinity = ui.cbAffinity.currentData()
    # Tree
    Tree = ui.cbTree.currentData()

    # NCluster
    try:
        NCluster = np.int32(ui.txtNCluster.text())
    except Exception:
        show_message("Number of Cluster is wrong!")
        return False

    # Filter: a free-form list of class ids such as "[1, 2]" or "1 2".
    try:
        Filter = ui.txtFilter.text()
        if not len(Filter):
            Filter = None
        else:
            Filter = Filter.replace("\'", " ").replace(",", " ").replace(
                "[", "").replace("]", "").split()
            Filter = np.int32(Filter)
    except Exception:
        print("Filter is wrong!")
        return

    # OutFile
    OutFile = ui.txtOutFile.text()
    if not len(OutFile):
        show_message("Please enter out file!")
        return False

    OutData = dict()
    OutData["ModelAnalysis"] = "Agglomerative"

    # OutModel
    OutModel = ui.txtOutModel.text()
    if not len(OutModel):
        OutModel = None

    # InFile
    InFile = ui.txtInFile.text()
    if not len(InFile):
        show_message("Please enter input file!")
        return False
    if not os.path.isfile(InFile):
        show_message("Input file not found!")
        return False
    InData = io.loadmat(InFile)

    # Data
    if not len(ui.txtData.currentText()):
        show_message("Please enter Input Train Data variable name!")
        return False

    # Label
    if not len(ui.txtLabel.currentText()):
        show_message("Please enter Train Input Label variable name!")
        return False

    try:
        # The variable lookups are guarded too, so a name that is missing
        # from the .mat file is reported instead of raising out of the slot.
        X = InData[ui.txtData.currentText()]
        L = InData[ui.txtLabel.currentText()][0]
        if Filter is not None:
            for fil in Filter:
                # Remove Training Set
                labelIndx = np.where(L == fil)[0]
                L = np.delete(L, labelIndx, axis=0)
                X = np.delete(X, labelIndx, axis=0)
                print("Class ID = " + str(fil) + " is removed from data.")
        if ui.cbScale.isChecked():
            X = preprocessing.scale(X)
            print("Whole of data is scaled to N(0,1).")
    except Exception:
        print("Cannot load data or label")
        return

    try:
        cls = AgglomerativeClustering(n_clusters=NCluster,
                                      affinity=Affinity,
                                      compute_full_tree=Tree,
                                      linkage=Linkage)
        print("Run Clustering ...")
        PeL = cls.fit_predict(X)
    except Exception as e:
        print(e)
        show_message(str(e))
        return

    OutData["predict"] = PeL

    if OutModel is not None:
        joblib.dump(cls, OutModel)
        print("Model is saved: " + OutModel)

    if ui.cbAverage.isChecked():
        acc = accuracy_score(L, PeL)
        OutData["Accuracy"] = acc
        print("Average                            {:5.2f}".format(acc * 100))

    if ui.cbNMI.isChecked():
        NMI = normalized_mutual_info_score(L, PeL)
        OutData["NMI"] = NMI
        print("Normalized Mutual Information (NMI) {:7.6f}".format(NMI))

    if ui.cbRIA.isChecked():
        RIA = adjusted_rand_score(L, PeL)
        OutData["RIA"] = RIA
        print("Rand Index Adjusted (RIA)          {:7.6f}".format(RIA))

    if ui.cbAMI.isChecked():
        AMI = adjusted_mutual_info_score(L, PeL)
        OutData["AMI"] = AMI
        print("Adjusted Mutual Information (AMI)  {:7.6f}".format(AMI))

    print("Saving ...")
    io.savemat(OutFile, mdict=OutData)
    print("DONE.")
    show_message("Agglomerative Clustering is done.", QMessageBox.Information)
def main(argv):
    """Train Deep Graph Infomax on synthetic data and score its embeddings.

    Builds overlapping-Gaussian data and a k-NN graph, trains the DGI model
    for ``FLAGS.n_epochs`` epochs against a progressively less shuffled
    corrupted copy of the features, then fits a logistic regression on the
    learned representations of the training nodes and prints NMI and
    accuracy for the remaining nodes.

    Args:
        argv: Command-line arguments; anything beyond the program name is
            rejected.

    Raises:
        app.UsageError: If extra positional arguments are supplied.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    print('Bröther may i have some self-lööps')
    n_nodes = FLAGS.n_nodes
    n_clusters = FLAGS.n_clusters
    train_size = FLAGS.train_size
    data_clean, data_dirty, labels = overlapping_gaussians(n_nodes, n_clusters)
    graph_clean = construct_knn_graph(data_clean).todense().A1.reshape(
        n_nodes, n_nodes)

    # ``np.bool`` was a deprecated alias that newer NumPy no longer provides;
    # the builtin ``bool`` yields the same dtype.
    train_mask = np.zeros(n_nodes, dtype=bool)
    train_mask[np.random.choice(np.arange(n_nodes),
                                int(n_nodes * train_size),
                                replace=False)] = True
    test_mask = ~train_mask
    print(f'Data shape: {data_clean.shape}, graph shape: {graph_clean.shape}')
    print(f'Train size: {train_mask.sum()}, test size: {test_mask.sum()}')

    input_features = tf.keras.layers.Input(shape=(2, ))
    input_features_corrupted = tf.keras.layers.Input(shape=(2, ))
    input_graph = tf.keras.layers.Input((n_nodes, ))

    encoder = [GCN(64), GCN(32)]
    model = deep_graph_infomax(
        [input_features, input_features_corrupted, input_graph], encoder)

    def loss(model, x, y, training):
        # The model returns (representations, discriminator output); only
        # the second element is scored against the real/corrupted targets.
        _, y_ = model(x, training=training)
        return loss_object(y_true=y, y_pred=y_)

    def grad(model, inputs, targets):
        with tf.GradientTape() as tape:
            loss_value = loss(model, inputs, targets, training=True)
            # Layer-registered losses are added while the tape is recording
            # so that they contribute to the gradients.
            for loss_internal in model.losses:
                loss_value += loss_internal
        return loss_value, tape.gradient(loss_value, model.trainable_variables)

    labels_dgi = tf.concat([tf.zeros([n_nodes, 1]), tf.ones([n_nodes, 1])], 0)
    loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

    # Fraction of rows to shuffle per epoch, annealed from 100% down to 5%.
    shuffle_schedule = np.linspace(1, 0.05, FLAGS.n_epochs)
    # Bound up front so the final forward pass still works with zero epochs.
    data_corrupted = data_dirty.copy()
    for epoch in range(FLAGS.n_epochs):
        data_corrupted = data_dirty.copy()
        perc_shuffle = shuffle_schedule[epoch]
        # perc_shuffle = 1
        rows_shuffle = np.random.choice(np.arange(n_nodes),
                                        int(n_nodes * perc_shuffle))
        # Fancy indexing copies, so shuffle the copy and write it back.
        data_corrupted_tmp = data_corrupted[rows_shuffle]
        np.random.shuffle(data_corrupted_tmp)
        data_corrupted[rows_shuffle] = data_corrupted_tmp
        loss_value, grads = grad(model,
                                 [data_dirty, data_corrupted, graph_clean],
                                 labels_dgi)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print('epoch %d, loss: %0.4f, shuffle %0.2f%%' %
              (epoch, loss_value.numpy(), 100 * perc_shuffle))

    representations, _ = model([data_dirty, data_corrupted, graph_clean],
                               training=False)
    representations = representations.numpy()
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(representations[train_mask], labels[train_mask])
    clusters = clf.predict(representations[test_mask])
    print(
        'NMI:',
        normalized_mutual_info_score(labels[test_mask],
                                     clusters,
                                     average_method='arithmetic'))
    print('Accuracy:', 100 * accuracy_score(labels[test_mask], clusters))
def nmi_score(labels_true, labels_pred):
    """Return the normalized mutual information of two label assignments.

    Delegates to ``metrics.normalized_mutual_info_score`` with its default
    options.
    """
    score = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    return score
exch_mat=exch_mat, w_mat=w_mat, n_group=n_group, alpha=alpha, beta=beta, kappa=kappa, init_labels=known_labels) # Compute the groups algo_group_vec = np.argmax(result_matrix, 1) + 1 # Restrained results rstr_real_group_vec = np.delete(real_group_vec, indices_for_known_label) rstr_algo_group_vec = np.delete(algo_group_vec, indices_for_known_label) # Compute nmi score nmi_vec.append( normalized_mutual_info_score(rstr_real_group_vec, rstr_algo_group_vec)) # Writing results nmi_mean = np.mean(nmi_vec) nmi_std = np.std(nmi_vec) with open(results_file_name, "a") as output_file: output_file.write( f"{input_file},{sim_tag},{dist_option},{exch_mat_opt},{exch_range},{alpha},{beta},{kappa}," f"{known_label_ratio},{n_test},{nmi_mean},{nmi_std * 1.96 / np.sqrt(n_test)}\n" )
f.write('True Labels = ') f.write(str(labels)) f.write('Clusters Obtained = ') f.write(str(clusters.tolist())) if len(clusters) == len(labels): f.write('Clusters Obtained = ' + str(np.asarray(labels))) f.write("\nResults\n") rand_index = metrics.adjusted_rand_score(labels, clusters) rand_indexes.append(rand_index) print 'rand_index = ' + str(rand_index) f.write("Rand Index = " + str(rand_index) + "\n") NMI_index = metrics.normalized_mutual_info_score(labels, clusters) nmi_indexes.append(NMI_index) print 'NMI_index = ' + str(NMI_index) f.write("NMI Index = " + str(NMI_index) + "\n") if rep > 1: f.write("\nFINAL RESULTS\n") f.write("Avg Rand Index = " + str(float(sum(rand_indexes)) / rep) + "\n") f.write("Std Rand Index = " + str(statistics.stdev(rand_indexes)) + "\n") f.write("Avg NMI Index = " + str(float(sum(nmi_indexes)) / rep) + "\n") f.write("Std NMI Index = " + str(statistics.stdev(nmi_indexes)) + "\n") f.close()
def mutualInformationOfSymbols(s1, s2, lenP):
    """Return the normalized mutual information between ``s1`` and ``s2``.

    ``lenP`` is accepted but not used by this function.
    """
    score = normalized_mutual_info_score(s1, s2)
    return score
# Load the digits data set and standardise its features.
digits = load_digits()
data = scale(digits.data)
labels_true = digits.target

n_samples, n_features = data.shape
n_digits = len(np.unique(labels_true))
sample_size = 300

# Cluster with MeanShift, using a bandwidth estimated from the data itself.
bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(data)

labels_pred = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels_pred)
n_clusters_ = len(labels_unique)

# Report the cluster count (printed twice, in two wordings) and how well the
# clustering agrees with the true digit labels.
print("number of estimated clusters : %d" % n_clusters_)
print('Estimated number of clusters: %d' % n_clusters_)

homogeneity = metrics.homogeneity_score(labels_true, labels_pred)
print("Homogeneity: %0.3f" % homogeneity)

completeness = metrics.completeness_score(labels_true, labels_pred)
print("Completeness: %0.3f" % completeness)

nmi = metrics.normalized_mutual_info_score(labels_true,
                                           labels_pred,
                                           average_method='arithmetic')
print("NMI: %0.3f" % nmi)
X = data n_digits = len(np.unique(digits.target)) labels_true = digits.target # ############################################################################# agg = AgglomerativeClustering(n_clusters=6, linkage="average") agg.fit(X) labels = agg.labels_ labels_unique = np.unique(labels) n_clusters_ = len(labels_unique) print("number of estimated clusters : %d" % n_clusters_) print("Normalized: %0.3f" % metrics.normalized_mutual_info_score(labels_true, labels)) print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels)) print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels)) # # ############################################################################# # Plot result import matplotlib.pyplot as plt from itertools import cycle plt.figure(1) plt.clf() colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk') for k, col in zip(range(n_clusters_), colors): my_members = labels == k X = PCA(n_components=2).fit_transform(X)
from sklearn import metrics
from cluster_acc import acc

# Agreement between the reference labels and the assignments in ``my_dp.z``.
predicted = my_dp.z
nmi = metrics.normalized_mutual_info_score(train_label, predicted)
ari = metrics.adjusted_rand_score(train_label, predicted)

label_uni = len(np.unique(my_dp.z))
label_true_uni = len(np.unique(train_label))

# Clustering accuracy is only computed when the model found no more clusters
# than there are true classes; otherwise it is reported as 0.
acc_value = acc(my_dp.z, train_label) if label_uni < label_true_uni + 1 else 0
# NOTE(review): unpickling can execute arbitrary code; only load data files
# from a trusted source.
try:
    train_set, valid_set, test_set = cPickle.load(f)
finally:
    # Close the input handle even when unpickling fails.
    f.close()

# perform SC on the test set
data_x, data_y = test_set
k = 12
nClass = 500
A = kneighbors_graph(data_x, k)
V = spectral_embedding(A, n_components=10, drop_first=False)
# Shift the embedding by |min(V)| so that no entry is negative.
V = V + numpy.absolute(numpy.min(V))
#V = V/numpy.amax(V)
#
km_model = KMeans(n_clusters=nClass)
ypred = km_model.fit_predict(V)
nmi = metrics.normalized_mutual_info_score(data_y, ypred)
print('The NMI is: %.4f' % nmi)
#
V = numpy.float32(V)
# Store the embedding in the first slot of a 3-item list; the other two
# slots are zero placeholders. ``with`` closes the archive even if dumping
# raises.
with gzip.open('EVD-test500.pkl.gz', 'wb') as f:
    cPickle.dump([(V, data_y), 0, 0], f, protocol=2)
#sio.savemat('V_train_10.mat', {'train_x': V, 'train_y': data_y})
#sc = SpectralClustering(n_clusters = nClass, affinity = 'nearest_neighbors', n_neighbors = k)
#sc = SpectralClustering(n_clusters = 10, affinity = 'rbf',gamma = 1, n_neighbors = 10)
#data_x = data_x[0:1000]
#data_y = data_y[0:1000]
def NMI(labels_real, labels_pred):
    """Return the normalized mutual information of two label assignments."""
    return metrics.normalized_mutual_info_score(labels_real, labels_pred)
del sample for i in range(epoch): print '\rprocess %d out of %d' % (i, epoch), #hdp.process_groups(groups) hdp.process_groups(random.sample(groups, batchgroup)) print '\nfinished' labels_true = [] labels_pred = [] for l, group in enumerate(groups): X = group.data.X Y = hdp.predict(X, group).tolist() labels_true.extend([l] * len(X)) labels_pred.extend(Y) return labels_true, labels_pred if __name__ == '__main__': fname = sys.argv[1] labels_true, labels_pred = hdpcluster(fname) print 'CN:\t', len(set(labels_pred)) print 'ARI:\t', metrics.adjusted_rand_score(labels_true, labels_pred) #print 'MI:\t', metrics.mutual_info_score(labels_true, labels_pred) print 'AMI:\t', metrics.adjusted_mutual_info_score(labels_true, labels_pred) print 'NMI:\t', metrics.normalized_mutual_info_score( labels_true, labels_pred) #print 'HOM:\t', metrics.homogeneity_score(labels_true, labels_pred) #print 'COM:\t', metrics.completeness_score(labels_true, labels_pred) print 'VME:\t', metrics.v_measure_score(labels_true, labels_pred)
def K_Means(n_clusters=110):  #1
    """Cluster the module-level ``text`` data with K-Means and print the NMI.

    The predicted cluster ids are stored in the global ``cluster`` and then
    scored against the global ``labels``.

    Args:
        n_clusters: Number of clusters to fit. Defaults to 110, the value
            that was previously hard-coded.
    """
    global cluster
    global labels
    cluster = KMeans(n_clusters=n_clusters).fit_predict(text)
    print("K-Means NMI:%s" %
          (metrics.normalized_mutual_info_score(labels, cluster)))
p0 = int(pairElement[0].replace('(', '')) pic_new.putpixel((p1, p0), int(256 / (int(label) + 1))) predict_matrix[p0, p1] = label index += 1 pic_new.save( "../aftersegmentationimagenew/3000323mock-1000-10k5number5.jpg", "JPEG") predictList = [] for i in range(lab_arr.shape[0]): for j in range(lab_arr.shape[1]): predictList.append(predict_matrix[i][j]) regionslabel = loadlabel("../imagelabel/3000323.regions.txt") layerslabel = loadlabel("../imagelabel/3000323.layers.txt") surfaceslabel = loadlabel("../imagelabel/3000323.surfaces.txt") mockdata = getsuperpixelData("../mockimagedata/3000323mocksuperPixel.txt") pbmlabel = [] pbmlabel.append(mocklabel) result, pbmvalue = computePBM(mockdata, pbmlabel) regionslabel = normalized_mutual_info_score(regionslabel, predictList) layerslabel = normalized_mutual_info_score(layerslabel, predictList) surfaceslabel = normalized_mutual_info_score(surfaceslabel, predictList) print "pbm值为为%s" % pbmvalue print "regionslabel为%s" % regionslabel print "layerslabel为%s" % layerslabel print "surfaceslabel为%s" % surfaceslabel
# validation if batch_id % val_freq == 0: model.eval() with torch.no_grad(): nmi = 0 nmi_known = 0 for i, (_, X, y, k) in enumerate(val_loader, 1): X = X.to(device) Ap = torch.sigmoid(model(X)) kp = (lib.eigengaps(Ap).argmax(-1) + 1).cpu() Ap = Ap.cpu() plabels = torch.tensor([SpectralClustering(int(kp[j]), affinity='precomputed').fit(Ap[j]).labels_ for j in range(batch_size)]) plabels_known = torch.tensor([SpectralClustering(int(k[j]), affinity='precomputed').fit(Ap[j]).labels_ for j in range(batch_size)]) nmi = nmi + (1 / i) * (np.mean([normalized_mutual_info_score(y[j], plabels[j]) for j in range(batch_size)]) - nmi) nmi_known = nmi_known + (1 / i) * (np.mean([normalized_mutual_info_score(y[j], plabels_known[j]) for j in range(batch_size)]) - nmi_known) writer.add_scalar('Training/ValidationNMI', nmi, val_step) writer.add_scalar('Training/ValidationNMI_Known', nmi_known, val_step) val_step += 1 model.train() # checkpoint if batch_id % checkpoint_freq == 0: print('CHECKPOINT') last_checkpoint = os.path.join(CHECKPOINTDIR, '%s_%d_%09d.ckpt' % (name, epoch, batch_id)) torch.save(model.state_dict(), last_checkpoint) print('DONE') # === evaluation ===
i = 44 filename = 'data-' + str(i) + '.pkl.gz' K = (i + 1) * 4 # path = '/home/bo/Data/RCV1/Processed/' path = 'data/RCV1/Processed/' dataset = path + filename #np.random.seed(seed = 1) ## perform KM train_x, train_y = load_data(dataset) km_model = KMeans(n_clusters=K, n_init=1) results_KM = np.zeros((trials, 3)) for i in range(trials): ypred = km_model.fit_predict(train_x) nmi = metrics.normalized_mutual_info_score(train_y, ypred) ari = metrics.adjusted_rand_score(train_y, ypred) ac = acc(ypred, train_y) results_KM[i] = np.array([nmi, ari, ac]) KM_mean = np.mean(results_KM, axis=0) KM_std = np.std(results_KM, axis=0) # perform DCN # for RCV1 config_1 = { 'Init': '', 'lbd': 0.1, 'beta': 1, 'output_dir': 'RCV_results',
def NMI(true_labels, predict_labels):
    """Return the normalized mutual information of two label assignments."""
    score = normalized_mutual_info_score(true_labels, predict_labels)
    return score
def mutual_info(predict, truth):
    """Return the normalized mutual information of ``truth`` and ``predict``.

    The parameters are taken as (predict, truth) but handed to the metric
    in the opposite order, (truth, predict).
    """
    score = metrics.normalized_mutual_info_score(truth, predict)
    return score
def DisKmeans(db, update_interval=None):
    """Alternate between clustering network features and retraining the net.

    Each pass extracts the ``output`` features of the current caffe model,
    assigns every sample to a cluster with a ``TMM`` model, scores the
    assignment, writes per-sample target weights to the ``train_weight``
    database and runs ``caffe train`` for ``update_interval`` iterations.
    The loop ends once fewer than 0.1% of the samples changed cluster
    between two consecutive passes.

    Args:
        db: Data set name. The branches below handle 'mnist', 'stl',
            'reuters', 'reutersidf', 'reuters10k' and 'reutersidf10k'; any
            other value leaves ``N``/``Y`` unbound and fails on first use.
        update_interval: Solver iterations per pass. When falsy, one sweep
            over the data (``total_iters``) is used.

    Returns:
        Tuple ``(acc, nmi)`` from the last pass.

    Side effects: writes 'solver.prototxt' and 'init.caffemodel', calls
    ``write_net``/``write_db`` and launches ``caffe`` through ``os.system``.
    """
    from sklearn.cluster import KMeans
    from sklearn.mixture import GMM
    from sklearn.lda import LDA
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import normalized_mutual_info_score
    from scipy.spatial.distance import cdist
    import cPickle
    from scipy.io import loadmat

    # Per-data-set setup: class count, batch sizes and the label vector Y.
    if db == 'mnist':
        N_class = 10
        batch_size = 100
        train_batch_size = 256
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]
        img = np.clip((X / 0.02), 0, 255).astype(np.uint8).reshape(
            (N, 28, 28, 1))
    elif db == 'stl':
        N_class = 10
        batch_size = 100
        train_batch_size = 256
        img = read_db('stl_img', False)[0]
        img = img.reshape((img.shape[0], 96, 96, 3))
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]
    elif db == 'reuters':
        # Only the labels are loaded for this data set; X is not defined.
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        Y = np.fromfile('reuters.npy', dtype=np.int64)
        N = Y.shape[0]
    elif db == 'reutersidf':
        # Only the labels are loaded for this data set; X is not defined.
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        Y = np.load('reutersidf.npy')
        N = Y.shape[0]
    elif db == 'reuters10k' or db == 'reutersidf10k':
        N_class = 4
        batch_size = 100
        train_batch_size = 256
        X, Y = read_db(db + '_total', True)
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(np.squeeze(Y), dtype=np.int32)
        N = X.shape[0]

    tmm_alpha = 1.0  # not referenced again in this function
    # Number of mini-batches needed to cover all N samples (ceil division
    # under Python 2 integer semantics).
    total_iters = (N - 1) / train_batch_size + 1
    if not update_interval:
        update_interval = total_iters
    Y_pred = np.zeros((Y.shape[0]))
    iters = 0
    seek = 0
    dim = 10

    acc_list = []

    while True:
        write_net(db, dim, N_class, "'{:08}'".format(0))
        if iters == 0:
            # First pass: start from the pretrained snapshot, fit the mixture
            # model on its features and copy the fitted parameters into the
            # net's 'loss' layer.
            write_db(np.zeros((N, N_class)), np.zeros((N, )), 'train_weight')
            ret, net = extract_feature(
                'net.prototxt', 'exp/' + db + '/save_iter_100000.caffemodel',
                ['output'], N, True, 0)
            feature = ret[0].squeeze()

            gmm_model = TMM(N_class)
            gmm_model.fit(feature)
            net.params['loss'][0].data[0, 0, :, :] = gmm_model.cluster_centers_.T
            net.params['loss'][1].data[0, 0, :, :] = 1.0 / gmm_model.covars_.T
        else:
            # Later passes: reload the fine-tuned weights and take the
            # cluster centres back from the net's 'loss' layer.
            ret, net = extract_feature('net.prototxt', 'init.caffemodel',
                                       ['output'], N, True, 0)
            feature = ret[0].squeeze()

            gmm_model.cluster_centers_ = net.params['loss'][0].data[0, 0, :, :].T

        Y_pred_last = Y_pred
        Y_pred = gmm_model.predict(feature).squeeze()
        acc, freq = cluster_acc(Y_pred, Y)
        acc_list.append(acc)
        nmi = normalized_mutual_info_score(Y, Y_pred)
        print freq
        print freq.sum(axis=1)
        print 'acc: ', acc, 'nmi: ', nmi
        # Fraction of samples whose assignment changed since the last pass.
        print(Y_pred != Y_pred_last).sum() * 1.0 / N
        # Converged: fewer than 0.1% of the samples changed cluster.
        if (Y_pred != Y_pred_last).sum() < 0.001 * N:
            print acc_list
            return acc, nmi
        time.sleep(1)

        write_net(db, dim, N_class, "'{:08}'".format(seek))
        # Build the per-sample training targets: row-normalise the model's
        # output, square it with a per-cluster bias derived from the column
        # sums, then row-normalise again.
        weight = gmm_model.transform(feature)

        weight = (weight.T / weight.sum(axis=1)).T
        bias = (1.0 / weight.sum(axis=0))
        bias = N_class * bias / bias.sum()
        weight = (weight**2) * bias
        weight = (weight.T / weight.sum(axis=1)).T
        print weight[:10, :]
        write_db(weight, np.zeros((weight.shape[0], )), 'train_weight')

        net.save('init.caffemodel')
        del net

        # Solver configuration for the next fine-tuning run; max_iter is the
        # update interval.
        with open('solver.prototxt', 'w') as fsolver:
            fsolver.write("""net: "net.prototxt" base_lr: 0.01 lr_policy: "step" gamma: 0.1 stepsize: 100000 display: 10 max_iter: %d momentum: 0.9 weight_decay: 0.0000 snapshot: 100 snapshot_prefix: "exp/test/save" snapshot_after_train:true solver_mode: GPU debug_info: false sample_print: false device_id: 0""" % update_interval)
        os.system(
            'caffe train --solver=solver.prototxt --weights=init.caffemodel')
        # The snapshot written after training becomes the next starting point.
        shutil.copyfile('exp/test/save_iter_%d.caffemodel' % update_interval,
                        'init.caffemodel')

        iters += 1
        # Advance the read offset by the samples consumed this pass, wrapping
        # around at N.
        seek = (seek + train_batch_size * update_interval) % N