def adjusted_rand_index():
    """Print ARI/AMI scores between consecutive histogram set partitions.

    Two streams are processed in turn: text histograms built from a fixed
    list of system log files, then dictionary histograms read from
    ``Streaming_SetPartitionAnalytics.txt``.  Every histogram is turned into
    a cluster label sequence with ``tocluster`` and neighbouring sequences
    are compared after truncation to a common length.  Nothing is returned;
    all results are printed.
    """
    # The text file is updated by a stream of data.  Alternative sources:
    # inputf=Streaming_AbstractGenerator.StreamAbsGen("USBWWAN_stream","USBWWAN")
    # inputf=Streaming_AbstractGenerator.StreamAbsGen("file","StreamingData.txt")
    # inputf=Streaming_AbstractGenerator.StreamAbsGen("Spark_Parquet","Spark_Streaming")
    # inputf=Streaming_AbstractGenerator.StreamAbsGen("AsFer_Encoded_Strings","NeuronRain")
    # inputf=Streaming_AbstractGenerator.StreamAbsGen("Socket_Streaming","localhost")
    inputf1 = Streaming_AbstractGenerator.StreamAbsGen(
        "TextHistogramPartition",
        ["/var/log/kern.log", "/var/log/syslog", "/var/log/ufw.log",
         "/var/log/dmesg", "/var/log/kern.log"])
    histograms = [p for p in inputf1]
    # The head-to-head comparison needs two partitions; skip it instead of
    # raising IndexError when the stream yields fewer.
    if len(histograms) >= 2:
        ari = adjusted_rand_score(tocluster(histograms[0], "Text")[:20000],
                                  tocluster(histograms[1], "Text")[:20000])
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Adjusted Rand Index of first two histogram set "
              "partitions(truncated): {}".format(ari))
    _print_consecutive_partition_scores(
        histograms, "Text",
        "Adjusted Rand Index(truncated):",
        "Adjusted Mutual Info Index(truncated):")
    #################################################################
    inputf2 = Streaming_AbstractGenerator.StreamAbsGen(
        "DictionaryHistogramPartition", "Streaming_SetPartitionAnalytics.txt")
    histograms = [p for p in inputf2]
    print("histograms: {}".format(histograms))
    _print_consecutive_partition_scores(
        histograms, "Dict",
        "Adjusted Rand Index (truncated):",
        "Adjusted Mutual Info Index (truncated):")


def _print_consecutive_partition_scores(histograms, kind, ari_label, ami_label):
    """Print ARI then AMI for each pair of neighbouring histograms."""
    for previous, current in zip(histograms, histograms[1:]):
        # Both label sequences are cut to 90% of the shorter histogram.
        truncatedlen = int(min(len(previous), len(current)) * 0.9)
        # Cluster each histogram once and reuse the result for both metrics.
        previous_labels = tocluster(previous, kind)[:truncatedlen]
        current_labels = tocluster(current, kind)[:truncatedlen]
        ari = adjusted_rand_score(previous_labels, current_labels)
        print("{} {}".format(ari_label, ari))
        ami = adjusted_mutual_info_score(previous_labels, current_labels)
        print("{} {}".format(ami_label, ami))
def static_test():
    """Benchmark Tri and Autoclust on the bundled CSV datasets.

    For every dataset the first two columns are used as points and the last
    column as reference labels.  Each algorithm is timed, scored with the
    adjusted Rand index, and one result dict per dataset is appended to the
    file ``res``.
    """
    def _timed_build(cluster_cls, points):
        # Construct the clustering and measure how long construction took.
        began = timer()
        model = cluster_cls(points)
        return model, timer() - began

    for name in ('aggregation', 'compound', 'moons', 'circles'):
        table = np.genfromtxt('data/' + name + '.csv', delimiter=',')
        points = table[:, :2]
        truth = list(table[:, -1])
        n_points = len(truth)

        # tri
        tri_model, tri_elapsed = _timed_build(Tri, points)
        tri_assigned = labelset_to_labels(tri_model.labels, n_points)
        tri_ari = adjusted_rand_score(truth, tri_assigned)

        # auto
        auto_model, auto_elapsed = _timed_build(Autoclust, points)
        auto_assigned = labelset_to_labels(auto_model.labels, n_points)
        auto_ari = adjusted_rand_score(truth, auto_assigned)

        record = {
            'labels': truth,
            'tri_label': tri_assigned,
            'tri_score': tri_ari,
            'tri_time': tri_elapsed,
            'auto_labels': auto_assigned,
            'auto_score': auto_ari,
            'auto_time': auto_elapsed,
            'name': name,
        }
        with open('res', 'a') as sink:
            print(record, file=sink)
def train(times, X, y, c, lea, ep1, ep2, lamda1, lamda2):
    """Run ``sof`` clustering ``times`` times and print score statistics.

    After every run the predicted cluster ids are matched to the reference
    labels with the Hungarian algorithm on the confusion matrix, and ARI,
    Rand index and accuracy are recorded.  Max/mean/std of each metric are
    printed at the end.  Nothing is returned.

    NOTE(review): ``c`` is accepted but ``sof`` is always called with
    ``c=1`` -- confirm whether that is intentional.
    """
    t0 = time.time()
    # Single-argument print() works on both Python 2 and Python 3.
    print('learn={}, ep1={}, ep2={}, la1={}, la2={}'.format(
        lea, ep1, ep2, lamda1, lamda2))
    ari, ri, accu = [], [], []
    for _ in range(times):
        y_pred_old = sof(X, y, k=len(np.unique(y)), c=1, lamda1=lamda1,
                         lamda2=lamda2, mu=2, gamma=lea, ep1=ep1, ep2=ep2)
        # Maximise the matched counts by minimising the negated matrix.
        _, col = linear_sum_assignment(-confusion_matrix(y, y_pred_old))
        y_pred = np.copy(y_pred_old)
        # Masks come from the untouched prediction so relabelling cannot chain.
        for i, q in enumerate(col):
            y_pred[y_pred_old == q] = i
        # Compute each metric once and reuse it for the log line.
        run_ri = rand_score(y, y_pred)
        run_accu = accuracy_score(y, y_pred)
        ari.append(adjusted_rand_score(y, y_pred))
        ri.append(run_ri)
        accu.append(run_accu)
        print('\taccu={}, RI={}'.format(run_accu, run_ri))
        print(confusion_matrix(y, y_pred))
    print('time,  {}'.format(time.time() - t0))
    print('title\tmax\tmean\tstd')
    _print_score_summary('ARI, ', ari)
    _print_score_summary('RI, ', ri)
    _print_score_summary('Accu, ', accu)
    print('')


def _print_score_summary(title, values):
    """Print ``title`` followed by max, mean and std of ``values``."""
    scores = np.array(values)
    print('{} {} {} {}'.format(title, scores.max(), scores.mean(),
                               scores.std()))
def run_fkmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
                labels_true, dataset_name, kk, ll):
    """Run fuzzy c-means 10 times per configuration and log the scores.

    For the given ``dataset_name`` every configured ``k`` is combined with
    each of the four input matrices.  NMI, adjusted Rand and Davies-Bouldin
    scores of every run are printed and appended as a CSV row to
    ``<dataset_name>_fuzzy_cmeans_news_results.csv``.

    ``kk`` and ``ll`` are accepted for signature compatibility and are not
    used.  The matrices are expected to provide ``toarray()``.
    """
    params = {
        'newsgroup': {
            'k': [20],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    # Explicit name -> matrix table; replaces eval() on the configured names.
    matrices = {
        'X_train': X_train,
        'X_train_norm': X_train_norm,
        'X_train_tfidf': X_train_tfidf,
        'X_train_norm_tfidf': X_train_norm_tfidf,
    }

    # The context manager closes the CSV even if a run raises.
    with codecs.open(dataset_name + '_fuzzy_cmeans_news_results.csv',
                     'w', 'utf-8') as output_file:
        output_file.write('X,K,NMI,RAND,DAVIES\n')
        output_file.flush()

        for k in params[dataset_name]['k']:
            for data_str in params[dataset_name]['X']:
                data = matrices[data_str].toarray().astype(np.float64)

                for _ in range(10):
                    tick1 = time.time()
                    # cmeans is given the transposed matrix.
                    result = fuzz.cluster.cmeans(
                        data.T, k, 2, error=0.00000000001, maxiter=10000)
                    centroids, U = result[0], result[1]
                    tick2 = time.time()
                    print(u'Took {} secs to train the {} model...'.format(
                        (tick2 - tick1), 'fkmeans'))

                    # Harden the fuzzy memberships: strongest cluster wins.
                    labels_pred = np.argmax(U, axis=0)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    ari_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, centroids)
                    tick3 = time.time()
                    print(u'Took {} secs to calculate {} metrics...'.format(
                        (tick3 - tick2), 'fkmeans'))

                    output_file.write(u'{},{},{},{},{}\n'.format(
                        data_str, k, nmi_score, ari_score, davies_score))
                    output_file.flush()

                    print('Execution: X: {}, k: {}'.format(data_str, k))
                    print('NMI score: {}'.format(nmi_score))
                    print('Rand score: {}'.format(ari_score))
                    print('Davies score: {}'.format(davies_score))
                    print('-----------------------------------------------\n')
def test_non_consicutive_labels():
    """Regression test: gaps in the label ids must not change the scores."""
    # regression tests for labels with gaps
    hcv_cases = (
        ([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]),
        ([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]),
    )
    for labels_true, labels_pred in hcv_cases:
        homogeneity, completeness, v_measure = \
            homogeneity_completeness_v_measure(labels_true, labels_pred)
        assert_almost_equal(homogeneity, 0.67, 2)
        assert_almost_equal(completeness, 0.42, 2)
        assert_almost_equal(v_measure, 0.52, 2)

    # Both ARI values are computed before either is checked.
    reference = [0, 0, 0, 1, 1, 1]
    ari_scores = [
        adjusted_rand_score(reference, [0, 1, 0, 1, 2, 2]),
        adjusted_rand_score(reference, [0, 4, 0, 4, 2, 2]),
    ]
    for score in ari_scores:
        assert_almost_equal(score, 0.24, 2)
def assert_fit_predict_correct(model, X):
    """Assert that ``fit(X).predict(X)`` and ``fit_predict(X)`` agree.

    ``fit_predict`` runs on a deep copy of ``model`` so the two fits do not
    share state.  Agreement is checked up to label permutation with the
    adjusted Rand index.
    """
    twin = copy.deepcopy(model)
    via_fit_then_predict = model.fit(X).predict(X)
    via_fit_predict = twin.fit_predict(X)
    agreement = adjusted_rand_score(via_fit_then_predict, via_fit_predict)
    assert agreement == 1.0
def test_bayesian_mixture_predict_predict_proba():
    """Check predict() matches argmax of predict_proba() for every setup."""
    # this is the same test as test_gaussian_mixture_predict_predict_proba()
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    expected_labels = rand_data.Y
    not_fitted_msg = ("This BayesianGaussianMixture instance"
                      " is not fitted yet. Call 'fit' with "
                      "appropriate arguments before using "
                      "this method.")
    for prior_type in PRIOR_TYPE:
        for covar_type in COVARIANCE_TYPE:
            samples = rand_data.X[covar_type]
            model = BayesianGaussianMixture(
                n_components=rand_data.n_components,
                random_state=rng,
                weight_concentration_prior_type=prior_type,
                covariance_type=covar_type)

            # Predicting before fitting must raise NotFittedError.
            assert_raise_message(NotFittedError, not_fitted_msg,
                                 model.predict, samples)

            model.fit(samples)
            hard_labels = model.predict(samples)
            proba_labels = model.predict_proba(samples).argmax(axis=1)
            assert_array_equal(hard_labels, proba_labels)
            score = adjusted_rand_score(expected_labels, hard_labels)
            assert_greater_equal(score, .95)
def run_kmeans(X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf,
               labels_true, dataset_name, kk, ll):
    """Run k-means 10 times per configuration and log the scores.

    For the given ``dataset_name`` every configured ``k`` is combined with
    each of the four input matrices.  NMI, adjusted Rand and Davies-Bouldin
    scores of every run are printed and appended as a CSV row to
    ``<dataset_name>_kmeans_news_results.csv``.

    ``kk`` and ``ll`` are accepted for signature compatibility and are not
    used.  The matrices are expected to provide ``toarray()``.
    """
    params = {
        'newsgroup': {
            'k': [10, 15, 20, 25, 30],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'ig': {
            'k': [13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'igtoy': {
            'k': [3],
            'l': [2, 3, 4, 5, 6],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        },
        'nips': {
            'k': [9],
            'l': [5, 7, 9, 11, 13],
            'X': ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
        }
    }
    # Explicit name -> matrix table; replaces eval() on the configured names.
    matrices = {
        'X_train': X_train,
        'X_train_norm': X_train_norm,
        'X_train_tfidf': X_train_tfidf,
        'X_train_norm_tfidf': X_train_norm_tfidf,
    }

    # The context manager closes the CSV even if a run raises.
    with codecs.open(dataset_name + '_kmeans_news_results.csv',
                     'w', 'utf-8') as output_file:
        output_file.write('X,K,NMI,RAND,DAVIES\n')

        for k in params[dataset_name]['k']:
            for data_str in params[dataset_name]['X']:
                data = matrices[data_str].toarray().astype(np.float64)

                for _ in range(10):
                    tick1 = time.time()
                    estimator = KMeans(n_clusters=k, max_iter=10000)
                    estimator.fit(data)
                    tick2 = time.time()
                    print(u'Took {} secs to train the {} model...'.format(
                        (tick2 - tick1), 'kmeans'))

                    labels_pred = estimator.labels_
                    centroids = estimator.cluster_centers_

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    ari_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, centroids)
                    tick3 = time.time()
                    print(u'Took {} secs to calculate {} metrics...'.format(
                        (tick3 - tick2), 'kmeans'))

                    output_file.write(u'{},{},{},{},{}\n'.format(
                        data_str, k, nmi_score, ari_score, davies_score))

                    print('Execution: X: {}, k: {}'.format(data_str, k))
                    print('NMI score: {}'.format(nmi_score))
                    print('Rand score: {}'.format(ari_score))
                    print('Davies score: {}'.format(davies_score))
                    print('-----------------------------------------------\n')
def test_nn_classifier(self):
    """Nearest-neighbour labelling recovers the blobs from two seed labels."""
    graphs, truth = self._make_blob_graphs(k=4)
    # Keep only the first and last label; -1 marks the rest as unlabelled.
    seeds = truth.copy()
    seeds[1:-1] = -1
    for graph in graphs:
        predicted = graph.classify_nearest(seeds)
        score = adjusted_rand_score(truth, predicted)
        self.assertGreater(score, 0.95)
def test_harmonic_classifier(self):
    """Harmonic labelling (with CMN) recovers the blobs from two seeds."""
    graphs, truth = self._make_blob_graphs(k=4)
    # Keep only the first and last label; -1 marks the rest as unlabelled.
    seeds = truth.copy()
    seeds[1:-1] = -1
    for graph in graphs:
        predicted = graph.classify_harmonic(seeds, use_CMN=True)
        score = adjusted_rand_score(truth, predicted)
        self.assertGreater(score, 0.95)
def calculate_scores(self):
    """Compute the clustering quality metrics and store them on ``self``.

    Sets ``v_measure``, ``complete``, ``adjusted_mutual``, ``adjusted_rand``,
    ``silhouette``, ``purity`` and ``partial_purity``.
    NOTE(review): ``self.c`` is passed as the first argument of every
    pairwise metric and as the labelling for the silhouette -- confirm
    which of ``c`` / ``labels`` is the reference.
    """
    data, first, second = self.x, self.c, self.labels
    pairwise_metrics = (
        ('v_measure', v_measure_score),
        ('complete', completeness_score),
        ('adjusted_mutual', adjusted_mutual_info_score),
        ('adjusted_rand', adjusted_rand_score),
    )
    for attribute, metric in pairwise_metrics:
        setattr(self, attribute, metric(first, second))
    self.silhouette = silhouette_score(data, first)
    purity_pair = self.__purity__()
    self.purity, self.partial_purity = purity_pair
def test_lgc_classifier(self):
    """LGC labelling with an RBF kernel recovers the blobs from two seeds."""
    graphs, truth = self._make_blob_graphs(k=11)
    # Keep only the first and last label; -1 marks the rest as unlabelled.
    seeds = truth.copy()
    seeds[1:-1] = -1
    for graph in graphs:
        predicted = graph.classify_lgc(seeds, kernel='rbf', alpha=0.2,
                                       tol=1e-3, max_iter=30)
        score = adjusted_rand_score(truth, predicted)
        self.assertGreater(score, 0.95)
def __adjusted_rand_index(generated):
    """Return the ARI of ``generated`` against 4 clusters of 5 members.

    ``generated`` is a mapping whose values are the predicted cluster ids.
    They are compared, in the mapping's iteration order, with the expected
    assignment ``[0]*5 + [1]*5 + [2]*5 + [3]*5``.  Both sequences are
    pretty-printed before scoring.
    NOTE(review): the result depends on the mapping's iteration order
    lining up with the expected layout -- confirm with the caller.
    """
    # generate expected assignment array.
    expected = [cluster for cluster in range(4) for _ in range(5)]
    # values() exists on Python 2 and 3; itervalues() is Python 2 only.
    predicted = list(generated.values())
    pprint(predicted)
    pprint(expected)
    return adjusted_rand_score(expected, predicted)
def test_unsupervised_scores():
    """The 'ari' scorer must match a direct adjusted_rand_score call."""
    # test clustering where there is some true y.
    # We don't have any real unsupervised SCORERS yet
    features, targets = make_blobs(random_state=0, centers=2)
    split = train_test_split(features, targets, random_state=0)
    X_train, X_test, y_train, y_test = split
    clusterer = KMeans(n_clusters=3)
    clusterer.fit(X_train)
    from_scorer = SCORERS['ari'](clusterer, X_test, y_test)
    from_metric = adjusted_rand_score(y_test, clusterer.predict(X_test))
    assert_almost_equal(from_scorer, from_metric)
def test_unsupervised_scorers():
    """Test clustering scorers against gold standard labeling."""
    # We don't have any real unsupervised Scorers yet.
    features, targets = make_blobs(random_state=0, centers=2)
    split = train_test_split(features, targets, random_state=0)
    X_train, X_test, y_train, y_test = split
    clusterer = KMeans(n_clusters=3)
    clusterer.fit(X_train)
    from_scorer = SCORERS['adjusted_rand_score'](clusterer, X_test, y_test)
    from_metric = adjusted_rand_score(y_test, clusterer.predict(X_test))
    assert_almost_equal(from_scorer, from_metric)
def calc(center):
    """Score Tri and Autoclust on random blob datasets for one ``center``.

    333 rounds are run; each round sweeps sample counts 100..500 (step 100)
    and feature counts 2..4.  A fresh random seed is drawn per dataset and
    one result dict is appended to the file ``S<samples>F<features>C<center>``.
    """
    sample_counts = range(100, 501, 100)
    feature_counts = range(2, 5)
    for _ in range(333):
        for n_sample in sample_counts:
            for n_feature in feature_counts:
                seed = np.random.randint(0, 10000)
                points, truth = datasets.make_blobs(
                    n_samples=n_sample, n_features=n_feature,
                    cluster_std=0.5, centers=center, random_state=seed)
                truth = list(truth)

                tri_assigned = labelset_to_labels(Tri(points).labels, n_sample)
                tri_ari = adjusted_rand_score(truth, tri_assigned)

                auto_assigned = labelset_to_labels(Autoclust(points).labels,
                                                   n_sample)
                auto_ari = adjusted_rand_score(truth, auto_assigned)

                record = {
                    'labels': truth,
                    'tri_label': tri_assigned,
                    'tri_score': tri_ari,
                    'auto_labels': auto_assigned,
                    'auto_score': auto_ari,
                    'seed': seed,
                }
                target = 'S' + str(n_sample) + 'F' + str(n_feature) \
                    + 'C' + str(center)
                with open(target, 'a') as sink:
                    print(record, file=sink)
def evaluate(self, partition, clustered_ids):
    """Score ``partition`` against the known classes of ``clustered_ids``.

    Returns an empty dict when no class information is available, otherwise
    a dict with the keys ``external-nmi``, ``external-ami`` and
    ``external-ari``.
    """
    # no class info?
    if not self.has_class_info():
        return {}
    # Build the reference labelling in the same order as clustered_ids.
    reference = np.zeros(len(clustered_ids))
    for position, item_id in enumerate(clustered_ids):
        reference[position] = self.class_map[item_id]
    scores = {}
    scores["external-nmi"] = normalized_mutual_info_score(reference, partition)
    scores["external-ami"] = adjusted_mutual_info_score(reference, partition)
    scores["external-ari"] = adjusted_rand_score(reference, partition)
    return scores
def Rand_index_cal(infile, infile2, prefix):
    """Calculate the adjusted Rand index between two clustering outputs.

    Both input files are parsed with ``prepare_rand_list`` and the resulting
    label lists are compared.  Returns one tab-separated, newline-terminated
    line: ``<prefix>\\tadjusted_rand_score =\\t<score>``.

    Requires ``adjusted_rand_score`` from ``sklearn.metrics.cluster``.
    """
    first_labels = prepare_rand_list(infile)
    second_labels = prepare_rand_list(infile2)
    score = adjusted_rand_score(first_labels, second_labels)
    line_template = "%s\tadjusted_rand_score =\t%f\n"
    return line_template % (prefix, score)
def sklearn_measures(U, V):
    """Print and return ARI, NMI, AMI and V-measure between two partitions.

    The label of each sample is taken as the column index of the nonzero
    entries of ``U`` / ``V`` (assumes 2-D membership matrices with one
    nonzero per row -- TODO confirm with the caller).

    Returns ``[['ari', 'nmi', 'ami', 'vm'], [<the four scores>]]``.
    """
    # http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
    import sklearn.metrics.cluster as sym
    U_labels = np.nonzero(U)[1]
    V_labels = np.nonzero(V)[1]
    # Single-argument print() works on both Python 2 and Python 3.
    print('{} {}'.format(U_labels, V_labels))
    # NOTE(review): the third value is mutual_info_score although it is
    # labelled 'entro(U,V)='; the label is kept as emitted today.
    print('entro(U)= {} entro(V)= {} entro(U,V)= {}'.format(
        sym.entropy(U_labels),
        sym.entropy(V_labels),
        sym.mutual_info_score(U_labels, V_labels)))
    res = [
        ['ari', 'nmi', 'ami', 'vm'],
        [
            sym.adjusted_rand_score(U_labels, V_labels),
            sym.normalized_mutual_info_score(U_labels, V_labels),
            sym.adjusted_mutual_info_score(U_labels, V_labels),
            sym.v_measure_score(U_labels, V_labels),
        ],
    ]
    print(res)
    return res
def evaluate(input_matrix, eigen_order):
    """Return ``(ari, sign_accuracy)`` for a spectral clustering of the input.

    The reference labelling is ``group_number`` consecutive groups of
    ``group_size`` members each.  Sign accuracy is the fraction of entries
    where the partition-based sign matrix equals ``true_Q``.
    """
    prediction = predict_cluster_labels(input_matrix, k, eigen_order)
    predicted_labels = prediction[1]

    reference_labels = []
    for group in range(group_number):
        reference_labels.extend(repeat(group, group_size))

    ari = adjusted_rand_score(reference_labels, predicted_labels)

    # partition-based sign prediction
    predicted_signs = predict_signs_via_partition(predicted_labels)
    matches = np.count_nonzero(true_Q == predicted_signs)
    sign_accuracy = matches / (N * N)
    return ari, sign_accuracy
def main():
    """Cluster the tic-tac-toe data with fuzzy k-medoids and export results.

    The algorithm is run 100 times on the dissimilarity matrix of the nine
    board attributes.  Runs are ranked by objective value J (lowest first);
    the best run's fuzzy partition, its hardened partition and the adjusted
    Rand score are written to CSV files.
    """
    ttt_data = loadData('tic-tac-toe.data')
    # Only the nine board cells are used; any further column is dropped.
    training_set = [row[:9] for row in ttt_data]
    dmatrix = Dissimilarity(training_set).calculate_dmatrix()

    results = []
    sfcmdd = SFCMdd(training_set, dmatrix)
    for _ in range(100):
        U, G, J = sfcmdd.compute(K=2, T=150, emax=(10.e-10), m=2, q=2)
        # Rows before index 626 count as correct unless the first membership
        # is strictly smaller; the remaining rows use the mirrored rule.
        success = sum(1 for y, n in U[:626] if not y < n)
        success += sum(1 for y, n in U[626:] if not n < y)
        rate = success / 958.0
        print("Classification Rate: " + str(rate))
        results.append([J, U, rate])

    results.sort(key=lambda tup: tup[0])
    # Report the ten best runs.  Each entry is [J, U, rate]; values are
    # converted with str() because they are not strings.
    for entry in results[:10]:
        print("J: " + str(entry[0]))
        print("Rate: " + str(entry[2]))

    # Entry layout is [J, U, rate]: the fuzzy partition is element 1.
    fuzzy_partition = results[0][1]
    hard_partition = [e for e in hard_partition_generator(fuzzy_partition)]
    # NOTE(review): this scores the two columns of the hard partition
    # against each other, not against reference labels -- confirm intent.
    ars = adjusted_rand_score(
        [e1 for e1, e2 in hard_partition],
        [e2 for e1, e2 in hard_partition]
    )
    write_csv_partition(fuzzy_partition, "fuzzy_k_medoids_result.csv")
    write_csv_partition(hard_partition, "hard_partition.csv")
    write_csv_partition([ars], "adjusted_rand_score.csv")
def test_gaussian_mixture_fit_predict():
    """fit_predict(X) must equal fit(X).predict(X) for every covariance type."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    expected_labels = rand_data.Y
    for covar_type in COVARIANCE_TYPE:
        samples = rand_data.X[covar_type]
        mixture = GaussianMixture(
            n_components=rand_data.n_components,
            random_state=rng,
            weights_init=rand_data.weights,
            means_init=rand_data.means,
            precisions_init=rand_data.precisions[covar_type],
            covariance_type=covar_type)

        # check if fit_predict(X) is equivalent to fit(X).predict(X)
        twin = copy.deepcopy(mixture)
        labels_fit_then_predict = twin.fit(samples).predict(samples)
        labels_fit_predict = mixture.fit_predict(samples)
        assert_array_equal(labels_fit_then_predict, labels_fit_predict)
        score = adjusted_rand_score(expected_labels, labels_fit_predict)
        assert_greater(score, .95)
def evaluation(X_selected, n_clusters, y):
    """
    This function calculates ARI, NMI and ACC of clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features}
        input data on the selected features
    n_clusters: {int}
        number of clusters
    y: {numpy array}, shape (n_samples,)
        true labels

    Output
    ------
    ari: {float}
        Adjusted Rand Index
    nmi: {float}
        Normalized Mutual Information
    acc: {float}
        Accuracy
    """
    # `precompute_distances` and `n_jobs` are no longer passed: newer
    # scikit-learn releases removed them from KMeans and they only affected
    # speed/memory, not the resulting labels.
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                     max_iter=300, tol=0.0001, verbose=0, random_state=None,
                     copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.labels_

    # calculate ARI
    ari = adjusted_rand_score(y, y_predict)

    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict)

    # calculate ACC: map cluster ids onto the true labels before comparing
    y_permuted_predict = best_map(y, y_predict)
    acc = accuracy_score(y, y_permuted_predict)

    return ari, nmi, acc
def process_evaluation(args, model):
    """Print NMI, ARI and the confusion matrix for the fitted row labels.

    Does nothing unless ``args['true_row_labels']`` names a file.  The file
    is split on whitespace into the reference labels, which are compared
    with ``model.row_labels_``.  Any failure is logged, not raised, so the
    optional evaluation cannot abort the caller.
    NOTE(review): labels read from the file are strings -- confirm that
    ``confusion_matrix`` accepts them together with ``model.row_labels_``.
    """
    if not args['true_row_labels']:
        return
    try:
        with open(args['true_row_labels'], 'r') as f:
            labels = f.read().split()

        from sklearn.metrics.cluster import normalized_mutual_info_score
        from sklearn.metrics.cluster import adjusted_rand_score
        from sklearn.metrics import confusion_matrix

        n = normalized_mutual_info_score(labels, model.row_labels_)
        ari = adjusted_rand_score(labels, model.row_labels_)
        cm = confusion_matrix(labels, model.row_labels_)

        print("nmi ==>" + str(n))
        print("adjusted rand index ==>" + str(ari))
        print()
        print(cm)
    except Exception as e:
        # The message used to be split with a backslash inside the literal,
        # which embedded the source indentation in the log line.
        logging.error(
            "--true_row_labels option (evaluation) exception: %s", e)
def test_gaussian_mixture_predict_predict_proba():
    """predict() must match argmax of predict_proba() per covariance type."""
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng)
    expected_labels = rand_data.Y
    not_fitted_msg = ("This GaussianMixture instance is not fitted "
                      "yet. Call 'fit' with appropriate arguments "
                      "before using this method.")
    for covar_type in COVARIANCE_TYPE:
        samples = rand_data.X[covar_type]
        mixture = GaussianMixture(
            n_components=rand_data.n_components,
            random_state=rng,
            weights_init=rand_data.weights,
            means_init=rand_data.means,
            precisions_init=rand_data.precisions[covar_type],
            covariance_type=covar_type)

        # Predicting before fitting must raise NotFittedError.
        assert_raise_message(NotFittedError, not_fitted_msg,
                             mixture.predict, samples)

        mixture.fit(samples)
        hard_labels = mixture.predict(samples)
        proba_labels = mixture.predict_proba(samples).argmax(axis=1)
        assert_array_equal(hard_labels, proba_labels)
        score = adjusted_rand_score(expected_labels, hard_labels)
        assert_greater(score, .95)
def main(_): ed.set_seed(42) # DATA X_data, Z_true = karate("~/data") N = X_data.shape[0] # number of vertices K = 2 # number of clusters # MODEL gamma = Dirichlet(concentration=tf.ones([K])) Pi = Beta(concentration0=tf.ones([K, K]), concentration1=tf.ones([K, K])) Z = Multinomial(total_count=1.0, probs=gamma, sample_shape=N) X = Bernoulli(probs=tf.matmul(Z, tf.matmul(Pi, tf.transpose(Z)))) # INFERENCE (EM algorithm) qgamma = PointMass(tf.nn.softmax(tf.get_variable("qgamma/params", [K]))) qPi = PointMass(tf.nn.sigmoid(tf.get_variable("qPi/params", [K, K]))) qZ = PointMass(tf.nn.softmax(tf.get_variable("qZ/params", [N, K]))) inference = ed.MAP({gamma: qgamma, Pi: qPi, Z: qZ}, data={X: X_data}) inference.initialize(n_iter=250) tf.global_variables_initializer().run() for _ in range(inference.n_iter): info_dict = inference.update() inference.print_progress(info_dict) # CRITICISM Z_pred = qZ.mean().eval().argmax(axis=1) print("Result (label flip can happen):") print("Predicted") print(Z_pred) print("True") print(Z_true) print("Adjusted Rand Index =", adjusted_rand_score(Z_pred, Z_true))
# Read one class name per line; the context manager closes the file.
with open("./classes.csv") as classesFile:
    aClasses = [(l.strip()) for l in classesFile.readlines()]
maxA = -10
# Defined up front so the report below still works when `lk` is empty.
maxComb = None
bestAClassesInt = None
aClassesInt = list()
# Position of each class name inside a codification tuple.
classPosition = {'Agents': 0, 'IR': 1, 'DB': 2, 'AI': 3, 'HCI': 4, 'ML': 5}
# check every classes's codification quality
for a in lk:
    # A fresh list per codification: the best one kept below must not be
    # overwritten when the next candidate is evaluated.
    aClassesInt = []
    for c in aClasses:
        if c in classPosition:
            aClassesInt.append(a[classPosition[c]])
        else:
            print("Wrong argument data in classes.csv file")
    tmpA = adjusted_rand_score(finalClasses, aClassesInt)
    # store the best classes's codification quality
    if tmpA > maxA:
        maxA = tmpA
        maxComb = a
        bestAClassesInt = aClassesInt
# print results
print("Best classes's codification -> Cluster quality")
print(str(maxComb) + " -> " + str(maxA))
print()
set([i for (i, j) in enumerate(louvain) if j == l])) print("Louvain Modularity:", nx.algorithms.community.modularity(graph, communities)) print() # AMI LPA_AMI = sk.adjusted_mutual_info_score(cluster, lpa) Louvain_AMI = sk.adjusted_mutual_info_score(cluster, louvain) print("LPA AMI:", LPA_AMI) print("Louvain AMI:", Louvain_AMI) print() # RI LPA_RI = sk.adjusted_rand_score(cluster, lpa) Louvain_RI = sk.adjusted_rand_score(cluster, louvain) print("LPA RI:", LPA_RI) print("Louvain RI:", Louvain_RI) print() # NMI LPA_NMI = sk.normalized_mutual_info_score(cluster, lpa) Louvain_NMI = sk.normalized_mutual_info_score(cluster, louvain) print("LPA NMI:", LPA_NMI) print("Louvain NMI:", Louvain_NMI) # divisive approach for i in range(len(graphs_div)): graph_file = graphs_div[i]
permuted_list = list() for perm in range(perm_num): for i in range(AllDataMatrix_temp.shape[1]): np.random.shuffle(AllDataMatrix_temp[:,i]) permuted_list.append(AllDataMatrix_temp) try: if "group" in SIMULATION_TYPES: results_group = Parallel(n_jobs = machine_cores_to_use)(delayed(get_gap_one_s_group)(i,AllDataMatrix = AllDataMatrix,permuted_list = permuted_list) for i in zip(s_list,[nclust]*len(s_list),[multi]*len(s_list))) best_sparse_kmeans_group,lgroup,wgroup = sparse_kmeans(AllDataMatrix = AllDataMatrix, nclust = nclust, s = s_list[np.argmax(results_group)], niter=niter,group = True, multi = multi) print(s_list[np.argmax(results_group)]) print(adjusted_rand_score(labels,lgroup)) sparse_group_res[sim,:] = adjusted_rand_score(labels,lgroup) path = path_to_save_files +"GROUP_n=" +str(n)+ "sigma=" +str(sigma) + "signal=" + SIGNAL_TYPE +".txt" np.savetxt(path, sparse_group_res) if "sparse" in SIMULATION_TYPES: results = R_sparse_kmeans(data = numpy2ri(AllDataMatrix),nclust = nclust,nperms = perm_num, s = -1) l = np.array(results[0]) print adjusted_rand_score(labels,l) sparse_res[sim,:] = adjusted_rand_score(labels,l) path = path_to_save_files +"sparse_n=" +str(n)+ "sigma=" +str(sigma) + "signal=" + SIGNAL_TYPE +".txt" np.savetxt(path, sparse_res)
#plot sorted list #checking the clustering using ARI TrueClusters = [] for i in range (seqOBJArr.__len__()): TrueClusters.append(seqOBJArr[i].trueCluster) AssignedClusters = [] for i in range (seqOBJArr.__len__()): AssignedClusters.append(seqOBJArr[i].currentCluster) print("True Clusters: ", TrueClusters) print("Assgined Clusters: ", AssignedClusters) #check ARI print(adjusted_rand_score(TrueClusters, AssignedClusters)) ''' alphaArr = [0.01, 0.1,0.2,0.5,0.75,1.0,2.0] ariArr = [0.56206, 0.96054, 0.96053, 0.82823, 0.33733, 0.211045, 0.07455] plt.plot(alphaArr, ariArr) plt.show() ''' #xhecking with leelu m actual ans = [] q = open("/Users/mallika/PycharmProjects/DirichletBio/venv/lib/actual.txt", "r") for line in q: values=line.split() if (values[0]=="2"): ans.append(2)
# Parse the data file: every line is ", "-separated, all fields but the last
# are numeric features and the last field is the class label.
X = []
y = []
for line in file.readlines():
    curLine = line.strip().split(", ")
    X.append([float(i) for i in curLine[0:-1]])
    # A trailing '.' on the label is dropped.
    y.append(curLine[-1].strip('.'))
# iterate over classifiers-------------------------------------------
# Score KMeans against the true labels for n_clusters = 1..18.
glass_score = []
params = range(1, 19, 1)
for param in params:
    algorithm = KMeans(n_clusters=param)
    algorithm.fit(X)
    y_pred = algorithm.predict(X)
    s = adjusted_rand_score(y, y_pred)
    glass_score.append(s)
print('glass_score', glass_score)
# draw score pic---------------------------------------
# Adjusted Rand score as a function of n_clusters, saved and then shown.
plt.figure(figsize=(6, 4), dpi=120)
plt.grid()
plt.xlabel('n_clusters for KMeans')
plt.xticks(params)
plt.plot(params, glass_score, label='glass_score', color='g')
plt.legend()
plt.title("glass KMeans score")
plt.savefig("img/KMeans.png")
plt.show()
def main():
    """Cluster patients on embedded vectors while pruning low-entropy rows.

    Command line: ``<vector_type> <label_type> <similarity_threshold>``.
    The attribute-by-patient matrix is multiplied by the similarity matrix.
    For every percentage in 0%, 5%, ..., 95% the lowest-entropy attributes
    are removed, the patients are clustered with average-linkage cosine
    agglomerative clustering, and the adjusted Rand index against the
    chosen labels is printed together with the percentage.
    """
    # Sorting out arguments.
    if len(sys.argv) != 4:
        print('Usage: %s symptoms/herbs/both top/section/subsection'
              ' similarity_threshold' % sys.argv[0])
        # sys.exit() does not depend on the `site` module's exit() helper.
        sys.exit()
    vector_type = sys.argv[1]
    assert vector_type in ['symptoms', 'herbs', 'both']
    label_type = sys.argv[2]
    assert label_type in ['top', 'section', 'subsection']
    similarity_threshold = float(sys.argv[3])

    feature_list, master_patient_dct = get_master_patient_dct(vector_type,
                                                              False)

    # Get the patient by attribute matrix.
    (attribute_by_patient_matrix, section_labels, subsection_labels,
     file_num_labels) = get_attribute_by_patient_matrix(feature_list,
                                                        master_patient_dct)

    # Picking the type of labels.
    labels_by_type = {
        'top': file_num_labels,
        'section': section_labels,
        'subsection': subsection_labels,
    }
    true_labels = get_label_to_index_conversions(labels_by_type[label_type])
    num_clusters = len(set(true_labels))

    similarity_matrix = get_similarity_matrix(feature_list,
                                              similarity_threshold)
    embedded_matrix = similarity_matrix * attribute_by_patient_matrix

    # Get the list of entropies for the embedded matrix.
    entropy_list = np.apply_along_axis(entropy, axis=1, arr=embedded_matrix)

    # Loop invariants: the entropy ranking and a plain-array view of the
    # embedding.  np.delete already returns a new array, so no per-iteration
    # copy is needed.
    entropy_order = entropy_list.argsort()
    embedded_array = np.asarray(embedded_matrix)
    num_features = len(feature_list)

    # Delete the percentage% lowest entropy elements.
    for percentage in [p / 20.0 for p in range(20)]:
        num_att_to_delete = int(num_features * percentage)
        # Deleting lowest entropy attributes.
        att_indices_to_delete = entropy_order[:num_att_to_delete]
        # Delete the lowest entropy attributes, and transpose.
        feature_vectors = np.delete(embedded_array, att_indices_to_delete,
                                    axis=0).T

        y_pred = AgglomerativeClustering(
            n_clusters=num_clusters, affinity='cosine',
            linkage='average').fit_predict(feature_vectors)

        rand_index = adjusted_rand_score(true_labels, y_pred)
        # Single-argument print() works on both Python 2 and Python 3.
        print('{} {}'.format(rand_index, percentage))
#print("Actual"); #print(np.asarray(phi)); #print(membership_act); #print("predicted") #print(pi_pred); #print(qgamma.mean().eval()); X_pred = np.array(X.mean().eval() > 0.5, dtype=int); cnt = N*N; correct = np.sum(X_data == X_pred); plt.subplot(211); plt.imshow(X_data, cmap='Greys'); plt.subplot(212) plt.imshow(X_pred, cmap='Greys'); plt.show(); print("Correctly predicted: ", correct); print("Total entries: ", cnt); print("Train Accuracy: ", correct/cnt); print("Result (label flip can happen):") print("Predicted") print(Z_pred) print("True") print(Z_true) print("Adjusted Rand Index =", adjusted_rand_score(Z_pred, Z_true))
def ecac_run(X, n_clusters, data, pop_size=20, max_gens=2000,
             p_crossover=0.95, p_mutation=0.98, runs=10, y=None,
             log_file=False, evolutionary_plot=False):
    """Run the ECAC genetic clustering ``runs`` times and export the results.

    Parameters
    ----------
    X : array-like
        Data to cluster; it is standardised before every run.
    n_clusters : int
        Number of clusters passed to the genetic operators.
    data : str
        Dataset name, used in messages and output paths.
    pop_size, max_gens, p_crossover, p_mutation :
        Genetic algorithm settings.
    runs : int
        Number of independent runs.
    y : array-like or None
        Reference labels; when given, the adjusted Rand index is reported.
    log_file : bool
        Also write the best fitness per generation to a CSV log.
    evolutionary_plot : bool
        Save a scatter plot of the best partition after every generation.

    One ``solution-*.csv`` (and optionally ``log-*.csv``) is written per run
    under ``ecac-out/<data>_<n_clusters>_<pop_size>_<max_gens>/``; afterwards
    all solution files found there are merged into one CSV in ``ecac-out/``.
    """
    tifont = {
        'fontname': 'Times New Roman',
        'fontsize': 20,
        'fontweight': 'bold'
    }
    axfont = {'fontname': 'Times New Roman', 'fontsize': 16}
    # Shared pieces of every output path.
    tag = '{}_{}_{}_{}'.format(data, n_clusters, pop_size, max_gens)
    out_dir = 'ecac-out/{}'.format(tag)

    for run in range(runs):
        print('============= TEST {} ============='.format(run + 1))
        print('Clustering started using ECAC')
        print('Dataset: {}, Clusters: {}, Instances: {}, Features: {}'.format(
            data, n_clusters, len(X), len(X[0])))
        print('Population size: {}, Generations: {}'.format(
            pop_size, max_gens))
        start = time.time()
        population = []
        fit_log = []
        X = StandardScaler().fit_transform(X)

        print('Generating initial population')
        for _ in range(pop_size):
            individual = {'partition': random_gen(n_clusters, X)}
            individual['fitness'] = fitness_value(X, individual['partition'],
                                                  n_clusters)
            # Duplicates are dropped, so the population may be smaller than
            # pop_size.
            if individual not in population:
                population.append(individual)
        best = sorted(population, key=lambda k: k['fitness'],
                      reverse=True)[0]

        print('Starting genetic process...')
        for i in range(max_gens):
            print('Generation {}'.format(i + 1))
            selected = [binary_tournament(population)
                        for _ in range(pop_size)]
            children = reproduce(selected, pop_size, p_crossover, p_mutation,
                                 n_clusters)
            for child in children:
                child['fitness'] = fitness_value(X, child['partition'],
                                                 n_clusters)
            children.sort(key=lambda l: l['fitness'], reverse=True)
            if children[0]['fitness'] >= best['fitness']:
                best = children[0]
            population = children
            if log_file:
                fit_log.append((i + 1, best['fitness']))
            if evolutionary_plot:
                plt.figure(figsize=(12, 8), dpi=200)
                plt.title('ECAC - Generation {}'.format(i + 1), **tifont)
                plt.xlabel('Column 1', **axfont)
                plt.ylabel('Column 2', **axfont)
                colors = best['partition']
                plt.scatter(X[:, 0], X[:, 1], c=colors, edgecolor='k',
                            cmap='YlGnBu')
                plt.tight_layout()
                figure_dir = 'figures/{}/{}'.format(data, run + 1)
                if not os.path.exists(figure_dir):
                    os.makedirs(figure_dir)
                plt.savefig('{}/scatter_{}.jpg'.format(figure_dir, i + 1),
                            format='jpg')
                # Release the figure: one is created per generation and
                # they would otherwise all stay open.
                plt.close()
            # A fitness of exactly 1 stops the search early.
            if best['fitness'] == 1:
                break

        run_time = time.time() - start
        best['time'] = run_time
        print('Optimization finished in {:.2f}s with an objective of {:.4f}'.
              format(best['time'], best['fitness']))
        best['partition'] = np.array(best['partition'])

        d = dict()
        d['Dataset'] = data
        d['Algorithm'] = 'ecac'
        d['Clusters'] = n_clusters
        d['Instances'] = len(X)
        d['Features'] = len(X[0])
        d['Pop. size'] = pop_size
        d['Max. gens'] = max_gens
        d['No. objectives'] = 1
        d['Obj. 1 name'] = 'generalization'
        d['Objective 1'] = best['fitness']
        d['Obj. 2 name'] = np.nan
        d['Objective 2'] = np.nan
        d['Time'] = best['time']
        if y is None:
            d['Adjusted Rand Index'] = np.nan
            print('No labels provided')
        else:
            adj_rand_index = adjusted_rand_score(y, best['partition'])
            d['Adjusted Rand Index'] = adj_rand_index
            # The score was missing from format(), which raised IndexError.
            print('Adjusted RAND index: {:.4f}'.format(adj_rand_index))
        # One column per instance holding its cluster id as text.
        for i in range(len(best['partition'])):
            d['X{}'.format(i + 1)] = '{}'.format(best['partition'][i])

        out = pd.DataFrame(d, index=[data])
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out.to_csv('{}/solution-{}-{}.csv'.format(out_dir, tag, run + 1),
                   index=False)
        if log_file:
            log = pd.DataFrame(fit_log, columns=['gen', 'fitness'])
            log.to_csv('{}/log-{}-{}.csv'.format(out_dir, tag, run + 1),
                       index=False)

    # Merge every per-run solution file into one table.  pd.concat replaces
    # DataFrame.append, which pandas 2 removed.
    filenames = glob.glob("{}/solution*".format(out_dir))
    frames = [pd.read_csv(name) for name in filenames]
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.reset_index(drop=True, inplace=True)
    df.to_csv('ecac-out/solutions-{}-{}.csv'.format(tag, runs))
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1, **kwargs):
    """
    Evaluate clustering reproducibility on one cross-validation fold.

    For every k from 2 to `max_k`, a clustering is derived from the training
    data and used to predict cluster labels for the test data; that prediction
    is then scored against a clustering computed directly on the test data
    (and, optionally, against a ground truth labelling).

    Parameters:
    -----------
    samples : list of arrays
        Samples to cluster. Each array has shape (n_samples, n_features) in
        PyMVPA terminology; the features (nodes) are what gets clustered,
        which is why the data is transposed before being handed to the
        clustering routines.
    train : list or array
        Indices into `samples` forming the training set.
    test : list or array
        Indices into `samples` forming the test set.
    method : {'complete', 'gmm', 'kmeans', 'ward'}
        Clustering method. Default is 'ward'.
    max_k : int or None
        Largest k to evaluate (evaluation starts at k=2). A falsy value
        selects `samples[0].shape[1]`, i.e., the number of points.
    stack : bool
        If True the selected datasets are stacked vertically, otherwise
        (default) they are averaged.
    stability : bool
        Whether to compute the stability measure described in
        Lange et al., 2004. Default is True.
    cv_likelihood : bool
        Whether to compute the cross-validated likelihood of the mixture
        model; only allowed with method 'gmm'. Default is False.
    corr_score : {'pearson', 'spearman'} or None
        Type of correlation score to compute, if any. Default is None.
    ground_truth : array or None
        Ground truth clustering of the data; when given, the predicted
        labels are additionally scored against it.
    n_neighbors : int
        Number of neighbors of the K-nearest-neighbors classifier that
        transfers the training clustering to the test set. Only used by
        methods 'complete' and 'ward'. Default is 1.
    kwargs : optional
        Extra keyword arguments forwarded to the clustering routine
        (only for 'ward' and 'gmm').

    Returns:
    --------
    results : list
        Ten entries, in this order (entries that were not requested are
        None):

        ks : array
            (max_k-1,) integer array; ks[i] is the k used in iteration i.
        ari : array
            (max_k-1,) array; ari[i] is the Adjusted Rand Index between the
            predicted and the actual test clustering for k = ks[i].
        ami : array
            (max_k-1,) array; ami[i] is the Adjusted Mutual Information
            between the predicted and the actual test clustering for
            k = ks[i].
        stab : array or None
            (max_k-1,) array with the (un-normalized) stability measure of
            Lange et al., 2004; None unless `stability` is True.
        likelihood : array or None
            (max_k-1,) array with the cross-validated likelihood of the GMM
            solution; None unless `cv_likelihood` is True.
        ari_gt : array or None
            Adjusted Rand Index of the prediction against `ground_truth`;
            None if no ground truth was given.
        ami_gt : array or None
            Adjusted Mutual Information of the prediction against
            `ground_truth`; None if no ground truth was given.
        stab_gt : array or None
            Stability measure of the prediction against `ground_truth`;
            None unless `stability` is True and a ground truth was given.
        corr : array or None
            Correlation score of the prediction against the test
            clustering; None if `corr_score` is None.
        corr_gt : array or None
            Correlation score of the prediction against `ground_truth`;
            None unless both `corr_score` and `ground_truth` were given.

    Raises:
    -------
    ValueError
        If `method` is not one of the available methods, or if
        `cv_likelihood` is requested for a method other than 'gmm'.
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))
    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # A missing max_k means "go up to the maximum possible k".
    if not max_k:
        max_k = samples[0].shape[1]

    n_ks = max_k - 1
    has_gt = ground_truth is not None
    has_corr = corr_score is not None

    # Result buffers; optional ones stay None when they were not requested,
    # which is exactly what ends up in the returned list.
    ks = np.zeros(n_ks, dtype=int)
    ari = np.zeros(n_ks)
    ami = np.zeros(n_ks)
    stab = np.zeros(n_ks) if stability else None
    likelihood = np.zeros(n_ks) if cv_likelihood else None
    corr = np.zeros(n_ks) if has_corr else None
    ari_gt = np.zeros(n_ks) if has_gt else None
    ami_gt = np.zeros(n_ks) if has_gt else None
    stab_gt = np.zeros(n_ks) if (has_gt and stability) else None
    corr_gt = np.zeros(n_ks) if (has_gt and has_corr) else None

    # Assemble the training and test datasets.
    train_set = [samples[idx] for idx in train]
    test_set = [samples[idx] for idx in test]
    if stack:
        combine = np.vstack
    else:
        def combine(arrays):
            return np.mean(np.dstack(arrays), axis=2)
    train_ds = combine(train_set)
    test_ds = combine(test_set)

    # Features are the items being clustered, hence the transposed views.
    train_feats = train_ds.T
    test_feats = test_ds.T

    # Hierarchical methods: build the full tree once and cut it for each k
    # afterwards, which is cheaper than re-clustering per k.
    if method == 'complete':
        train_ds_dist = pdist(train_feats, metric='correlation')
        test_ds_dist = pdist(test_feats, metric='correlation')
        Y_train = complete(train_ds_dist)
        Y_test = complete(test_ds_dist)
    elif method == 'ward':
        children_train, _, n_leaves_train, _ = ward_tree(train_feats,
                                                         **kwargs)
        children_test, _, n_leaves_test, _ = ward_tree(test_feats, **kwargs)
    elif method in ('gmm', 'kmeans'):
        # These have to be fitted separately for every k (done in the loop).
        pass
    else:
        raise ValueError("We shouldn't get here")

    for k in range(2, max_k + 1):
        slot = k - 2

        if method in ('complete', 'ward'):
            # Cut both trees at the requested k ...
            if method == 'complete':
                train_label = cut_tree_scipy(Y_train, k)
                test_label = cut_tree_scipy(Y_test, k)
            else:
                train_label = _hc_cut(k, children_train, n_leaves_train)
                test_label = _hc_cut(k, children_test, n_leaves_test)
            # ... and carry the training clustering over to the test set
            # with a nearest-neighbors classifier.
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_feats, train_label)
            prediction_label = knn.predict(test_feats)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # Fit on train, predict test.
            gmm.fit(train_feats)
            prediction_label = gmm.predict(test_feats)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_feats))
            # Refit on test to obtain the reference labels.
            gmm.fit(test_feats)
            test_label = gmm.predict(test_feats)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # Fit on train, predict test.
            kmeans.fit(train_feats)
            prediction_label = kmeans.predict(test_feats)
            # Refit on test to obtain the reference labels.
            kmeans.fit(test_feats)
            test_label = kmeans.predict(test_feats)
        else:
            raise ValueError("We shouldn't get here")

        # Scores of the prediction against the test clustering.
        ks[slot] = k
        ari[slot] = adjusted_rand_score(prediction_label, test_label)
        ami[slot] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[slot] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[slot] = log_prob
        if has_corr:
            corr[slot] = correlation_score(prediction_label, test_label,
                                           test_ds, corr_score)

        # Scores of the prediction against the ground truth, if available.
        if has_gt:
            ari_gt[slot] = adjusted_rand_score(prediction_label,
                                               ground_truth)
            ami_gt[slot] = adjusted_mutual_info_score(prediction_label,
                                                      ground_truth)
            if stability:
                stab_gt[slot] = stability_score(prediction_label,
                                                ground_truth, k)
            if has_corr:
                corr_gt[slot] = correlation_score(prediction_label,
                                                  ground_truth,
                                                  test_ds, corr_score)

    return [ks, ari, ami, stab, likelihood,
            ari_gt, ami_gt, stab_gt, corr, corr_gt]
def test_adjusted_rand_score_sparse():
    """Check that ARI from a sparse contingency matrix matches ARI from labels."""
    labels_a = np.array([1] * 6 + [2] * 6 + [3] * 5)
    labels_b = np.array([1, 1, 1, 1, 2, 1,
                         2, 2, 2, 2, 3, 1,
                         3, 3, 3, 2, 2])

    # Score computed directly from the two label vectors.
    score_from_labels = adjusted_rand_score(labels_a, labels_b)

    # Score computed from a precomputed sparse contingency matrix only.
    C_sparse = contingency_matrix(labels_a, labels_b, sparse=True)
    score_from_contingency = adjusted_rand_score(None, None,
                                                 contingency=C_sparse)

    assert_almost_equal(score_from_labels, score_from_contingency)