def affinity_propagation(crime_rows, column_names, num_samples=5000,
                         **estimator_kwargs):
    """Cluster crime rows with affinity propagation fitted on a sample.

    The first two fields of every row are used as the clustering features;
    the remaining fields are carried along untouched.  The estimator is
    fitted on whatever ``random_sampling`` returns for those features and is
    then used to predict a label for every row.

    Parameters
    ----------
    crime_rows : sequence of sequences
        One record per crime.  ``row[0:2]`` is clustered (presumably x/y
        coordinates -- confirm against the caller), ``row[2:]`` is passed
        through as extra information.
    column_names
        Forwarded unchanged to ``_format_clustering``.
    num_samples : int, optional, default: 5000
        Forwarded to ``random_sampling`` as ``num_samples``.
    **estimator_kwargs
        Forwarded verbatim to ``AffinityPropagation`` (for scikit-learn's
        estimator these include ``damping``, ``max_iter``,
        ``convergence_iter``, ``preference`` and ``affinity``).  When none
        are given the estimator is built with its defaults, exactly as
        before this function was parameterized.

    Returns
    -------
    The value produced by ``_format_clustering`` for the predicted labels,
    the features, the extra information and ``column_names``.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]

    print("Running Affinity Propagation")
    affinity_prop = AffinityPropagation(**estimator_kwargs)
    # Fit on a sample only and label the full data set with predict();
    # the direct fit_predict() on every row was disabled in the original.
    affinity_prop.fit(random_sampling(crime_xy, num_samples=num_samples))
    affinity_propagation_labels = affinity_prop.predict(crime_xy)

    print("formatting....")
    return _format_clustering(affinity_propagation_labels, crime_xy,
                              crime_info, column_names)
def loadKmeansData(dataArrayTest, dataArrayTrain, k, m='load'):
    """Return ``(centroid, labelCluster, labelPre)`` from cache or by clustering.

    Parameters
    ----------
    dataArrayTest, dataArrayTrain
        Feature matrices; only used when ``m`` is not ``'load'``.  Both are
        passed through ``preprocessing.normalize`` before clustering.
    k
        Unused by the current implementation (the estimator that took a
        cluster count is no longer used); kept for interface compatibility.
    m : str, default ``'load'``
        ``'load'`` reads the three results from the pickle files
        ``centroid``, ``labelCluster`` and ``labelPre`` in the working
        directory.  Any other value fits ``AffinityPropagation`` on the
        normalized training data, predicts the normalized test data and
        writes the three results to those same files.

    Returns
    -------
    tuple
        ``(centroid, labelCluster, labelPre)``.

    Notes
    -----
    The cache files are opened in binary mode inside ``with`` blocks so the
    handles are always closed.  The compute branch now actually pickles its
    results; previously it only truncated the cache files, which made a
    later ``m='load'`` call fail.
    """
    cacheNames = ('centroid', 'labelCluster', 'labelPre')

    if m == 'load':
        loaded = []
        for cacheName in cacheNames:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load cache files that this program wrote itself.
            with open(cacheName, 'rb') as cacheFile:
                loaded.append(pickle.load(cacheFile))
        centroid, labelCluster, labelPre = loaded
    else:
        dataArrayTestNorm = preprocessing.normalize(dataArrayTest)
        dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain)
        clf = AffinityPropagation()
        pre = clf.fit(dataArrayTrainNorm)
        centroid = pre.cluster_centers_
        labelCluster = pre.labels_
        labelPre = clf.predict(dataArrayTestNorm)
        # Write the cache only after every result has been computed, so a
        # failed fit never leaves empty cache files behind.
        for cacheName, value in zip(cacheNames,
                                    (centroid, labelCluster, labelPre)):
            with open(cacheName, 'wb') as cacheFile:
                pickle.dump(value, cacheFile)

    return centroid, labelCluster, labelPre
def ap():
    """Train one AffinityPropagation model per ``p_n`` group and store them.

    Reads the feature table with ``readFeature(db)``, takes the projected
    samples, component count and fitted PCA object from ``do_pca()``, splits
    the samples on the ``p_n`` column of the feature table (1 versus 0),
    fits one model on each split, prints the labels each model predicts for
    its own training split, and hands the models, the PCA object and the
    per-column max/min/mean of the ``data_index`` features to
    ``saveModel2mdb``.  Returns ``None``.
    """
    data_index = [
        "dp_tr", "dp_t", "dq_tr", "dq_t",
        "du_tr", "du_t", "di_tr", "di_t",
        "dp_s", "dq_s", "du_s", "di_s",
        "dp_dq", "first_h", "third_h", "fifth_h",
    ]

    feature = readFeature(db)
    components, n_com, pca_fit = do_pca()

    # Attach the p_n flag so the projected samples can be split by it; only
    # the first n_com columns (the components) are used for training.
    frame = pd.DataFrame(components)
    frame["p_n"] = feature["p_n"].values
    sample_up = frame[frame.p_n == 1].iloc[:, 0:n_com]
    sample_down = frame[frame.p_n == 0].iloc[:, 0:n_com]

    started_at = time.time()
    print("start to do training")
    preference = -0.5
    af_up = AffinityPropagation(damping=0.5, preference=preference).fit(sample_up)
    af_down = AffinityPropagation(damping=0.5, preference=preference).fit(sample_down)
    print("Event number of starting appliances:", af_up.predict(sample_up))
    print("Event number of stoping appliances:", af_down.predict(sample_down))

    # Per-column statistics of the raw features are stored with the models.
    selected = feature.loc[:, data_index]
    payload = [
        af_up,
        af_down,
        pca_fit,
        selected.max(),
        selected.min(),
        selected.mean(),
    ]
    saveModel2mdb(db, payload)
    print("done with training take:", time.time() - started_at, "seconds")
def clustering_affinity_propagation(data_res):
    """Fit a default AffinityPropagation model on *data_res*.

    Returns a 3-tuple: the labels predicted for ``data_res`` itself, the
    fitted model's ``cluster_centers_``, and the fitted model.
    """
    model = AffinityPropagation()
    model.fit(data_res)
    labels = model.predict(data_res)
    return labels, model.cluster_centers_, model
med_def_distance(player_list)

# Stack the three per-player statistics as columns and normalize each
# column (axis=0) before clustering.
features_array = np.column_stack(
    (dribbles_list, def_dist_list, shot_dist_list))
norm_features = normalize(features_array, axis=0)

# Fit the model, then label the same rows it was fitted on.
model = AffinityPropagation()
model.fit(norm_features)
labels = model.predict(norm_features)

# Sanity check: both shapes are printed so they can be compared by eye.
print(player_list.shape)
print(labels.shape)

# Pair every player with its label and order the pairs by the label column.
# NOTE(review): if player_list holds strings the paired array is string-typed
# and the labels would sort lexicographically -- confirm this is intended.
clusters = np.array(list(zip(player_list, labels)))
order = np.argsort(clusters[:, 1])
sorted_clusters = clusters[order]

# Full list of players with their cluster, grouped by cluster.
print(sorted_clusters)
# Mini-batch k-means with 35 clusters; labels are stored as a new df column.
mbk = MiniBatchKMeans(n_clusters=35)
mbk.fit(y_toPredict)
df['Cluster_From_mbk'] = mbk.predict(y_toPredict)

# Agglomerative clustering (35 clusters); fit and label in one step.
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=35)
df['Cluster_From_ac'] = ac.fit_predict(y_toPredict)

# Affinity propagation; the number of clusters is not set directly, only the
# preference value is.
from sklearn.cluster import AffinityPropagation
ap = AffinityPropagation(preference=-35)
ap.fit(y_toPredict)
df['Cluster_From_ap'] = ap.predict(y_toPredict)

# Build, per algorithm, one list of "Job_Title,CompanyName" strings per cluster.
# [[None]] * n only creates n references to a single placeholder list; the
# loop right after replaces every slot with its own fresh list.
nameListKmeans = [[None]] * num_cluster
for i in range(0, num_cluster):
    nameListKmeans[i] = list()
# The cluster label stored in the row is used directly as the list index.
for index, row in df.iterrows():
    nameListKmeans[row['Cluster_From_km']].append(row['Job_Title'] + "," + row['CompanyName'])
# Same construction for the Birch labels.
nameListBirch = [[None]] * num_cluster
for i in range(0, num_cluster):
    nameListBirch[i] = list()
for index, row in df.iterrows():
    nameListBirch[row['Cluster_From_birch']].append(row['Job_Title'] + "," +
def test_affinity_propagation_predict():
    """predict() on the training data must reproduce the fit_predict() labels."""
    estimator = AffinityPropagation(affinity="euclidean")
    fitted_labels = estimator.fit_predict(X)
    predicted_labels = estimator.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
print("Calculate AP codebook for quantization ...")
# Fit affinity propagation on the pooled training features; every cluster
# it finds acts as one codeword.
af = AffinityPropagation().fit(resnet50_train_overall)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
AP_codebook_size = len(cluster_centers_indices)

# Replace every raw frame feature of the training matrix by the index of the
# AP cluster it is assigned to.  The former k-means prediction on the same
# frame was overwritten immediately and has been removed as dead work.
bovw_matrix_train = np.zeros((raw_matrix_train.shape[0], raw_matrix_train.shape[1]))
for i in xrange(bovw_matrix_train.shape[0]):
    for j in xrange(bovw_matrix_train.shape[1]):
        current_frame_rawfeature = raw_matrix_train[i, j]
        # predict() expects a 2-D array, hence the reshape to one row.
        current_frame_w = af.predict(current_frame_rawfeature.reshape(1, -1))[0]
        bovw_matrix_train[i, j] = int(current_frame_w)

# Same quantization for the test matrix.
bovw_matrix_test = np.zeros((raw_matrix_test.shape[0], raw_matrix_test.shape[1]))
for i in xrange(bovw_matrix_test.shape[0]):
    for j in xrange(bovw_matrix_test.shape[1]):
        current_frame_rawfeature = raw_matrix_test[i, j]
        current_frame_w = af.predict(current_frame_rawfeature.reshape(1, -1))[0]
        bovw_matrix_test[i, j] = int(current_frame_w)
0.32486058, 0.59739425, 0.24806902, 0.98009566, 0.44359849, 0.50656873, 0.65229741, 0.62491293, 0.48583292, 0.91480856, 0.22179706, 0.49348, 0.52367377, 0.73338162, 0.63712822, 0.39172376, 0.8466613, 0.84700926, 0.72830164, 1. ]])
# Names used as reference labels for the rows of X (twenty entries).
y = [
    'INTJ', 'Jesus', 'ice', 'ENFP', 'Reagan', 'INTP', 'Badass', 'Charlie',
    'Dennis', 'Mac', 'Cherry', 'ISTJ', 'God', 'Guns', 'Vanilla', 'Fire',
    'seventeen', 'Frank', 'Dee', 'Snail'
]
#print(X)
# Affinity propagation with preference=0; print the label predicted for each
# row of X, then score the fitted labels against the names in y.
af = AffinityPropagation(preference=0).fit(X)
print(af.predict(X))
labels = af.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
# k-means with five clusters on the same matrix for comparison.
# NOTE(review): y is passed to fit(); scikit-learn's KMeans ignores it.
kmeans = KMeans(n_clusters=5, init='k-means++')
kmeans.fit(X, y)
print(kmeans.predict(X))
print(y)
print(kmeans.score(X))
#fpc = fuzz.cluster.cmeans( X, 3, 2, error=0.005, maxiter=1000, init=None)
# Store fpc values for later
#print(fpc)
def doAffinity(X, damping=0.5, max_iter=2, affinity='euclidean'):
    """Cluster *X* with affinity propagation.

    Parameters
    ----------
    X
        Data passed to both ``fit`` and ``predict``.
    damping : float, default 0.5
        Forwarded to ``AffinityPropagation``.
    max_iter : int, default 2
        Forwarded to ``AffinityPropagation``.  The default of 2 is kept only
        for backward compatibility.
        NOTE(review): two iterations are unlikely to be enough for the
        algorithm to converge -- pass a larger value for real use.
    affinity : str, default ``'euclidean'``
        Forwarded to ``AffinityPropagation``.

    Returns
    -------
    tuple
        ``(labels, cluster_centers)``: the labels predicted for ``X`` and
        the fitted model's ``cluster_centers_``.
    """
    model = AffinityPropagation(damping=damping, max_iter=max_iter,
                                affinity=affinity)
    model.fit(X)
    clust_labels2 = model.predict(X)
    cent2 = model.cluster_centers_
    return (clust_labels2, cent2)
# Affinity propagation demo on a synthetic two-feature data set.
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot

# 1000 samples, two informative features, one cluster per class; the seed
# makes the data reproducible.
X, _ = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=4,
)

# Build and fit the model, then label every sample it was fitted on.
model = AffinityPropagation(damping=0.9)
model.fit(X)
yhat = model.predict(X)

# One scatter call per distinct label so each cluster gets its own colour.
clusters = unique(yhat)
for cluster in clusters:
    row_ix = where(yhat == cluster)
    first_feature = X[row_ix, 0]
    second_feature = X[row_ix, 1]
    pyplot.scatter(first_feature, second_feature)

pyplot.show()
# img_in = Image.open('test.jpg')
# Load the image named on the command line and scale it to floats in [0, 1].
img_in = Image.open(sys.argv[1])
img_in = np.array(img_in, dtype=np.float64) / 255

# NOTE(review): for an array built from a PIL image the first axis is rows,
# so `w` most likely holds the height -- the names are used consistently
# below, so behaviour is unaffected.
w, h, d = original_shape = tuple(img_in.shape)
assert d == 3
# Flatten to one row per pixel.
image_array = img_in.reshape((w * h, d))

print("Fitting model on a small sub-sample of the data")
image_array_sample = shuffle(image_array, random_state=0)[:500]

print("Predicting color indices on the full image (affinity propagation)")
t0 = time()
af_prop = AffinityPropagation(
    max_iter=200, damping=0.9, convergence_iter=50)
af_prop = af_prop.fit(image_array_sample)
labels_af = af_prop.predict(image_array)
print("done in %0.3fs." % (time() - t0))


def recreate_image(codebook, labels, w, h):
    """Recreate the (compressed) image from the code book & labels.

    ``labels`` is read in row-major order: entry ``i * h + j`` selects the
    codebook row written to ``image[i, j]``.  Returns a ``(w, h, d)`` array
    where ``d`` is the number of codebook columns.
    """
    d = codebook.shape[1]
    image = np.zeros((w, h, d))
    for label_idx, (i, j) in enumerate(np.ndindex(w, h)):
        image[i, j] = codebook[labels[label_idx]]
    return image
不需要确定聚类的数量 缺点: 复杂度高,O(N^2 * T),T迭代次数,N样本数 适合中小数据集 '''
# Create the estimator, fit it, predict labels, and look at the resulting centers.
cluster = AffinityPropagation(damping=0.5, max_iter=500, convergence_iter=20, copy=True, preference=-50, affinity='euclidean', verbose=False)
cluster.fit(X)
Y_pred = cluster.predict(X)
# The two bare attribute expressions below have no effect in a script; they
# only display a value when run interactively.
cluster.cluster_centers_
cluster.cluster_centers_indices_ # indices of the cluster centers
# Plot the samples coloured by predicted label.
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=Y_pred)
# The string below holds the author's parameter notes (in Chinese), roughly:
# damping - damping factor in [0.5, 1]; max_iter - maximum iterations;
# convergence_iter - iterations without change in the estimated number of
# clusters before stopping; copy - copy the input data; preference - very
# important, per-point preference; affinity - how affinity is computed, e.g.
# Euclidean distance; verbose - marked by the author as not understood.
''' damping 阻尼系数,在[0.5,1]之间 max_iter 最大迭代数 convergence_iter 停止收敛的估计簇数量没有变化的迭代次数 copy 复制输入数据 preference 很重要,数据点偏好 affinity 如何计算亲和度,比如欧氏距离 verbose 不懂 '''
# NOTE(review): `item` is bound outside this excerpt; these two assignments
# presumably belong to a loop over it -- confirm the original indentation.
# Each metric is copied to the plotting array shifted by one position.
mini_batch_valid_performance_metrics_for_plotting[item + 1] = mini_batch_valid_performance_metric_array[item]
mini_batch_test_performance_metrics_for_plotting[item + 1] = mini_batch_test_performance_metric_array[item]
# Save the validation/test metric curves for the MiniBatch K-Means search.
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(mini_batch_parameter_search_space_for_plotting, mini_batch_valid_performance_metrics_for_plotting, mini_batch_test_performance_metrics_for_plotting, 'Adjusted Mutual Information Score', 'MiniBatch K-Means Clustering n_init parameter', 'Mini_Batch_k-Means_Performance', 0, 0.5)
# Do AffinityPropagation, optimizing damping over a validation set
# The initial model uses damping 0.5; it is fitted on the training set and
# then used to predict the validation and test sets.
current_optimal_affinity_propagation_parameter = 0.5
initial_optimal_affinity_propagation_clusterer = AffinityPropagation(damping=current_optimal_affinity_propagation_parameter)
initial_optimal_affinity_propagation_clusterer.fit(train_data_set)
initial_affinity_propagation_valid_predictions = initial_optimal_affinity_propagation_clusterer.predict(valid_data_set)
initial_affinity_propagation_test_predictions = initial_optimal_affinity_propagation_clusterer.predict(test_data_set)
# Add one to the predictions to make them match up with range of labels, then apply Hungarian Fix
for element in range(number_of_valid_observations):
    initial_affinity_propagation_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_affinity_propagation_test_predictions[element] += 1
# Hungarian_Fix receives the shifted predictions and the true labels; its
# result is cast to int.
initial_affinity_propagation_valid_predictions = Clustering.Hungarian_Fix(initial_affinity_propagation_valid_predictions, valid_labels).astype('int')
initial_affinity_propagation_test_predictions = Clustering.Hungarian_Fix(initial_affinity_propagation_test_predictions, test_labels).astype('int')
# Set a starting point for optimality of the initial performance metric, to be possibly adjusted later
# The damping search space runs from 0.05 to 0.45 above the initial value.
affinity_propagation_parameter_integer_search_space_start = current_optimal_affinity_propagation_parameter + 0.05
affinity_propagation_parameter_integer_search_space_stop = current_optimal_affinity_propagation_parameter + 0.45
lowercase=True, stop_words="english")
# Vectorize the first n_samples documents: raw term counts first, then
# L2-normalized TF-IDF weights.
print("Vectorizing...")
t0 = time()
samples = dataset.data[:args.n_samples]
counts = vectorizer.fit_transform(samples)
tfidf = text.TfidfTransformer(norm="l2", use_idf=True).fit_transform(counts)
print("done in %0.3fs." % (time() - t0))
# Fit the model on the TF-IDF matrix and report how long it took.
print("Fitting the model on with n_samples=%d and n_features=%d..." % (args.n_samples, args.n_features))
t0 = time()
d = Decomposition()
nmf = d.fit(tfidf)
print("done in %0.3fs." % (time() - t0))
# Predict a label for every sample with the fitted model.
print("Predicting labels...")
t0 = time()
labels = d.predict(tfidf)
print("done in %0.3fs." % (time() - t0))
# Print every document next to its predicted label.
for sample, label in izip(samples, labels):
    print(sample, label)
clustering = AffinityPropagation().fit(df) # In[14]: AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True, damping=0.5, max_iter=200, preference=None, verbose=False) # In[15]: y_pred = clustering.predict(df) # In[16]: plt.scatter(df[" gdp_for_year ($) "], df["suicides_no"], c=y_pred) plt.title("Clusters") plt.xlabel("PIB") plt.ylabel("Número de suicídios") # In[29]: # Clustering utilizando el algoritmo aglomerativo # In[18]: preds = []
def _scatter_clusters(points, labels, cmap, title):
    """Open a new 12x8 figure and scatter *points* coloured by *labels*."""
    plt.figure(figsize=(12, 8))
    plt.scatter(points[:, 0], points[:, 1], c=labels, cmap=cmap)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(title, fontdict=dict(size=20, color='r'))


# model8 is created outside this excerpt; it is fitted here and its centers
# are printed under the 'MeanShift' heading before predicting labels.
model8.fit(x)
print('\nMeanShift:')
print(model8.cluster_centers_)
ypred8 = model8.predict(x)
_scatter_clusters(x, ypred8, 'jet', 'MeanShift')

# Spectral clustering is fitted and labelled in one fit_predict call.
model9 = SpectralClustering(
    n_clusters=4,
    eigen_solver=None,
    random_state=1,
    gamma=1,
    affinity='rbf',
    n_neighbors=10,
)
ypred9 = model9.fit_predict(x)
_scatter_clusters(x, ypred9, 'rainbow', 'SpectralClustering')

# Affinity propagation: fit, print the centers, then predict the same data.
model10 = AffinityPropagation(damping=.8)
model10.fit(x)
print('\nAffinityPropagation:')
print(model10.cluster_centers_)
ypred10 = model10.predict(x)
_scatter_clusters(x, ypred10, 'brg', 'AffinityPropagation')

plt.show()
# Load the glass data: the first line is skipped, every other line is a
# ", "-separated record whose last field is the class label (with any
# '.' characters at its ends stripped) and whose other fields are floats.
X = []
y = []
with open('./data/glass.data') as data_file:
    # The context manager closes the file even if parsing fails; iterating
    # the handle avoids mixing next() with readlines().
    next(data_file, None)
    for line in data_file:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise add an empty feature row.
            continue
        curLine = stripped.split(", ")
        X.append([float(i) for i in curLine[0:-1]])
        y.append(curLine[-1].strip('.'))

# iterate over classifiers-------------------------------------------
# Sweep the preference from -90 to -5 in steps of 5 and score every
# clustering against the reference labels with the adjusted Rand index.
glass_score = []
params = range(-90, 0, 5)
for param in params:
    algorithm = AffinityPropagation(preference=param)
    algorithm.fit(X)
    y_pred = algorithm.predict(X)
    s = adjusted_rand_score(y, y_pred)
    glass_score.append(s)
print('glass_score', glass_score)

# draw score pic---------------------------------------
plt.figure(figsize=(6, 4), dpi=120)
plt.grid()
plt.xlabel('preference for AP')
plt.xticks(params)
plt.plot(params, glass_score, label='glass_score', color='g')
plt.legend()
plt.title("glass AP score")
plt.savefig("img/AP.png")
# Build the estimators that are compared below; affinity_model is created
# before this excerpt.
agglomerative_model = AgglomerativeClustering(n_clusters=2)
birch_model = Birch(threshold=0.03, n_clusters=2)
dbscan_model = DBSCAN(eps=0.25, min_samples=9)
kmeans_model = KMeans(n_clusters=2)
mean_model = MeanShift()
optics_model = OPTICS(eps=0.75, min_samples=10)
gaussian_model = GaussianMixture(n_components=2)
# train the model
# Only the four estimators that are later used through predict() are fitted
# here; the others are fitted by their fit_predict() call below.
affinity_model.fit(training_data)
birch_model.fit(training_data)
kmeans_model.fit(training_data)
gaussian_model.fit(training_data)
# assign each data point to a cluster
affinity_result = affinity_model.predict(training_data)
agglomerative_result = agglomerative_model.fit_predict(training_data)
birch_result = birch_model.predict(training_data)
dbscan_result = dbscan_model.fit_predict(training_data)
kmeans_result = kmeans_model.predict(training_data)
mean_result = mean_model.fit_predict(training_data)
optics_result = optics_model.fit_predict(training_data)
gaussian_result = gaussian_model.predict(training_data)
# get all of the unique clusters
affinity_clusters = unique(affinity_result)
agglomerative_clusters = unique(agglomerative_result)
birch_clusters = unique(birch_result)
dbscan_clusters = unique(dbscan_result)
kmeans_clusters = unique(kmeans_result)
mean_clusters = unique(mean_result)
def clusterPlot(data):
    """Embed *data* in 2-D with UMAP, cluster it and plot the result.

    When the module flag ``NORMALIZE`` is set, the rows are first normalized
    with the norm named by ``NORM``.  The embedding is clustered with
    ``AffinityPropagation(random_state=0)``; the cluster predicted for every
    point of a fine mesh over the embedding is drawn as a background image,
    the embedded samples are drawn on top as black dots and annotated with
    ``E[i]["label"]``.  The figure is saved as a PDF (name built from
    ``CLASSIFIED_ONLY`` and ``NORM``) and as ``cluster.png`` and then shown.
    Returns ``None``.
    """
    if NORMALIZE:
        data = preprocessing.normalize(data, norm=NORM)

    # Alternatives kept out of the code path in the original: PCA for the
    # reduction, KMeans or DBSCAN for the clustering.
    reduced_data = umap.UMAP(n_neighbors=15).fit_transform(data)
    print(reduced_data)
    clustering = AffinityPropagation(random_state=0).fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    # NOTE(review): with a step of 0.001 the mesh can become very large for
    # a wide embedding -- confirm memory use is acceptable.
    h = 0.001
    xs = reduced_data[:, 0]
    ys = reduced_data[:, 1]

    # Plot the decision boundary.  For that, a colour is assigned to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    x_min, x_max = xs.min() - 0.2, xs.max() + 0.2
    y_min, y_max = ys.min() - 0.2, ys.max() + 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Label every mesh point with the fitted model and restore the mesh shape.
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clustering.predict(mesh_points).reshape(xx.shape)

    plt.figure(figsize=[30, 20])
    plt.clf()
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )
    plt.plot(xs, ys, "k.", markersize=5)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # One text label per sample.
    # NOTE(review): the count comes from D but the labels from E -- this
    # presumably relies on both having the same length and order.
    texts = [
        plt.text(reduced_data[i][0], reduced_data[i][1], E[i]["label"])
        for i in range(len(D))
    ]
    if ADJUST_TEXT:
        adjust_text(texts, lim=10)

    plt.tight_layout()
    pdf_name = ("cluster-bagofwords-"
                + ("classifiedonly-" if CLASSIFIED_ONLY else "")
                + NORM + ".pdf")
    plt.savefig(pdf_name, pad_inches=0)
    plt.savefig("cluster.png", pad_inches=0)
    plt.show()
from sklearn.datasets import make_moons, make_circles, make_blobs
from sklearn.cluster import AffinityPropagation
import numpy as np

centers = [[0, 1], [-1, -1], [1, -1]]

# Generate blobs and apply a fixed linear transformation to the points.
X, y = make_blobs(n_samples=1500, random_state=170)
trs = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X = np.dot(X, trs)

clt = AffinityPropagation(damping=.9)
clt.fit(X)

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

# Adjust the figure style.
mpl.style.use('fivethirtyeight')

# Build an x/y grid (step 0.1, padded by 0.5 on every side) for the
# filled-contour plot.
x_min = X[:, 0].min() - .5
x_max = X[:, 0].max() + .5
y_min = X[:, 1].min() - .5
y_max = X[:, 1].max() + .5
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.1),
    np.arange(y_min, y_max, 0.1),
)

# Predict the cluster of every grid point, then of every sample.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = clt.predict(grid_points)
Z = Z.reshape(xx.shape)
yp = clt.predict(X)

plt.contourf(xx, yy, Z, alpha=.8)

# Draw the samples on top, coloured by their predicted cluster.
plt.scatter(X[:, 0], X[:, 1], c=yp, edgecolors='k')
plt.axis("equal")
plt.show()
data # In[38]: from sklearn.cluster import AffinityPropagation clustering = AffinityPropagation().fit(data) print(clustering) # In[ ]: # In[ ]: # In[42]: re = clustering.predict(data) # In[26]: le = len(clustering.cluster_centers_) cluster_centers_indices = clustering.cluster_centers_indices_ labels = clustering.labels_ n_clusters_ = len(cluster_centers_indices) print(n_clusters_) # In[27]: from sklearn.cluster import KMeans km = KMeans(n_clusters=le, random_state=1) new = data._get_numeric_data()
def main(): parser = argparse.ArgumentParser( description='This code contains the RF model building. ') # Required req_group = parser.add_argument_group(title='REQUIRED INPUT') req_group.add_argument( '-df_short_name', help= 'feature matrix, for Set B, use the short name, for Set A, use the full name of the expression matrix', required=True) req_group.add_argument('-path', help='path to the feature matrix', required=True) req_group.add_argument('-save_path', help='path to save the outputs', required=True) req_group.add_argument('-clustering_method', help='kmean, affinity, birch, or meanshift', required=True) req_group.add_argument('-test_gene_list', help='Genes_for_testing.txt', required=True) req_group.add_argument('-train_gene_list', help='Genes_for_training.txt', required=True) req_group.add_argument('-dataset', help='setA or setB', required=True) if len(sys.argv) == 1: parser.print_help() sys.exit(0) args = parser.parse_args() DF = args.df_short_name path = args.path save_path = args.save_path clustering_method = args.clustering_method TEST = args.test_gene_list TRAIN = args.train_gene_list dataset = args.dataset with open(TEST) as test_file: test = test_file.read().splitlines() with open(TRAIN) as training_file: training = training_file.read().splitlines() if dataset == 'setB': df = pd.read_csv(path + DF + '_CV_1_features.txt', sep='\t', index_col=0) short_name = DF if dataset == 'setA': expression = pd.read_csv(path + DF, sep='\t', index_col=0) pathway_annotation = pd.read_csv( 'Sly_pathway_annotation_20190117_with_expression_5_members_nonoverlapping.txt', sep='\t', index_col=1, header=None) pathway_annotation.columns = ['Class'] df = pd.concat([pathway_annotation, expression], axis=1) short_name = open( '/mnt/home/peipeiw/Documents/Pathway_prediction/20180827_all_EC_pathway/Short_name_for_expression_data.txt', 'r').readlines() D = {} for inl in short_name: D[inl.split('\t')[0]] = inl.split('\t')[1].strip() short_name = D[DF] y = df['Class'] classes = 
y.unique() df_test = df[df.index.isin(test)] y_test = df_test['Class'] X_test = df_test.drop(['Class'], axis=1) df_training = df[df.index.isin(training)] y_training = df_training['Class'] X_training = df_training.drop(['Class'], axis=1) test_classes = y_test.unique() if clustering_method.lower() == 'kmean': for n_clusters in [5, 10, 25, 50, 85, 100, 200, 300, 400, 500]: accuracies = [] accuracies_ho = [] f1_array = np.array( [np.insert(arr=classes.astype(np.str), obj=0, values='M')]) f1_array_ho = np.array([ np.insert(arr=test_classes.astype(np.str), obj=0, values='M') ]) for cv_number in range(1, 6): if dataset == 'setB': df = pd.read_csv(path + DF + '_CV_%s_features.txt' % cv_number, sep='\t', index_col=0) with open('Genes_for_5_training_set%s.txt' % cv_number) as train_file: train = train_file.read().splitlines() with open('Genes_for_5_validation_set%s.txt' % cv_number) as validation_file: validation = validation_file.read().splitlines() df_train = df[df.index.isin(train)] df_validation = df[df.index.isin(validation)] X_train = df_train.drop(['Class'], axis=1) X_validation = df_validation.drop(['Class'], axis=1) y_train = df_train['Class'] y_validation = df_validation['Class'] mat = X_train.as_matrix() # Convert DataFrame to matrix mat_validation = X_validation.as_matrix() mat_test = X_test.as_matrix() clu = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=3, n_jobs=5, max_iter=500) # Using sklearn clu.fit(mat) train_labels = clu.labels_ # Get cluster assignment labels train_tem = pd.DataFrame([train_labels ]).T # Format results as a DataFrame train_tem.index = X_train.index train_tem.columns = ['Cluster'] train_res = pd.concat([y_train, train_tem], axis=1) E_C_P = Enrichment_clustering(train_res, n_clusters) joblib.dump( clu, save_path + short_name + "_Kmeans_%s_%s_%s.pkl" % (dataset, cv_number, n_clusters)) cv_labels = clu.predict(mat_validation) cv_tem = pd.DataFrame([cv_labels]).T cv_tem.index = X_validation.index cv_tem.columns = ['Cluster'] cv_res = 
pd.concat([y_validation, cv_tem], axis=1) for i in range(0, cv_res.shape[0]): try: cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]] except: cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1] print('%s was not enriched for any pathway' % cv_res.iloc[i, 1]) if cv_number == 1: predicted = cv_res.copy() else: predicted = pd.concat([predicted, cv_res], axis=0) result = Performance_MC(cv_res.Class, cv_res.Cluster, classes) if 'accuracy' in result: accuracies.append(result['accuracy']) if 'macro_f1' in result: f1_temp_array = np.insert(arr=result['f1_MC'], obj=0, values=result['macro_f1']) f1_array = np.append(f1_array, [f1_temp_array], axis=0) test_labels = clu.predict(mat_test) test_tem = pd.DataFrame([test_labels]).T test_tem.index = X_test.index test_tem.columns = ['Cluster'] test_res = pd.concat([y_test, test_tem], axis=1) for i in range(0, test_res.shape[0]): try: test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]] except: test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1] print('%s was not enriched for any pathway' % test_res.iloc[i, 1]) if cv_number == 1: predicted_test = test_res.copy() else: predicted_test = pd.concat( [predicted_test, test_res.Cluster], axis=1) ho_result = Performance_MC(test_res.Class, test_res.Cluster, test_classes) if 'accuracy' in ho_result: accuracies_ho.append(ho_result['accuracy']) if 'macro_f1' in ho_result: ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'], obj=0, values=ho_result['macro_f1']) f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array], axis=0) f1 = pd.DataFrame(f1_array) f1.columns = f1.iloc[0] f1 = f1[1:] f1.columns = [str(col) + '_F1' for col in f1.columns] f1 = f1.astype(float) # Calculate accuracy and f1 stats AC = np.mean(accuracies) AC_std = np.std(accuracies) MacF1 = f1['M_F1'].mean() MacF1_std = f1['M_F1'].std() print('Save the predicted values:') predicted.to_csv(save_path + short_name + "_Kmean_%s_%s_validation_prediction.txt" % (dataset, n_clusters), index=True, header=True, sep="\t") predicted_test.to_csv(save_path + 
short_name + "_Kmean_%s_%s_test_prediction.txt" % (dataset, n_clusters), index=True, header=True, sep="\t") print( "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC, AC_std, MacF1, MacF1_std)) # Unpack results for test f1_ho = pd.DataFrame(f1_array_ho) f1_ho.columns = f1_ho.iloc[0] f1_ho = f1_ho[1:] f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns] f1_ho = f1_ho.astype(float) AC_ho = np.mean(accuracies_ho) AC_std_ho = np.std(accuracies_ho) MacF1_ho = f1_ho['M_F1'].mean() MacF1_std_ho = f1_ho['M_F1'].std() print( "\nCluster results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) # Save detailed results file n_features = df.shape[1] - 1 if clustering_method.lower() == 'kmean': out = open( save_path + short_name + "_Kmean_%s_%s_results.txt" % (dataset, n_clusters), 'w') if clustering_method.lower() == 'affinity': out = open( save_path + short_name + "_AffinityPropagation_%s_%s_%s_results.txt" % (dataset, damping, n_clusters), 'w') out.write('\n\nResults for prediction on validation set:\n') out.write( 'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n' % (AC, AC_std, MacF1, MacF1_std)) for cla in f1.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1[cla]), np.std(f1[cla]))) # Add results for test out.write('\n\nResults for the test set:\n') out.write( 'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) for cla in f1_ho.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla]))) out.close() if clustering_method.lower() == 'affinity': for damping in [0.5, 0.6, 0.7, 0.8, 0.9, 0.99]: accuracies = [] accuracies_ho = [] f1_array = np.array( [np.insert(arr=classes.astype(np.str), obj=0, values='M')]) accuracies_ho = [] f1_array_ho = np.array([ 
np.insert(arr=test_classes.astype(np.str), obj=0, values='M') ]) for cv_number in range(1, 6): if dataset == 'setB': df = pd.read_csv(path + DF + '_CV_%s_features.txt' % cv_number, sep='\t', index_col=0) with open('Genes_for_5_training_set%s.txt' % cv_number) as train_file: train = train_file.read().splitlines() with open('Genes_for_5_validation_set%s.txt' % cv_number) as validation_file: validation = validation_file.read().splitlines() df_train = df[df.index.isin(train)] df_validation = df[df.index.isin(validation)] X_train = df_train.drop(['Class'], axis=1) X_validation = df_validation.drop(['Class'], axis=1) y_train = df_train['Class'] y_validation = df_validation['Class'] mat = X_train.as_matrix() # Convert DataFrame to matrix mat_validation = X_validation.as_matrix() mat_test = X_test.as_matrix() clu = AffinityPropagation(damping=damping) clu.fit(mat) train_labels = clu.labels_ # Get cluster assignment labels n_clusters = len(np.unique(train_labels)) train_tem = pd.DataFrame([train_labels ]).T # Format results as a DataFrame train_tem.index = X_train.index train_tem.columns = ['Cluster'] train_res = pd.concat([y_train, train_tem], axis=1) E_C_P = Enrichment_clustering(train_res, n_clusters) joblib.dump( clu, save_path + short_name + "_AffinityPropagation_%s_%s_%s.pkl" % (dataset, cv_number, damping)) cv_labels = clu.predict(mat_validation) cv_tem = pd.DataFrame([cv_labels]).T cv_tem.index = X_validation.index cv_tem.columns = ['Cluster'] cv_res = pd.concat([y_validation, cv_tem], axis=1) for i in range(0, cv_res.shape[0]): try: cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]] except: cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1] print('%s was not enriched for any pathway' % cv_res.iloc[i, 1]) if cv_number == 1: predicted = cv_res.copy() else: predicted = pd.concat([predicted, cv_res], axis=0) result = Performance_MC(cv_res.Class, cv_res.Cluster, classes) if 'accuracy' in result: accuracies.append(result['accuracy']) if 'macro_f1' in result: f1_temp_array = 
np.insert(arr=result['f1_MC'], obj=0, values=result['macro_f1']) f1_array = np.append(f1_array, [f1_temp_array], axis=0) test_labels = clu.predict(mat_test) test_tem = pd.DataFrame([test_labels]).T test_tem.index = X_test.index test_tem.columns = ['Cluster'] test_res = pd.concat([y_test, test_tem], axis=1) for i in range(0, test_res.shape[0]): try: test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]] except: test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1] print('%s was not enriched for any pathway' % test_res.iloc[i, 1]) if cv_number == 1: predicted_test = test_res.copy() else: predicted_test = pd.concat( [predicted_test, test_res.Cluster], axis=1) ho_result = Performance_MC(test_res.Class, test_res.Cluster, test_classes) if 'accuracy' in ho_result: accuracies_ho.append(ho_result['accuracy']) if 'macro_f1' in ho_result: ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'], obj=0, values=ho_result['macro_f1']) f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array], axis=0) f1 = pd.DataFrame(f1_array) f1.columns = f1.iloc[0] f1 = f1[1:] f1.columns = [str(col) + '_F1' for col in f1.columns] f1 = f1.astype(float) # Calculate accuracy and f1 stats AC = np.mean(accuracies) AC_std = np.std(accuracies) MacF1 = f1['M_F1'].mean() MacF1_std = f1['M_F1'].std() print('Save the predicted values:') predicted.to_csv( save_path + short_name + "_AffinityPropagation_%s_%s_%s_validation_prediction.txt" % (dataset, damping, n_clusters), index=True, header=True, sep="\t") predicted_test.to_csv( save_path + short_name + "_AffinityPropagation_%s_%s_%s_test_prediction.txt" % (dataset, damping, n_clusters), index=True, header=True, sep="\t") print( "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC, AC_std, MacF1, MacF1_std)) # Unpack results for test f1_ho = pd.DataFrame(f1_array_ho) f1_ho.columns = f1_ho.iloc[0] f1_ho = f1_ho[1:] f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns] f1_ho = f1_ho.astype(float) AC_ho = 
np.mean(accuracies_ho) AC_std_ho = np.std(accuracies_ho) MacF1_ho = f1_ho['M_F1'].mean() MacF1_std_ho = f1_ho['M_F1'].std() print( "\nCluster results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) # Save detailed results file n_features = df.shape[1] - 1 if clustering_method.lower() == 'kmean': out = open( save_path + short_name + "_Kmean_%s_%s_results.txt" % (dataset, n_clusters), 'w') if clustering_method.lower() == 'affinity': out = open( save_path + short_name + "_AffinityPropagation_%s_%s_%s_results.txt" % (dataset, damping, n_clusters), 'w') out.write('\n\nResults for prediction on validation set:\n') out.write( 'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n' % (AC, AC_std, MacF1, MacF1_std)) for cla in f1.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1[cla]), np.std(f1[cla]))) # Add results for test out.write('\n\nResults for test set:\n') out.write( 'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) for cla in f1_ho.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla]))) out.close() if clustering_method.lower() == 'birch': for n_clusters in [5, 10, 25, 50, 85, 100, 200, 300, 400, 500]: accuracies = [] accuracies_ho = [] f1_array = np.array( [np.insert(arr=classes.astype(np.str), obj=0, values='M')]) accuracies_ho = [] f1_array_ho = np.array([ np.insert(arr=test_classes.astype(np.str), obj=0, values='M') ]) for cv_number in range(1, 6): if dataset == 'setB': df = pd.read_csv(path + DF + '_CV_%s_features.txt' % cv_number, sep='\t', index_col=0) with open('Genes_for_5_training_set%s.txt' % cv_number) as train_file: train = train_file.read().splitlines() with open('Genes_for_5_validation_set%s.txt' % cv_number) as validation_file: validation = validation_file.read().splitlines() df_train = df[df.index.isin(train)] df_validation = 
df[df.index.isin(validation)] X_train = df_train.drop(['Class'], axis=1) X_validation = df_validation.drop(['Class'], axis=1) y_train = df_train['Class'] y_validation = df_validation['Class'] mat = X_train.as_matrix() # Convert DataFrame to matrix mat_validation = X_validation.as_matrix() mat_test = X_test.as_matrix() clu = Birch(n_clusters=n_clusters) clu.fit(mat) train_labels = clu.labels_ # Get cluster assignment labels n_clusters = len(np.unique(train_labels)) train_tem = pd.DataFrame([train_labels ]).T # Format results as a DataFrame train_tem.index = X_train.index train_tem.columns = ['Cluster'] train_res = pd.concat([y_train, train_tem], axis=1) E_C_P = Enrichment_clustering(train_res, n_clusters) joblib.dump( clu, save_path + short_name + "_Birch_%s_%s_%s.pkl" % (dataset, cv_number, n_clusters)) cv_labels = clu.predict(mat_validation) cv_tem = pd.DataFrame([cv_labels]).T cv_tem.index = X_validation.index cv_tem.columns = ['Cluster'] cv_res = pd.concat([y_validation, cv_tem], axis=1) for i in range(0, cv_res.shape[0]): try: cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]] except: cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1] print('%s was not enriched for any pathway' % cv_res.iloc[i, 1]) if cv_number == 1: predicted = cv_res.copy() else: predicted = pd.concat([predicted, cv_res], axis=0) result = Performance_MC(cv_res.Class, cv_res.Cluster, classes) if 'accuracy' in result: accuracies.append(result['accuracy']) if 'macro_f1' in result: f1_temp_array = np.insert(arr=result['f1_MC'], obj=0, values=result['macro_f1']) f1_array = np.append(f1_array, [f1_temp_array], axis=0) test_labels = clu.predict(mat_test) test_tem = pd.DataFrame([test_labels]).T test_tem.index = X_test.index test_tem.columns = ['Cluster'] test_res = pd.concat([y_test, test_tem], axis=1) for i in range(0, test_res.shape[0]): try: test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]] except: test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1] print('%s was not enriched for any pathway' % test_res.iloc[i, 
1]) if cv_number == 1: predicted_test = test_res.copy() else: predicted_test = pd.concat( [predicted_test, test_res.Cluster], axis=1) ho_result = Performance_MC(test_res.Class, test_res.Cluster, test_classes) if 'accuracy' in ho_result: accuracies_ho.append(ho_result['accuracy']) if 'macro_f1' in ho_result: ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'], obj=0, values=ho_result['macro_f1']) f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array], axis=0) f1 = pd.DataFrame(f1_array) f1.columns = f1.iloc[0] f1 = f1[1:] f1.columns = [str(col) + '_F1' for col in f1.columns] f1 = f1.astype(float) # Calculate accuracy and f1 stats AC = np.mean(accuracies) AC_std = np.std(accuracies) MacF1 = f1['M_F1'].mean() MacF1_std = f1['M_F1'].std() print('Save the predicted values:') predicted.to_csv(save_path + short_name + "_Birch_%s_%s_validation_prediction.txt" % (dataset, n_clusters), index=True, header=True, sep="\t") predicted_test.to_csv(save_path + short_name + "_Birch_%s_%s_test_prediction.txt" % (dataset, n_clusters), index=True, header=True, sep="\t") print( "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC, AC_std, MacF1, MacF1_std)) # Unpack results for test f1_ho = pd.DataFrame(f1_array_ho) f1_ho.columns = f1_ho.iloc[0] f1_ho = f1_ho[1:] f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns] f1_ho = f1_ho.astype(float) AC_ho = np.mean(accuracies_ho) AC_std_ho = np.std(accuracies_ho) MacF1_ho = f1_ho['M_F1'].mean() MacF1_std_ho = f1_ho['M_F1'].std() print( "\nCluster Results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) # Save detailed results file n_features = df.shape[1] - 1 out = open( save_path + short_name + "_Birch_%s_%s_results.txt" % (dataset, n_clusters), 'w') out.write('\n\nResults for prediction on validation set:\n') out.write( 'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n' % (AC, 
AC_std, MacF1, MacF1_std)) for cla in f1.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1[cla]), np.std(f1[cla]))) # Add results for test out.write('\n\nResults for test set:\n') out.write( 'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) for cla in f1_ho.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla]))) out.close() if clustering_method.lower() == 'meanshift': for bandwidth in [0.01, 0.05, 0.1, 0.5, 1]: accuracies = [] accuracies_ho = [] f1_array = np.array( [np.insert(arr=classes.astype(np.str), obj=0, values='M')]) accuracies_ho = [] f1_array_ho = np.array([ np.insert(arr=test_classes.astype(np.str), obj=0, values='M') ]) for cv_number in range(1, 6): if dataset == 'setB': df = pd.read_csv(path + DF + '_CV_%s_features.txt' % cv_number, sep='\t', index_col=0) with open('Genes_for_5_training_set%s.txt' % cv_number) as train_file: train = train_file.read().splitlines() with open('Genes_for_5_validation_set%s.txt' % cv_number) as validation_file: validation = validation_file.read().splitlines() df_train = df[df.index.isin(train)] df_validation = df[df.index.isin(validation)] X_train = df_train.drop(['Class'], axis=1) X_validation = df_validation.drop(['Class'], axis=1) y_train = df_train['Class'] y_validation = df_validation['Class'] mat = X_train.as_matrix() # Convert DataFrame to matrix mat_validation = X_validation.as_matrix() mat_test = X_test.as_matrix() clu = MeanShift( bandwidth=bandwidth, cluster_all=True ) # cluster_all=True forces the assignment of all instance. 
if cluster_all=False, orphans are given cluster label -1 clu.fit(mat) train_labels = clu.labels_ # Get cluster assignment labels n_clusters = len(np.unique(train_labels)) train_tem = pd.DataFrame([train_labels ]).T # Format results as a DataFrame train_tem.index = X_train.index train_tem.columns = ['Cluster'] train_res = pd.concat([y_train, train_tem], axis=1) E_C_P = Enrichment_clustering(train_res, n_clusters) joblib.dump( clu, save_path + short_name + "_MeanShift_%s_%s_%s.pkl" % (dataset, cv_number, bandwidth)) cv_labels = clu.predict(mat_validation) cv_tem = pd.DataFrame([cv_labels]).T cv_tem.index = X_validation.index cv_tem.columns = ['Cluster'] cv_res = pd.concat([y_validation, cv_tem], axis=1) for i in range(0, cv_res.shape[0]): try: cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]] except: cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1] print('%s was not enriched for any pathway' % cv_res.iloc[i, 1]) if cv_number == 1: predicted = cv_res.copy() else: predicted = pd.concat([predicted, cv_res], axis=0) result = Performance_MC(cv_res.Class, cv_res.Cluster, classes) if 'accuracy' in result: accuracies.append(result['accuracy']) if 'macro_f1' in result: f1_temp_array = np.insert(arr=result['f1_MC'], obj=0, values=result['macro_f1']) f1_array = np.append(f1_array, [f1_temp_array], axis=0) test_labels = clu.predict(mat_test) test_tem = pd.DataFrame([test_labels]).T test_tem.index = X_test.index test_tem.columns = ['Cluster'] test_res = pd.concat([y_test, test_tem], axis=1) for i in range(0, test_res.shape[0]): try: test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]] except: test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1] print('%s was not enriched for any pathway' % test_res.iloc[i, 1]) if cv_number == 1: predicted_test = test_res.copy() else: predicted_test = pd.concat( [predicted_test, test_res.Cluster], axis=1) ho_result = Performance_MC(test_res.Class, test_res.Cluster, test_classes) if 'accuracy' in ho_result: accuracies_ho.append(ho_result['accuracy']) if 'macro_f1' 
in ho_result: ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'], obj=0, values=ho_result['macro_f1']) f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array], axis=0) f1 = pd.DataFrame(f1_array) f1.columns = f1.iloc[0] f1 = f1[1:] f1.columns = [str(col) + '_F1' for col in f1.columns] f1 = f1.astype(float) # Calculate accuracy and f1 stats AC = np.mean(accuracies) AC_std = np.std(accuracies) MacF1 = f1['M_F1'].mean() MacF1_std = f1['M_F1'].std() print('Save the predicted values:') predicted.to_csv(save_path + short_name + "_MeanShift_%s_%s_validation_prediction.txt" % (dataset, bandwidth), index=True, header=True, sep="\t") predicted_test.to_csv(save_path + short_name + "_MeanShift_%s_%s_test_prediction.txt" % (dataset, bandwidth), index=True, header=True, sep="\t") print( "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC, AC_std, MacF1, MacF1_std)) # Unpack results for test f1_ho = pd.DataFrame(f1_array_ho) f1_ho.columns = f1_ho.iloc[0] f1_ho = f1_ho[1:] f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns] f1_ho = f1_ho.astype(float) AC_ho = np.mean(accuracies_ho) AC_std_ho = np.std(accuracies_ho) MacF1_ho = f1_ho['M_F1'].mean() MacF1_std_ho = f1_ho['M_F1'].std() print( "\nCluster results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n" % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho)) # Save detailed results file n_features = df.shape[1] - 1 out = open( save_path + short_name + "_Birch_%s_%s_results.txt" % (dataset, n_clusters), 'w') out.write('\n\nResults for prediction on validation set:\n') out.write( 'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n' % (AC, AC_std, MacF1, MacF1_std)) for cla in f1.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1[cla]), np.std(f1[cla]))) # Add results for test out.write('\n\nResults for test set:\n') out.write( 'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' % (AC_ho, 
AC_std_ho, MacF1_ho, MacF1_std_ho)) for cla in f1_ho.columns: if 'M_F1' not in cla: out.write('%s\t%05f\t%05f\n' % (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla]))) out.close()
def Affinity_Propagation(x, y):
    """Cluster ``x`` with affinity propagation and print its NMI against ``y``.

    A default-configured ``AffinityPropagation`` estimator is fitted on ``x``;
    the same samples are then passed to ``predict`` and the resulting cluster
    labels are compared with the reference labels ``y`` through
    ``metrics.normalized_mutual_info_score``.

    The score is only printed; the function returns ``None``.
    """
    model = AffinityPropagation()
    model.fit(x)
    cluster_labels = model.predict(x)
    score = metrics.normalized_mutual_info_score(y, cluster_labels)
    print("Affinity_Propagation:", score)
#agglomerative clustering in sklearn gives us lots of distance/proximity metrics to choose from #these are defined in the affinity variable #linkage parameter had to be average or complete rather than ward in order to use non-euclidean distance metrics agglo = AgglomerativeClustering(n_clusters=4, affinity='l1', linkage='average') cAssign = agglo.fit_predict(df) plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10') # In[73]: #http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation affProp = AffinityPropagation() affProp.fit(df) cAssign = affProp.predict(df) plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10') # In[74]: #documentation: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering from sklearn.cluster import SpectralClustering spectral = SpectralClustering() cAssign = spectral.fit_predict(df) plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10') # In[80]: #http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture
# Collect the remaining spectral descriptors, then turn the list into an
# array and transpose it.
# NOTE(review): presumably the transpose puts one row per frame so that the
# shapes line up for the concatenation below -- confirm against the code that
# builds f_list_2 and f_np_1.
f_list_2.append(sp_centroid)
f_list_2.append(sp_bandwidth)
f_list_2.append(sp_contrast)
f_list_2.append(sp_rolloff)
f_np_2 = np.array(f_list_2)
f_np_2 = np.transpose(f_np_2)
f_np_3 = np.array(mfcc)
f_np_4 = np.array(chroma_stft)
# axis=1 joins the four arrays column-wise, so they must all have the same
# number of rows.
master = np.concatenate([f_np_1, f_np_2, f_np_3, f_np_4], axis=1)
# Cluster the combined feature matrix with default affinity propagation.
cluster_obj = AffinityPropagation().fit(master)
#cluster_obj = KMeans(n_clusters = 2 ,random_state=0).fit(master)
#print("Number of clusters : " + str(len(cluster_obj.cluster_centers_indices_)))
res = cluster_obj.predict(master)
#print(cluster_obj.get_params())
# Walk over the per-row labels and record the running time at which the
# label changes.  `s` is the label currently in effect, `t` the elapsed time.
s = res[0]
t = 0.0
time = []
speaker = []
time.append(t)
speaker.append(s)
# Each row advances the clock by 0.2.
# NOTE(review): 0.2 is presumably the duration of one feature frame in
# seconds -- confirm against the feature-extraction settings.
# NOTE(review): the two appends are read here as part of the else branch (a
# new entry is logged only when the label changes) -- confirm against the
# original layout of this loop.
for u in range(0, len(res), 1):
    if (res[u] == s):
        t = t + 0.2
    else:
        t = t + 0.2
        s = res[u]
        speaker.append(s)
        time.append(t)
def ap_cluster_k(x, K, preference_init=-1.0, max_iter=30, c=None,
                 iter_finetune=10):
    '''
    Clustering of x by affinity propagation which the number of cluster is K.

    The `preference` of AffinityPropagation is tuned in three stages:
    a coarse search for a lower bound (the preference is multiplied by 10
    up to five times), a bisection between the lower and upper bounds until
    exactly K exemplars are found, and, when `c` is given, a random search
    inside the final bracket that keeps the preference with the best NMI.

    args:
        x (ndarray): Data matrix.
        K (int): Target number of clusters.
        preference_init (float): Negative starting value of the preference.
        max_iter (int): Number of trials for bisection search.
        c (ndarray, optional): Class labels of x. If this parameter is
            specified, the function try to find the better solution by
            random search.
        iter_finetune (int): Number of steps for the random search.

    returns:
        The fitted AffinityPropagation estimator. Without `c` this is the
        model found by the bisection; with `c` it is a model re-fitted on x
        with the best preference found by the random search.

    raises:
        AssertionError: if `preference_init` is not negative.
        RuntimeError: if no lower bound is found within five trials, or the
            bisection does not reach K clusters within `max_iter` trials.
    '''
    # first, search rough lower bound of the preference
    assert preference_init < 0, "preference_init must be negative."
    p = float(preference_init)  # preference parameter
    p_upper = 0
    # No model has been fitted at p_upper = 0 yet, so the cluster count there
    # is unknown.  Starting from an infinite bound keeps the range check and
    # the progress prints below valid even when the very first probe already
    # yields k_current <= K (k_upper would otherwise be undefined).
    k_upper = float('inf')
    for _ in range(5):
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        if k_current > K:
            p_upper = p
            k_upper = k_current
            p *= 10
        else:
            p_lower = p
            k_lower = k_current
            break
    else:
        raise RuntimeError("Can't find initial lower bound for preference."
                           " Try another value of preference_init.")

    # search the preference by bisection method
    for _ in range(max_iter):
        p = (p_lower + p_upper) / 2
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        print('K = {}, k_current = {}, p = {}'.format(K, k_current, p))
        print('{}:{}, {}:{}, {}:{}'.format(k_lower, p_lower, k_current, p,
                                           k_upper, p_upper))

        # if the current k goes out of bounds then retry with perturbed p
        while k_current < k_lower or k_current > k_upper:
            print("retry")
            p += np.random.uniform(p_lower, p_upper) / 10
            ap = AffinityPropagation(preference=p).fit(x)
            k_current = len(ap.cluster_centers_indices_)
            print('K = {}, k_current = {}, p = {}'.format(K, k_current, p))
            print('{}:{}, {}:{}, {}:{}'.format(k_lower, p_lower, k_current,
                                               p, k_upper, p_upper))

        if k_current < K:
            p_lower = p
            k_lower = k_current
        elif k_current > K:
            p_upper = p
            k_upper = k_current
        else:
            break
    else:
        raise RuntimeError("Can't find a preference to form K clusters."
                           " Try another value of preference_init.")

    if c is None:
        return ap

    # Search further better preference in terms of NMI score by random search
    p_best = p
    score_best = normalized_mutual_info_score(c, ap.predict(x))
    print('initial score:', score_best)
    print()
    for _ in range(iter_finetune):
        # Sample around the best preference so far, scaled by the bracket.
        p = np.random.normal(p_best, (p_upper - p_lower) / 2)
        if p < p_lower or p > p_upper:  # where p is rejected
            print('reject')
            continue
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        if k_current < K and p > p_lower:
            p_lower = p
        elif k_current > K and p < p_upper:
            p_upper = p
        else:  # where k_current is K
            score = normalized_mutual_info_score(c, ap.predict(x))
            if score > score_best:
                print("update p {} -> {}".format(p_best, p))
                p_best = p
                score_best = score
        print('p: {}, {}, {}'.format(p_lower, p, p_upper))
        print('score: {}'.format(score_best))
        print()
    return AffinityPropagation(preference=p_best).fit(x)
metrics.f1_score(y_test, y_predicted, average='macro', zero_division='warn', labels=np.unique(y_predicted))) print(classification_report(y_test, y_predicted)) #applying AffinityPropagation Clustering Algorithm print('test_data.shape') print(test_data.shape) print('y_test.shape') print(y_test.shape) clustering = AffinityPropagation(random_state=5).fit(df) y_predicted = clustering.predict(test_data) print(' y predicted shape') print(y_predicted.shape) print("y_test shape") print(y_test.shape) print(y_test) print("y_predicted shape") print(y_predicted.shape) print(y_predicted) print(y_test.shape) print(type(y_test)) print(y_predicted.shape)
# Read the tab-separated file data/<filename>.txt; `filename` is set outside
# this fragment.
path = 'data/' + filename + '.txt'
data = pd.read_csv(path, delimiter='\t')
# Only the 'X', 'Y' and 'Z' columns are used as clustering features.
columns = ['X', 'Y', 'Z']
X = data[columns]
# NOTE(review): the result of this null count is discarded, so the line has
# no effect outside an interactive session.
data.isnull().sum()
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with a default MinMaxScaler before clustering.
obj = MinMaxScaler()
X_scaled = obj.fit_transform(X)
from sklearn.cluster import AffinityPropagation
# Fit default affinity propagation on the scaled features and label the same
# rows with predict.
model = AffinityPropagation()
model.fit(X_scaled)
y_pred = model.predict(X_scaled)
clusters = np.unique(y_pred)
# Attach the labels to the unscaled columns.  join aligns on the index: the
# new frame gets a fresh 0..n-1 index, which matches X as long as read_csv
# produced its default index.  Note that `data` is rebound here.
data = X.join(pd.DataFrame({'Cluster': y_pred}))
# NOTE(review): index=None is presumably meant to leave the row index out of
# the CSV; the documented spelling is index=False -- confirm.
data.to_csv('data/clusters/' + filename + '.csv', index=None)
# clusters = np.unique(y_pred)
# for cluster in clusters:
#     row_idx = np.where(y_pred == cluster)
#     X.loc[row_idx].to_csv('clusters/' + str(cluster) + '_' + filename + '.csv', index=None)
#     plt.scatter(X.loc[row_idx]['X'],X.loc[row_idx]['Y'])
#     plt.show()