def colorClustering(self, HSV):
    if "No matched user" in HSV:
        centroid = "error:No matched user or No matched purchase list"
    else:
        ## Number of clusters
        n_hsv, n_ctrl = len(self.hsv_list), 7
        if n_hsv == 1:
            n_clusters = 2
        elif n_hsv / n_ctrl <= 2:
            if n_hsv < 5:
                n_clusters = n_hsv
            else:
                n_clusters = 5
        else:
            n_clusters = (n_hsv // n_ctrl) + 5
        ## Build the clustering dataset
        X = HSV[:, :, :3].reshape(HSV.shape[0] * HSV.shape[1], HSV.shape[2])
        ## Algorithm choice: hierarchical
        algorithm = AgglomerativeClustering(n_clusters=n_clusters, affinity="euclidean")
        ## Run the clustering
        with ignore_warnings(category=UserWarning):
            algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)  # np.int is removed in recent NumPy
        else:
            y_pred = algorithm.predict(X)
        ## Find the centroid of each cluster
        clf = NearestCentroid()
        centroid = clf.fit(X, y_pred).centroids_
    return centroid
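# Hedged sketch (not part of the original code): AgglomerativeClustering cannot label
# new samples, so the centroids returned above can be reused to assign a new HSV pixel
# to its nearest cluster. The names centroids/new_pixels are illustrative only.
import numpy as np

def assign_to_nearest_centroid(centroids, new_pixels):
    """Return the index of the closest centroid for each row of new_pixels."""
    dists = np.linalg.norm(new_pixels[:, None, :] - centroids[None, :, :], axis=2)
    return dists.argmin(axis=1)

# Example: two 3-D centroids, one new pixel -> cluster 1
# assign_to_nearest_centroid(np.array([[0., 0., 0.], [1., 1., 1.]]),
#                            np.array([[0.9, 0.8, 1.0]]))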
def get_silhouette_score(df, X, n_clusters, model='KM'):
    '''
    Calculate silhouette score for clustered dataframe.

    :param df: dataframe to cluster
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: the clustering algorithm to be applied to the data, default = 'KM' (k-modes)
    :returns: silhouette score
    '''
    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine',
                                            linkage='average').fit(X)
        labels = clusterer.labels_
    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=5, init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)
    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters, covariance_type='tied',
                                    max_iter=20, n_init=50, random_state=42, verbose=1).fit(X)
        labels = clusterer.predict(X)
    else:
        # sil_avg would otherwise be undefined for an unknown model string
        raise ValueError("model must be one of 'AG', 'KM' or 'GM'")
    sil_avg = silhouette_score(X, labels, metric='hamming')
    return sil_avg
class PureStylo:
    def __init__(self, gramn):
        self.gramn = gramn

    def train(self, bookset):
        self.agg = AgglomerativeClustering(n_clusters=len(bookset))
        bookX = []
        for b in bookset:
            databook = ngrams(b, self.gramn)
            fdist = FreqDist(databook)
            common = fdist.most_common(100)
            inputlist = []
            for c in common:
                inputlist.append(c[0])
                inputlist.append(c[1])
            bookX.append(inputlist)
        self.agg.fit(bookX)
        # AgglomerativeClustering has no predict(); keep a nearest-centroid
        # classifier fitted on the training data and its labels so classify()
        # can assign new books (requires sklearn.neighbors.NearestCentroid).
        self.centroid_clf = NearestCentroid().fit(bookX, self.agg.labels_)

    def classify(self, book):
        grams = ngrams(book, self.gramn)
        fdist = FreqDist(grams)
        common = fdist.most_common(100)
        X = []
        for c in common:
            X.append(c[0])
            X.append(c[1])
        # predict() expects a 2-D array, so wrap the single feature vector
        return self.centroid_clf.predict([X])
def check_cluster_sizes_vs_hclust(self):
    print(f"here are the cluster sizes we're guessing with kmeans: {self.train_cluster_count_plot}")
    hclust = AgglomerativeClustering(n_clusters=len(self.train_clusters["cluster"].unique()))
    # AgglomerativeClustering has no predict(); fit_predict() returns the labels directly
    hclust_labels = hclust.fit_predict(self.X)
    hclust_clusters = pd.DataFrame(
        {"train_row": range(self.X.shape[0]),
         "cluster": hclust_labels})
    hclust_cluster_count_plot = ggplot(hclust_clusters, aes("cluster")) + \
        geom_bar()
    print(hclust_cluster_count_plot)
def fitAndPredict(self):
    startTime = time.time()
    k = 10
    agglomerative = AgglomerativeClustering(n_clusters=k)
    # AgglomerativeClustering has no predict(); fit_predict() returns labels_
    labels = agglomerative.fit_predict(self.featureVectorList)
    self.kMeanslabels = labels
    print("Clustered using AgglomerativeClustering in [%.3f seconds]" % (time.time() - startTime))
    # No cluster_centers_ either; use the per-cluster mean as a centroid substitute
    X = np.asarray(self.featureVectorList)
    self.kMeanscentroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
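# Hedged mini-check (not in the original): per-cluster means act as centroids for
# hierarchical clustering, since AgglomerativeClustering exposes no cluster_centers_.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

X_demo = np.array([[0., 0.], [0., 1.], [10., 10.], [10., 11.]])
labels_demo = AgglomerativeClustering(n_clusters=2).fit_predict(X_demo)
centroids_demo = np.array([X_demo[labels_demo == i].mean(axis=0) for i in range(2)])
print(centroids_demo)  # one centroid near (0, 0.5), the other near (10, 10.5)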
def my_Kmeans(x, y, k=4, time=10, return_NMI=False):
    x = np.array(x)
    x = np.squeeze(x)
    y = np.array(y)
    if len(y.shape) > 1:
        y = np.argmax(y, axis=1)
    estimator = AgglomerativeClustering(affinity='cosine',
                                        compute_full_tree='auto',
                                        connectivity=None,
                                        linkage='complete',
                                        memory=None,
                                        n_clusters=k)
    ARI_list = []  # adjusted_rand_score
    NMI_list = []
    silhouette_score_list = []
    if time:
        # note: agglomerative clustering is deterministic, so repeated runs give identical scores
        for i in range(time):
            y_pred = estimator.fit_predict(x, y)
            score = normalized_mutual_info_score(y, y_pred)
            NMI_list.append(score)
            s2 = adjusted_rand_score(y, y_pred)
            ARI_list.append(s2)
            labels = estimator.labels_
            s3 = silhouette_score(x, labels, metric='euclidean')
            silhouette_score_list.append(s3)
        score = sum(NMI_list) / len(NMI_list)
        s2 = sum(ARI_list) / len(ARI_list)
        s3 = sum(silhouette_score_list) / len(silhouette_score_list)
        print('NMI (10 avg): {:.4f} , ARI (10avg): {:.4f}, silhouette(10avg): {:.4f}'
              .format(score, s2, s3))
    else:
        # AgglomerativeClustering has no predict(); use fit_predict() instead
        y_pred = estimator.fit_predict(x)
        score = normalized_mutual_info_score(y, y_pred)
        print("NMI on all label data: {:.5f}".format(score))
    if return_NMI:
        return score
def clustering(idTfidf, num_clu, term_num):
    docFeature = idTfidf
    vecTfidf = {}
    for file in idTfidf:
        row = np.zeros(len(idTfidf[file]))
        col = list(idTfidf[file].keys())
        val = list(idTfidf[file].values())
        vec = csc_matrix((np.array(val), (np.array(row), np.array(col))), shape=(1, term_num))
        vecTfidf[file] = vec.todense().tolist()[0]
    # print(vecTfidf)
    features = list(vecTfidf.values())  # dict views must be materialized in Python 3
    # print(features)

    selection = 'GM'  # selecting model here!!! Options: AgglomerativeClustering as AC, SpectralClustering as SC, GMM, GM
    if selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    keys = list(docFeature.keys())
    for i in range(len(res)):
        # dict.has_key() no longer exists in Python 3; use `in` instead
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(keys[i]))
    result = list(resDic.values())
    # print(result)
    with open('gt_GMRes.json', 'w') as f:
        f.write(json.dumps(result))
    return result
class clu_model():
    def __init__(self, method='kmeans'):
        self.method = method
        self.clu_model = None
        self.para = None

    def fit(self, x, para=None):
        if self.method == 'kmeans':
            self.clu_model = KMeans(n_clusters=para)
        elif self.method == 'DBSCAN':  # density-based clustering
            self.clu_model = DBSCAN(eps=para)
        elif self.method == 'Agg':  # agglomerative clustering
            self.clu_model = AgglomerativeClustering(n_clusters=para)
        self.para = para
        self.clu_model.fit(x)

    def predict(self, x):
        # DBSCAN and AgglomerativeClustering have no predict(); fall back on
        # the labels assigned during fit() when x is the training data.
        if hasattr(self.clu_model, 'predict'):
            return self.clu_model.predict(x)
        return self.clu_model.labels_
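# Hedged usage sketch (not in the original; the data and parameter values are
# illustrative only):
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

X_demo = np.random.RandomState(0).rand(20, 2)
cm = clu_model(method='Agg')
cm.fit(X_demo, para=3)     # para is n_clusters for 'Agg'/'kmeans', eps for 'DBSCAN'
print(cm.predict(X_demo))  # falls back on labels_ for models without predict()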
full_data = np.empty((150, 3), dtype='float32')
for i in range(150):
    full_data[i, 0] = x[i, 0]
    full_data[i, 1] = x[i, 1]  # was x[i, 0] twice, which duplicated the first feature
    full_data[i, 2] = y[i]
np.random.shuffle(full_data)
training_data = full_data[:130, :]
testing_data = full_data[130:, :]

model = KMeans()
# KMeans is unsupervised: the second argument to fit() is ignored
model.fit(training_data[:, :2], training_data[:, 2])

test_results = np.array(
    [model.predict(i[:2].reshape(1, -1)) for i in testing_data],
    dtype='float32')
correct = 0
for idx, _ in enumerate(test_results):
    # Note: KMeans cluster ids are arbitrary, so they only match the class
    # labels by coincidence; this "accuracy" is not a supervised accuracy.
    if test_results[idx] == testing_data[idx, 2]:
        correct += 1
print('model Accuracy:\t{}'.format(correct / len(test_results)))

plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()
test_results = np.array(
    [model.predict(i[:2].reshape(1, -1)) for i in full_data],
    dtype='float32')
plt.scatter(x[:, 0], x[:, 1], c=y)
class ClusteringSegmentation:
    def __init__(self, algorithm, n_clusters):
        """
        Initialize a clustering segmentation object.
        Args:
            algorithm: the algorithm to use for clustering segmentation (kmeans, em, ...)
            n_clusters: the number of clusters
        """
        self.algorithm = algorithm.lower()
        self.n_clusters = n_clusters
        if self.algorithm == 'kmeans':
            self.model = KMeans(n_clusters=self.n_clusters, max_iter=300, tol=0.0001)
        elif self.algorithm == 'gmm':
            self.model = GaussianMixture(n_components=self.n_clusters, covariance_type='full',
                                         tol=0.0001, reg_covar=1e-06, max_iter=300)
        elif self.algorithm == "affinity":
            self.model = AffinityPropagation(affinity='euclidean', convergence_iter=15,
                                             damping=0.5, max_iter=200, preference=None,
                                             verbose=False)
        elif self.algorithm == 'aglo':
            pass
        elif self.algorithm == 'spectral':
            pass
        else:
            raise Exception("Algorithm is not yet implemented")

    def fit(self, image):
        """
        Compute parameters of the model.
        Args:
            image: an ndarray representing an image (x, y, color_dimension)
        Returns: Nothing
        """
        f_dim = image.shape[-1] if len(image.shape) > 2 else 1
        X = image.reshape(-1, f_dim)
        if self.algorithm == 'aglo':
            connectivity = img_to_graph(image)
            self.model = AgglomerativeClustering(n_clusters=self.n_clusters,
                                                 affinity='euclidean',
                                                 connectivity=connectivity,
                                                 compute_full_tree=False,
                                                 linkage='average')
        elif self.algorithm == 'spectral':
            # spectral clustering is run directly in predict()
            return None
        self.model.fit(X)

    def predict(self, image):
        """
        Predict the cluster of each pixel of an image.
        Args:
            image: an ndarray representing an image (x, y, color_dim)
        Returns: an ndarray representing the image (x, y, cluster)
        """
        f_dim = image.shape[-1] if len(image.shape) > 2 else 1
        X = image.reshape(-1, f_dim)
        if self.algorithm == 'spectral':
            graph = img_to_graph(image)
            graph.data = np.exp(-graph.data / graph.data.std())
            X_clustered = spectral_clustering(graph, n_clusters=self.n_clusters,
                                              eigen_solver='arpack')
        elif self.algorithm == 'aglo':
            # AgglomerativeClustering has no predict(); reuse the labels from fit(),
            # which is valid because fit_predict() passes the same image to both
            X_clustered = self.model.labels_
        else:
            X_clustered = self.model.predict(X)
        return X_clustered.reshape(image.shape[0], image.shape[1])

    def fit_predict(self, image):
        """
        Compute parameters of the model and predict the cluster of each pixel of an image.
        Args:
            image: an ndarray representing an image (x, y, color_dim)
        Returns: an ndarray representing the image (x, y, cluster)
        """
        self.fit(image)
        return self.predict(image)
def train_cluster(data_type=0,
                  dimension_reduction=0,
                  cluster_way=0,
                  n_components=50,
                  threshold=2,
                  n_clusters=210,
                  branching_factor=50,
                  linkage=0,
                  max_iter=500,
                  eps=1.0):
    if data_type == 0:
        train_data = load_stage2_tf_idf("")
    elif data_type == 1:
        train_data = load_stage2_tf_idf("")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
    elif data_type == 2:
        train_data = load_nn_stage2_features()
    elif data_type == 3:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)
    elif data_type == 4:
        train_data = load_stage2_tf_idf("1000")
        nn_data = load_nn_stage2_features()
        train_data = pd.merge(train_data, nn_data, 'left', on="file_name")
        dll = load_stage2_tf_idf("_dll")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "first")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_stage2_tf_idf("_hkey", "last")
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        dll = load_clustering_statics_files()
        train_data = pd.merge(train_data, dll, 'left', on="file_name")
        train_data.fillna(0, inplace=True)

    file_name = train_data["file_name"]
    train_data.drop(columns=["file_name"], inplace=True)
    X = StandardScaler(with_mean=False).fit_transform(train_data)
    origin_data = X

    if dimension_reduction == 0:
        pass
    elif dimension_reduction == 1:
        model = IncrementalPCA(n_components=n_components)
        X = model.fit_transform(X)
    elif dimension_reduction == 2:
        model = NMF(n_components=n_components, init='random', random_state=0, max_iter=max_iter)
        X = model.fit_transform(X)
    elif dimension_reduction == 3:
        model = PCA(n_components=n_components)
        X = model.fit_transform(X)
    print(len(X[0]))

    if cluster_way == 0:
        mode = ["ward", "complete", "average", "single"]
        db = AgglomerativeClustering(n_clusters=n_clusters, linkage=mode[linkage]).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" +
            str(dimension_reduction) + "_" + str(n_components) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 1:
        db = Birch(branching_factor=branching_factor, n_clusters=n_clusters,
                   threshold=threshold).fit(X)
        labels = db.predict(X)
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "birch" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 2:
        db = hdbscan.HDBSCAN(min_cluster_size=40)
        db.fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "hdb_40" + ".csv"), index=False)
        print(len(set(labels)))
    elif cluster_way == 3:
        db = DBSCAN(eps=eps, n_jobs=-1).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join(
            "predictions",
            "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 4:
        labels = np.zeros((len(file_name), ))
        pd.DataFrame(data={
            "id": file_name,
            "family_id": np.zeros((len(file_name), ))
        }).to_csv(os.path.join("predictions", "zeros" + ".csv"), index=False)
    elif cluster_way == 5:
        db = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions", "kmeans" + str(n_clusters) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 6:
        # the original never fitted AffinityPropagation, leaving labels undefined
        db = AffinityPropagation()
        labels = db.fit_predict(X)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    scores = evaluate_cluster_performance(origin_data, labels)
    evaluate_cluster_performance(X, labels)
    return scores
class geodata:
    '''
    a dedicated class for geolocation data ( longitude , latitude )
    '''
    def __init__(self, X, map_path=None):
        '''
        X : data array of shape ( N , 2 ) where columns are : ( longitude , latitude )
        map_path : file containing the map image on which we will scatter/plot our geodata
        '''
        self.X = X
        self.map = plt.imread(map_path) if (map_path is not None) else None
        self.model = None
        self.n_clusters = 2
        # box frame for the map to fit the background correctly : ( left , right , bottom , top )
        h = 0.005
        self.box = X[:, 0].min() - h, X[:, 0].max() + h, X[:, 1].min() - h, X[:, 1].max() + h

    # getters
    def getModel(self):
        return self.model

    def apply_clustering(self, model="kmeans", K=2, random_seed=0):
        '''
        :param model: clustering model ( kmeans | spectral | agglo ) , agglo stands for agglomerative
        :param K: number of clusters
        :param random_seed: random seed for random number generation
        '''
        # Keep K between 1 and N ( number of examples )
        if (K > self.X.shape[0]):
            K = self.X.shape[0]
        elif (K < 1):
            K = 1
        self.n_clusters = K
        if (model == "kmeans"):
            self.model = KMeans(n_clusters=K, random_state=random_seed).fit(self.X)
        elif (model == "spectral"):
            self.model = SpectralClustering(n_clusters=K, random_state=random_seed,
                                            affinity="laplacian").fit(self.X)
        elif (model == "agglo"):
            self.model = AgglomerativeClustering(n_clusters=K, linkage='complete').fit(self.X)
        else:
            raise Exception(
                "the clustering model should be 'kmeans'|'spectral'|'agglo' or None for no clustering and nothing else"
            )

    def plot_data(self, plot_type="scatter", map_transparency=0.4, figsize=(16, 10),
                  save=True, map_name="Brisbane"):
        '''
        :param plot_type: has no effect if clustering is None; determines the type of
                          the clustering plot: "scatter" | "regions" | "distances"
        :param map_transparency: float in range (0,1) to determine the transparency
        :param figsize: the plot figure size, a tuple of int ( width , height ) proportion
        :param save: a boolean to choose whether to save the figure outputs to './outputs/specific_name.png'
        :param map_name: the name of the map appearing in the figure title

        Plots the geodata according to the previous params.
        '''
        ########### general plot configuration for all plot types ##############
        fig, ax = plt.subplots(figsize=figsize)
        # => customized title
        map_str = " " if (map_name is None) else "on '" + map_name + "' map"
        plot_str = " simple plot " if (self.model is None) else " clustering plot "
        model_str = "" if (self.model is None) else \
            " using " + self.model.__class__.__name__ + " with n_clusters = " + str(self.n_clusters)
        title = 'Geolocation data' + plot_str + map_str + model_str
        ax.set_title(title)
        ax.set_ylabel('Latitude')
        ax.set_xlabel('Longitude')
        # => background map (skipped when no map image was loaded)
        if (self.map is not None and map_transparency is not None and map_transparency > 0):
            # alpha for transparency
            ax.imshow(self.map, extent=self.box, alpha=map_transparency, aspect='auto')

        ###################### plot different types of visualizations ########################
        # simple scatter
        if (self.model is None):
            ax.scatter(self.X[:, 0], self.X[:, 1])
        else:
            # labels
            if hasattr(self.model, 'labels_'):
                labels = self.model.labels_.astype(int)  # np.int is removed in recent NumPy
            else:
                labels = self.model.predict(self.X)
            # preparing colors etc ...
            if (self.n_clusters <= 5):
                colors = ['#0000ff', '#ff3300', '#00cc66', '#cc0099', '#00ffcc']
            else:
                # generate random colors for n_clusters
                colors = np.random.rand(self.n_clusters, 3)

            if (plot_type == "scatter"):
                for cluster, color in zip(range(self.n_clusters), colors):
                    ax.scatter(self.X[labels == cluster, 0],
                               self.X[labels == cluster, 1],
                               color=color)
                    if isinstance(self.model, KMeans):
                        centroids = self.model.cluster_centers_
                        ax.scatter(centroids[cluster, 0], centroids[cluster, 1],
                                   color=color, marker="o", edgecolors="black", s=300)
            elif (plot_type == "regions"):
                if not isinstance(self.model, KMeans):
                    raise Exception(
                        "It is impossible to plot regions for this model , we cannot retrieve the cluster membership of new data"
                    )
                # PLOT REGIONS
                h = (self.box[1] - self.box[0]) / 100
                xx, yy = np.meshgrid(np.arange(self.box[0], self.box[1], h),
                                     np.arange(self.box[2], self.box[3], h))
                # Obtain labels for each point in mesh. Use last trained model.
                Z = self.model.predict(np.c_[xx.ravel(), yy.ravel()])
                Z = Z.reshape(xx.shape)
                my_color_map = LinearSegmentedColormap.from_list("my_color_map", colors)
                plt.imshow(Z, interpolation='nearest', extent=self.box,
                           cmap=my_color_map, aspect='auto', origin='lower', alpha=0.4)
                # scatter points
                for cluster, color in zip(range(self.n_clusters), colors):
                    ax.scatter(self.X[labels == cluster, 0],
                               self.X[labels == cluster, 1],
                               color=color)  # was cmap=... with no c=, which had no effect
                # scatter centroids if KMeans
                if isinstance(self.model, KMeans):
                    centroids = self.model.cluster_centers_
                    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=400,
                                linewidths=5, color='black', zorder=10)
            elif (plot_type == "distances"):
                if isinstance(self.model, KMeans):
                    centroids = self.model.cluster_centers_
                    for cluster, color in zip(range(len(centroids)), colors):
                        ax.scatter(self.X[labels == cluster, 0],
                                   self.X[labels == cluster, 1],
                                   color=color)
                        ax.scatter(centroids[cluster, 0], centroids[cluster, 1],
                                   color=color, marker="o", edgecolors="blue", s=300)
                        for x in self.X[labels == cluster]:
                            plt.plot([centroids[cluster, 0], x[0]],
                                     [centroids[cluster, 1], x[1]],
                                     color=color)
                else:
                    raise Exception(
                        "distances plot requires clustering to be 'kmeans' to plot distances to centroids"
                    )

        ################################ save figures #######################################
        if (save == True):
            if (not os.path.isdir("./outputs")):
                os.makedirs("./outputs/")
            plt.savefig('outputs/' + title + '.png')
        plt.show()
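# Hedged usage sketch (not in the original; the random points stand in for real
# longitude/latitude data and no map image is passed):
import numpy as np

pts = np.random.RandomState(0).uniform(low=[153.0, -27.5], high=[153.1, -27.4], size=(100, 2))
gd = geodata(pts, map_path=None)
gd.apply_clustering(model="kmeans", K=3)
gd.plot_data(plot_type="scatter", save=False)  # background map is skipped when map is None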
def agglomerative(self, score_df, col_name):
    agglomerative = AgglomerativeClustering(n_clusters=self.clust_num,
                                            affinity=self.affinity,
                                            linkage=self.linkage)
    # AgglomerativeClustering has no separate predict(); fit_predict() returns the labels.
    # score_df.col_name would look up a column literally named "col_name"; use bracket
    # indexing (double brackets keep the input 2-D, as sklearn expects).
    res_clusters = agglomerative.fit_predict(score_df[[col_name]])
    return res_clusters
#print(Y.head())
scaler = Normalizer().fit(X)
trainX = scaler.transform(X)
traindata = np.array(trainX)  # was np.array(X), which discarded the normalization
trainlabel = np.array(Y)

traindata, testdata, trainlabel, testlabel = model_selection.train_test_split(
    traindata, trainlabel, test_size=0.3)
#print(testdata.shape)
#print(traindata.shape)

model = KNeighborsClassifier()
model.fit(traindata, trainlabel)
print(model)

# make predictions
expected = testlabel
predicted = model.predict(testdata)
#np.savetxt('res/predictedKNN.txt', predicted, fmt='%01d')

# summarize the fit of the model
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted, average="binary")
f1 = f1_score(expected, predicted, average="binary")

cm = metrics.confusion_matrix(expected, predicted)
print(cm)
# per-class detection rates taken from the confusion-matrix rows
tpr = float(cm[0][0]) / np.sum(cm[0])
fpr = float(cm[1][1]) / np.sum(cm[1])
print("%.3f" % tpr)
print("%.3f" % fpr)
print("Accuracy")
print("%.3f" % accuracy)
def unsupervised_clu(feature, part, model_selection):
    if part:
        if feature == 'graph':
            docFeature = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
        if feature == 'doc2vec':
            docFeature = json.loads(open('rmMultiPart1Doc2vec.json').read())
        if feature == 'comb':
            walk = json.loads(open('rmMultiPart1WOZeroGraph.json').read())
            dv = json.loads(open('rmMultiPart1Doc2vec.json').read())
            docFeature = {}
            for doc in walk:
                val = walk[doc] + dv[doc]
                docFeature[doc] = val
        groundTruth = json.loads(open('rmMultiPart1CluInd.json').read())
        num_clu = len(groundTruth)  # number of clusters in each part
    else:
        rmMulti = True  # False #
        if rmMulti:
            if feature == 'graph':
                docFeature = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('rmMultiCluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('rmMultiCluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('rmMultiGroundTruth.json').read())
            num_clu = len(groundTruth)  # number of clusters after removing documents appearing in multiple clusters, #doc = 1274 (3 all 0s for walk)
        else:
            if feature == 'graph':
                docFeature = json.loads(open('cluDatabaseWOZeroGraph.json').read())
            if feature == 'doc2vec':
                docFeature = json.loads(open('cluDatabaseDoc2vec.json').read())
            if feature == 'comb':
                walk = json.loads(open('cluDatabaseWOZeroGraph.json').read())
                dv = json.loads(open('cluDatabaseDoc2vec.json').read())
                docFeature = {}
                for doc in walk:
                    val = walk[doc] + dv[doc]
                    docFeature[doc] = val
            groundTruth = json.loads(open('groundTruth.json').read())
            num_clu = len(groundTruth)  # number of clusters before removing documents appearing in multiple clusters, #doc = 1393 (3 all 0s for walk)

    features = list(docFeature.values())  # dict views must be materialized in Python 3
    if model_selection == 'AC':
        model = AC(n_clusters=num_clu, affinity='cosine', linkage='average')
    if model_selection == 'SC':
        model = SC(n_clusters=num_clu, affinity='cosine')
    if model_selection == 'GMM':
        model = GMM(n_components=num_clu, covariance_type='full')
    if model_selection == 'KMeans':
        model = KMeans(n_clusters=num_clu)
    if model_selection == 'GM':
        model = GM(n_components=num_clu)
        model.fit(features)
        res = model.predict(features)
    else:
        res = model.fit_predict(features)

    resDic = {}
    keys = list(docFeature.keys())
    for i in range(len(res)):
        # dict.has_key() no longer exists in Python 3; use `in` instead
        if res[i] not in resDic:
            resDic[res[i]] = []
        resDic[res[i]].append(int(keys[i]))
    result = list(resDic.values())
    return (result, groundTruth)
# In[137]:

from sklearn import decomposition
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# 3. Visualize the results to make them easier to inspect.
# Use PCA to reduce the 4-D data to 2-D for display
X, y = prepare_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = KMeans(n_clusters=3).fit(X_train)

pca = decomposition.PCA(n_components=2)
X = pca.fit_transform(X_test)
pos = pd.DataFrame()
plt.scatter(X[:, 0], X[:, 1], c=model.predict(X_test), s=50, cmap='rainbow')

# In[136]:

# Display the clusters with Axes3D
fig = plt.figure(1, figsize=(4, 3))
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
ax.scatter(X_train.values[:, 0], X_train.values[:, 1], X_train.values[:, 2],
           c=model.predict(X_train), cmap='rainbow')
ax.xaxis.set_ticklabels([])  # w_xaxis is deprecated in recent matplotlib
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
class evaluate:
    def __init__(self, estimator_label, config, failed_file=False):
        self.estimator_label = estimator_label
        self.config = config
        self.loaded = self.load_estimator()
        self.res = {}
        if not failed_file:
            self.failed = open(estimator_label + "_failed.txt", mode="a")
            self.failed.flush()

    def run_all(self, path="./Datasets/processed/", verbose=False, nmi=False):
        if os.path.exists(path):
            allFiles = glob.glob(path + "*.csv")
            count_load = 0
            count_train = 0
            count_test = 0
            for dfile in allFiles:
                try:
                    data = pd.read_csv(dfile, header=None, na_values='?')
                    self.y = data.iloc[:, -1]
                    self.data = data.iloc[:, :-1]
                    filename_w_ext = os.path.basename(dfile)
                    print(filename_w_ext)
                    filename, file_extension = os.path.splitext(filename_w_ext)
                    self.data_label = filename
                    count_load += 1
                    if verbose:
                        print("loaded " + str(count_load) + " out of " + str(len(allFiles)))
                    if self.data.isnull().values.any():
                        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
                        imp = imp.fit(self.data)
                        self.data = pd.DataFrame(imp.transform(self.data))
                except:
                    print("couldn't load " + dfile)
                if self.loaded:
                    try:
                        self.fit_data()
                    except:
                        continue
                    if len(set(list(self.estimator.labels_))) == 1:
                        continue
                    count_train += 1
                    if verbose:
                        print("fitted " + str(count_load) + " out of " + str(len(allFiles)))
                    if len(list(set(self.estimator.labels_))) / len(self.data) > 0.75:
                        continue
                    try:
                        Metric = self.eval_metrics(nmi)
                        self.res[self.data_label] = Metric
                        count_test += 1
                        if verbose:
                            print("evaluated " + str(count_load) + " out of " + str(len(allFiles)))
                    except:
                        print("evaluation problem", self.data_label, self.config)
                        self.failed.write(str(self.data_label) + " " + str(self.config))
                        self.failed.write("\n")
                        self.failed.flush()
                else:
                    print("model loading failed")
                    return False
        else:
            print(path + " doesn't exist")
            return False
        return True

    def load_estimator(self):
        if self.estimator_label.lower() == "kmeans":
            self.estimator = KMeans(init=self.config['init'],
                                    n_clusters=self.config['n_clusters'],
                                    algorithm=self.config["algorithm"],
                                    n_init=self.config['n_init'],
                                    max_iter=self.config["max_iter"])
            self.estimator_label = "kmeans"
            return True
        elif self.estimator_label.lower() == "meanshift":
            self.estimator = MeanShift(cluster_all=self.config["cluster_all"],
                                       bin_seeding=self.config["bin_seeding"],
                                       n_jobs=self.config["n_jobs"])
            return True
        elif self.estimator_label.lower() == "dbscan":
            self.estimator = DBSCAN(leaf_size=self.config["leaf_size"],
                                    metric=self.config["metric"],
                                    eps=self.config["eps"],
                                    min_samples=self.config["min_samples"])
            return True
        elif self.estimator_label.lower() == "affinitypropagation":
            self.estimator = AffinityPropagation(
                damping=self.config["damping"],
                convergence_iter=self.config["convergence_iter"],
                max_iter=self.config["max_iter"])
            return True
        elif self.estimator_label.lower() == "spectralclustering":
            self.estimator = SpectralClustering(
                n_clusters=self.config['n_clusters'],
                eigen_solver=self.config["eigen_solver"],
                affinity=self.config['affinity'],
                assign_labels=self.config["assign_labels"])
            return True
        elif self.estimator_label.lower() == "birch":
            self.estimator = Birch(
                n_clusters=self.config['n_clusters'],
                threshold=self.config["threshold"],
                branching_factor=self.config['branching_factor'])
            return True
        elif self.estimator_label.lower() == "optics":
            self.estimator = OPTICS(
                min_samples=self.config['min_samples'],
                cluster_method=self.config["cluster_method"],
                p=self.config['p'],
                n_jobs=self.config["n_jobs"])
            return True
        elif self.estimator_label.lower() == "gaussian":
            self.estimator = GaussianMixture(
                n_init=self.config['n_init'],
                init_params=self.config["init_params"],
                n_components=self.config['n_components'],
                covariance_type=self.config["covariance_type"])
            return True
        elif self.estimator_label.lower() == "agglomerativeclustering":
            self.estimator = AgglomerativeClustering(
                n_clusters=self.config['n_clusters'],
                linkage=self.config["linkage"])
            return True
        else:
            print("couldn't load model", self.estimator_label)
            return False

    def fit_data(self):
        self.estimator.fit(self.data)
        # GaussianMixture has no labels_ attribute; derive it here so run_all()
        # and eval_metrics() can treat every estimator uniformly.
        if not hasattr(self.estimator, "labels_"):
            self.estimator.labels_ = self.estimator.predict(self.data)

    def predict_data(self):
        self.estimator.predict(self.data)

    def eval_metrics(self, nmi=False):
        if nmi:
            Metrics = {}
            Metrics["nmi"] = metrics.normalized_mutual_info_score(self.y, self.estimator.labels_)
            return Metrics
        sample_size = int(len(self.data) * 0.1)
        if sample_size < 100:
            sample_size = len(self.data)
        v = Validation(
            np.asarray(self.data).astype(float),  # np.asmatrix/np.float are deprecated
            list(self.estimator.labels_))
        Metrics = v.run_all()
        try:
            Ix = metric(self.data, self.estimator.labels_, self.estimator.cluster_centers_)
            Metrics["IIndex"] = Ix.IIndex()
        except:
            Metrics["IIndex"] = "none"
        try:
            sdbw_c = sdbw(self.data, self.estimator.labels_, self.estimator.cluster_centers_)
            Metrics["SDBW"] = sdbw_c.sdbw_score()
        except:
            Metrics["SDBW"] = "none"
        Metrics["ari"] = 0.0
        Metrics["ami"] = 0.0
        Metrics["nmi"] = metrics.normalized_mutual_info_score(self.y, self.estimator.labels_)
        Metrics["v_measure"] = 0.0
        try:
            Metrics["silhouette_score"] = metrics.silhouette_score(
                self.data, self.estimator.labels_, metric='euclidean',
                sample_size=sample_size, random_state=0)
        except:
            Metrics["silhouette_score"] = 0.0
        try:
            Metrics["calinski_harabasz_score"] = metrics.calinski_harabasz_score(
                self.data, self.estimator.labels_)
        except:
            Metrics["calinski_harabasz_score"] = 0.0
        '''
        sample_size = int(len(self.data) * 0.1)
        if sample_size < 100:
            sample_size = len(self.data)
        Metrics = {}
        Metrics["silhouette_score"] = metrics.silhouette_score(self.data, self.estimator.labels_, metric='euclidean', sample_size=sample_size, random_state=0)
        Metrics["calinski_harabasz_score"] = metrics.calinski_harabasz_score(self.data, self.estimator.labels_)
        Metrics["davies_bouldin_score"] = metrics.davies_bouldin_score(self.data, self.estimator.labels_)
        if self.estimator_label.lower() == "meanshift":
            Metrics["SSE"] = len(self.estimator.cluster_centers_)
        if self.estimator_label.lower() == "kmeans":
            araujo = metric(self.data, self.estimator.labels_, self.estimator.cluster_centers_)
            Metrics["IIndex"] = 0  # araujo.IIndex()
            Metrics["SSE"] = self.estimator.inertia_
            Metrics["nSSE"] = self.estimator.inertia_ / (len(self.data) * len(self.data.columns))
            labels_true = self.y
            labels_true = np.array(labels_true)
            Metrics["ARI"] = metrics.adjusted_rand_score(labels_true, self.estimator.labels_)
            Metrics["MIS"] = metrics.adjusted_mutual_info_score(labels_true, self.estimator.labels_)
            Metrics["v_measure"] = metrics.v_measure_score(labels_true, self.estimator.labels_)
        else:
            #Metrics["SSE"] = -1
            Metrics["nSSE"] = -1
        '''
        return Metrics
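# Hedged usage sketch (not in the original; the config keys follow load_estimator()
# above, and the values are illustrative — e.g. algorithm='lloyd' is the name used by
# recent scikit-learn, older releases use 'full'):
config = {
    'init': 'k-means++',
    'n_clusters': 8,
    'algorithm': 'lloyd',
    'n_init': 10,
    'max_iter': 300,
}
ev = evaluate("kmeans", config)
ev.run_all(path="./Datasets/processed/", verbose=True, nmi=True)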
def main():
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, default='input/',
                        help="string, path to the input folder with the expression data, "
                             "default 'input/'")
    parser.add_argument('-ilr', '--input_lr', required=False, default='input/',
                        help="string, optional, path to the input folder with the ligands and "
                             "receptors list, default 'input/'")
    parser.add_argument('-o', '--output', required=True, default='output/',
                        help="string, path to the output folder, default 'output/'")
    parser.add_argument('-d', '--dataType', required=True, default='merfish',
                        choices=['merfish', 'merfish_cell_line', 'starmap'],
                        help="string, type of expression data, 'merfish' for MERFISH hypothalamus data, "
                             "'merfish_cell_line' for MERFISH U-2 OS cells, 'starmap' for STARmap mPFC cells; "
                             "default 'merfish'")
    parser.add_argument('-g', '--gender', required=True, default='Female',
                        help="string, gender of input animal sample, default 'Female', put 'na' if "
                             "not available")
    parser.add_argument('-b', '--behavior', required=True, default='Naive',
                        help="string, behavior of input animal sample, default 'Naive', put 'na' if "
                             "not available")
    parser.add_argument('-c', '--cellType', required=True, default='Excitatory',
                        help="string, cell type that a model will be built for, "
                             "use \\ for white-space, e.g. 'OD\\ Mature\\ 2', default 'Excitatory'")
    parser.add_argument('-m', '--mode', required=True, default='train',
                        help="string, any of 'train', 'CV'; if 'train', then all data will be used "
                             "for training and output a pickle file for learned parameters; if 'CV', "
                             "then cross-validation will be conducted each time with an animal/sample "
                             "left out and each CV run outputs a pickle file and prediction result, "
                             "default 'train'")
    parser.add_argument('-c1', '--numLevel1', required=False, default=1,
                        help="integer, optional, number of classes at level 1; number of experts = "
                             "number of classes at level 1 x number of classes at level 2, default 1")
    parser.add_argument('-c2', '--numLevel2', required=False, default=5,
                        help="integer, optional, number of classes at level 2, default 5")
    parser.add_argument('-e', '--epochs', required=False, default=20,
                        help="integer, optional, number of epochs to train MESSI, default 20")
    parser.add_argument('-gs', '--grid_search', required=False, type=str2bool, default=False,
                        help="boolean, optional, whether to conduct grid search for hyper-parameters, "
                             "default False")
    parser.add_argument('-ns', '--n_sets', required=False, default=3,
                        help="integer, optional, number of CV sets for grid search, default 3")
    parser.add_argument('-r', '--numReplicates', required=False, default=1,
                        help="integer, optional, number of times to run with the same set of parameters, "
                             "default 1")
    parser.add_argument('-p', '--preprocess', required=False, default='neighbor_cat',
                        help="string, optional, the way to include neighborhood information; "
                             "neighbor_cat: include by concatenating them to the cell's own "
                             "features; neighbor_sum: include by adding to the cell's own "
                             "features; anything without 'neighbor': no neighborhood "
                             "information will be used as features; 'baseline': only baseline "
                             "features; default 'neighbor_cat'")
    parser.add_argument('-tr', '--topKResponses', required=False, default=None,
                        help='integer, optional, number of top dispersed response genes to model, '
                             'default None (to include all response genes)')
    parser.add_argument('-ts', '--topKSignals', required=False, default=None,
                        help='integer, optional, number of top dispersed signalling genes to use as '
                             'features, default None (to include all signalling genes)')
    # parser.add_argument('-rp', '--responsePrior', required=False,
    #                     default=None, help='string, optional, path to the response genes to be used, default None')
    # parser.add_argument('-sp', '--signalsPrior', required=False,
    #                     default=None, help='string, optional, path to the signalling genes to be used, default None')

    args = parser.parse_args()
    print(args)

    # set parameters for data
    input_path = args.input
    input_path_lr = args.input_lr
    output_path = args.output
    data_type = args.dataType
    sex = args.gender
    behavior = args.behavior
    behavior_no_space = behavior.replace(" ", "_")
    current_cell_type = args.cellType
    current_cell_type_no_space = current_cell_type.replace(" ", "_")

    # set parameters for model
    mode = args.mode
    grid_search = args.grid_search
    n_sets = int(args.n_sets)
    n_classes_0 = int(args.numLevel1)
    n_classes_1 = int(args.numLevel2)
    n_epochs = int(args.epochs)
    n_replicates = int(args.numReplicates)

    # set parameters for data processing
    preprocess = args.preprocess
    if args.topKResponses is not None:
        top_k_response = int(args.topKResponses)
    else:
        top_k_response = args.topKResponses
    if args.topKSignals is not None:
        top_k_regulator = int(args.topKSignals)
    else:
        top_k_regulator = args.topKSignals
    response_type = 'original'  # use raw values to fit the model

    if grid_search:
        condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_grid_search"
    else:
        condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_{n_classes_1}"

    # prepare to read data
    read_in_functions = {
        'merfish': [read_meta_merfish, read_merfish_data, get_idx_per_dataset_merfish],
        'merfish_cell_line': [read_meta_merfish_cell_line, read_merfish_cell_line_data,
                              get_idx_per_dataset_merfish_cell_line],
        'starmap': [read_meta_starmap_combinatorial, read_starmap_combinatorial,
                    get_idx_per_dataset_starmap_combinatorial]}

    # set data reading functions corresponding to the data type
    if data_type in ['merfish', 'merfish_cell_line', 'starmap']:
        read_meta = read_in_functions[data_type][0]
        read_data = read_in_functions[data_type][1]
        get_idx_per_dataset = read_in_functions[data_type][2]
    else:
        raise NotImplementedError("Now only support processing 'merfish', 'merfish_cell_line' or 'starmap'")

    # read in ligand and receptor lists
    l_u, r_u = get_lr_pairs(input_path=input_path_lr)  # may need to change to the default value

    # read in meta information about the dataset
    meta_all, meta_all_columns, cell_types_dict, genes_list, genes_list_u, \
        response_list_prior, regulator_list_prior = \
        read_meta(input_path, behavior_no_space, sex, l_u, r_u)  # TO BE MODIFIED: number of responses

    # get all available animals/samples
    all_animals = list(set(meta_all[:, meta_all_columns['Animal_ID']]))

    for _z in range(len(all_animals)):
        if mode == 'train':
            # only run once
            if _z == 0:
                test_animal = ''
            else:
                break
        else:
            test_animal = all_animals[_z]

        samples_test = np.array([test_animal])
        samples_train = np.array(list(set(all_animals) - {test_animal}))
        print(f"Test set is {samples_test}")
        print(f"Training set is {samples_train}")

        bregma = None

        # ------ read data ------
        idx_train, idx_test, idx_train_in_general, \
            idx_test_in_general, idx_train_in_dataset, \
            idx_test_in_dataset, meta_per_dataset_train, \
            meta_per_dataset_test = find_idx_for_train_test(samples_train, samples_test,
                                                            meta_all, meta_all_columns,
                                                            data_type, current_cell_type,
                                                            get_idx_per_dataset,
                                                            return_in_general=False,
                                                            bregma=bregma)

        # TBD: the current approach uses a lot of memory
        data_sets = []
        for animal_id, bregma in meta_per_dataset_train:
            hp, hp_cor, hp_genes = read_data(input_path, bregma, animal_id, genes_list, genes_list_u)
            if hp is not None:
                hp_columns = dict(zip(hp.columns, range(0, len(hp.columns))))
                hp_np = hp.to_numpy()
            else:
                hp_columns = None
                hp_np = None
            hp_cor_columns = dict(zip(hp_cor.columns, range(0, len(hp_cor.columns))))
            hp_genes_columns = dict(zip(hp_genes.columns, range(0, len(hp_genes.columns))))
            data_sets.append([hp_np, hp_columns, hp_cor.to_numpy(), hp_cor_columns,
                              hp_genes.to_numpy(), hp_genes_columns])
            del hp, hp_cor, hp_genes
        datasets_train = data_sets

        data_sets = []
        for animal_id, bregma in meta_per_dataset_test:
            hp, hp_cor, hp_genes = read_data(input_path, bregma, animal_id, genes_list, genes_list_u)
            if hp is not None:
                hp_columns = dict(zip(hp.columns, range(0, len(hp.columns))))
                hp_np = hp.to_numpy()
            else:
                hp_columns = None
                hp_np = None
            hp_cor_columns = dict(zip(hp_cor.columns, range(0, len(hp_cor.columns))))
            hp_genes_columns = dict(zip(hp_genes.columns, range(0, len(hp_genes.columns))))
            data_sets.append([hp_np, hp_columns, hp_cor.to_numpy(), hp_cor_columns,
                              hp_genes.to_numpy(), hp_genes_columns])
            del hp, hp_cor, hp_genes
        datasets_test = data_sets
        del data_sets

        # ------ pre-processing ------
        # construct neighborhood graph
        if data_type == 'merfish_RNA_seq':
            neighbors_train = None
            neighbors_test = None
        else:
            if data_type == 'merfish':
                dis_filter = 100
            else:
                dis_filter = 1e9
            neighbors_train = get_neighbors_datasets(datasets_train, "Del", k=10,
                                                     dis_filter=dis_filter, include_self=False)
            neighbors_test = get_neighbors_datasets(datasets_test, "Del", k=10,
                                                    dis_filter=dis_filter, include_self=False)

        # set parameters for different feature types
        lig_n = {'name': 'regulators_neighbor', 'helper': preprocess_X_neighbor_per_cell,
                 'feature_list_type': 'regulator_neighbor', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        rec_s = {'name': 'regulators_self', 'helper': preprocess_X_self_per_cell,
                 'feature_list_type': 'regulator_self', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        lig_s = {'name': 'regulators_neighbor_self', 'helper': preprocess_X_self_per_cell,
                 'feature_list_type': 'regulator_neighbor', 'per_cell': True, 'baseline': False,
                 'standardize': True, 'log': True, 'poly': False}
        type_n = {'name': 'neighbor_type', 'helper': preprocess_X_neighbor_type_per_dataset,
                  'feature_list_type': None, 'per_cell': False, 'baseline': False,
                  'standardize': True, 'log': False, 'poly': False}
        base_s = {'name': 'baseline', 'helper': preprocess_X_baseline_per_dataset,
                  'feature_list_type': None, 'per_cell': False, 'baseline': True,
                  'standardize': True, 'log': False, 'poly': False}

        if data_type == 'merfish_cell_line':
            feature_types = [lig_n, rec_s, base_s, lig_s]
        else:
            feature_types = [lig_n, rec_s, type_n, base_s, lig_s]

        # untransformed features
        X_trains, X_tests, regulator_list_neighbor, regulator_list_self = prepare_features(
            data_type, datasets_train, datasets_test, meta_per_dataset_train,
            meta_per_dataset_test, idx_train, idx_test, idx_train_in_dataset,
            idx_test_in_dataset, neighbors_train, neighbors_test, feature_types,
            regulator_list_prior, top_k_regulator, genes_list_u, l_u, r_u, cell_types_dict)
        total_regulators = regulator_list_neighbor + regulator_list_self

        log_response = True  # take log transformation of the response genes
        Y_train, Y_train_true, Y_test, Y_test_true, response_list = prepare_responses(
            data_type, datasets_train, datasets_test, idx_train_in_general,
            idx_test_in_general, idx_train_in_dataset, idx_test_in_dataset,
            neighbors_train,
            neighbors_test, response_type, log_response, response_list_prior,
            top_k_response, genes_list_u, l_u, r_u)

        if grid_search:
            X_trains_gs = copy.deepcopy(X_trains)
            Y_train_gs = copy.copy(Y_train)

        # transform features
        transform_features(X_trains, X_tests, feature_types)
        print(f"Minimum value after transformation can be below 0: {np.min(X_trains['regulators_self'])}")

        # combine the different types of features
        if data_type == 'merfish':
            num_coordinates = 3
        elif data_type == 'starmap' or data_type == 'merfish_cell_line':
            num_coordinates = 2
        else:
            num_coordinates = None

        if np.ndim(X_trains['baseline']) > 1 and np.ndim(X_tests['baseline']) > 1:
            X_train, X_train_clf_1, X_train_clf_2 = combine_features(X_trains, preprocess, num_coordinates)
            X_test, X_test_clf_1, X_test_clf_2 = combine_features(X_tests, preprocess, num_coordinates)
        elif np.ndim(X_trains['baseline']) > 1:
            X_train, X_train_clf_1, X_train_clf_2 = combine_features(X_trains, preprocess, num_coordinates)

        print(f"Dimension of X train is: {X_train.shape}")
        if mode == 'CV':
            print(f"Dimension of X test is: {X_test.shape}")

        # ------ modeling by MESSI ------
        for _i in range(0, n_replicates):
            # ------ set parameters ------
            model_name_gates = 'logistic'
            model_name_experts = 'mrots'
            soft_weights = True
            partial_fit_expert = True

            # specify default parameters for MESSI
            model_params = {'n_classes_0': n_classes_0,
                            'n_classes_1': n_classes_1,
                            'model_name_gates': model_name_gates,
                            'model_name_experts': model_name_experts,
                            'num_responses': Y_train.shape[1],
                            'soft_weights': soft_weights,
                            'partial_fit_expert': partial_fit_expert,
                            'n_epochs': n_epochs,
                            'tolerance': 3}
            print(f"Model parameters for training: {model_params}")

            # set up directory for saving the model
            sub_condition = f"{condition}_{model_name_gates}_{model_name_experts}"
            sub_dir = f"{data_type}/{behavior_no_space}/{sex}/{current_cell_type_no_space}/{preprocess}/{sub_condition}"
            current_dir = os.path.join(output_path, sub_dir)
            if not os.path.exists(current_dir):
                os.makedirs(current_dir)
            print(f"Model and validation results (if applicable) saved to: {current_dir}")

            if mode == 'CV':
                suffix = f"_{test_animal}_{_i}"
            else:
                suffix = f"_{_i}"

            if grid_search:
                # prepare input meta data
                if data_type == 'merfish':
                    meta_per_part = [tuple(i) for i in meta_per_dataset_train]
                    meta_idx = meta2idx(idx_train_in_dataset, meta_per_part)
                else:
                    meta_per_part, meta_idx = combineParts(samples_train, datasets_train,
                                                           idx_train_in_dataset)

                # prepare the list of parameters to be tuned
                if data_type == 'merfish_cell_line':
                    current_cell_type_data = 'U-2_OS'
                elif data_type == 'starmap':
                    current_cell_type_data = 'STARmap_excitatory'
                else:
                    current_cell_type_data = current_cell_type

                params = {'n_classes_1': list(search_range_dict[current_cell_type_data]),
                          'soft_weights': [True, False],
                          'partial_fit_expert': [True, False]}
                keys, values = zip(*params.items())
                params_list = [dict(zip(keys, v)) for v in itertools.product(*values)]

                new_params_list = []
                for d in params_list:
                    if d['n_classes_1'] == 1:
                        if d['soft_weights'] and d['partial_fit_expert']:
                            # n_expert = 1, soft or hard are equivalent
                            new_params_list.append(d)
                    else:
                        if d['soft_weights'] == d['partial_fit_expert']:
                            new_params_list.append(d)

                ratio = 0.2
                # initialize with default values
                model_params_val = model_params.copy()
                model_params_val['n_epochs'] = 5  # increase for validation models to converge
                model_params_val['tolerance'] = 0
                print(f"Default model parameters for validation: {model_params_val}")

                model = hme(**model_params_val)
                gs = gridSearch(params, model, ratio, n_sets, new_params_list)
                gs.generate_val_sets(samples_train, meta_per_part)
                gs.runCV(X_trains_gs, Y_train_gs, meta_per_part, meta_idx, feature_types,
                         data_type, preprocess)
                gs.get_best_parameter()
                print(f"Best params from grid search: {gs.best_params}")

                # modify the parameter setting
                for key, value in gs.best_params.items():
                    model_params[key] = value
                print(f"Model parameters for training after grid search: {model_params}")

                filename = f"validation_results{suffix}.pickle"
                pickle.dump(gs, open(os.path.join(current_dir, filename), 'wb'))

            # ------ initialize the sample assignments ------
            if grid_search and 'n_classes_1' in params:
                model = AgglomerativeClustering(n_clusters=gs.best_params['n_classes_1'])
            else:
                model = AgglomerativeClustering(n_clusters=n_classes_1)
            model = model.fit(Y_train)
            hier_labels = [model.labels_]
            model_params['init_labels_1'] = hier_labels

            # ------ construct MESSI ------
            model = hme(**model_params)

            # train
            model.train(X_train, X_train_clf_1, X_train_clf_2, Y_train)

            # save the model
            filename = f"hme_model{suffix}.pickle"
            pickle.dump(model, open(os.path.join(current_dir, filename), 'wb'))

            # predict the left-out animal
            if mode == 'CV':
                Y_hat_final = model.predict(X_test, X_test_clf_1, X_test_clf_2)
                mae = abs(Y_test - Y_hat_final).mean(axis=1).mean()
                print(f"Mean absolute error for {test_animal} is {mae}")
                filename = f"test_predictions_{test_animal}_{_i}"
                np.save(os.path.join(current_dir, filename), Y_hat_final)
predictedlabels = pd.DataFrame(predictedlabels)
predictedlabels = predictedlabels.iloc[:-5895]
score = silhouette_score(df, cluster.labels_, metric='euclidean')
print('Silhouette Score - Agglomerative Clustering:')
print(score)
print()

# applying MeanShift Clustering Algorithm
cluster = MeanShift(bandwidth=2).fit(df)
score = silhouette_score(df, cluster.labels_, metric='euclidean')
print('Silhouette Score - MeanShift Clustering:')
print(score)

y_predicted = cluster.predict(test_data)
accuracy = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy - MeanShift Clustering:')
print(accuracy)
print("Recall - MeanShift Clustering:")
print(metrics.recall_score(y_test, y_predicted, average='macro', zero_division='warn'))
print("Precision - MeanShift Clustering:")
print(metrics.precision_score(y_test, y_predicted, average='macro'))
print("F1 - MeanShift Clustering:")
print(metrics.f1_score(y_test, y_predicted, average='macro'))
plt.subplots_adjust(wspace=.2, hspace=.2)  # reconstructed: only the trailing arguments of this call survived

X = X_r_sales
plot_num = 1
params = default_base.copy()
for n_cluster in params['clusters_range']:
    ac = AgglomerativeClustering(n_clusters=n_cluster, linkage=lkg)
    t0 = time.time()
    ac.fit(X)
    t1 = time.time()
    if hasattr(ac, 'labels_'):
        y_pred = ac.labels_.astype(int)  # np.int is removed in recent NumPy
    else:
        y_pred = ac.predict(X)
    colors = np.array(
        list(
            islice(
                cycle([
                    '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628',
                    '#984ea3', '#999999', '#e41a1c', '#dede00'
                ]), int(max(y_pred) + 1))))
    # add black color for outliers (if any)
    colors = np.append(colors, ["#000000"])
    n_cluster = len(set(y_pred)) - (1 if -1 in y_pred else 0)
    noise_rate = 0
# & applying Hierarchical clustering to a set of numbers for k
dendrogram = hc.dendrogram(hc.linkage(X, method='ward', metric='euclidean'))
plt.title('Dendrogram')
plt.xlabel('')
plt.ylabel('Distances')
plt.show()

###################### 3- Training ######################
K = 2
model = AgglomerativeClustering(n_clusters=K, affinity='euclidean', linkage='ward')
model.fit(X)

###################### 4- Testing ######################
# AgglomerativeClustering has no predict(); the fitted labels are in labels_
y = model.labels_

###################### 5- Visualization ######################
###### IMPORTANT NOTE: this visualization works for 2 dimensions only ######
colors = [
    'red', 'blue', 'lightcoral', 'indigo', 'gold', 'crimson', 'fuchsia',
    'peru', 'palegreen', 'lawngreen', 'olivedrab', 'yellow', 'darkseagreen',
    'tomato', 'orange', 'darkgreen', 'springgreen', 'darkred', 'teal',
    'midnightblue', 'brown', 'gray', 'darkviolet', 'aqua', 'purple',
    'orangered', 'turquoise', 'dodgerblue', 'deeppink'
]
for i in range(K):
    plt.scatter(X[y == i, 0], X[y == i, 1], s=100, c=colors[i],
def cluster(features_map, clustering_algorithm, n_clusters, linkage='ward',
            affinity=None, eigen_solver=None, n_init=10, gamma=1.0, n_neighbors=10,
            eigen_tol=0.0, assign_labels='kmeans', eps=0.5, min_samples=5,
            algorithm='auto', p=None, compute_full_tree=True, random_state=None):
    if affinity is None:
        if clustering_algorithm == 0:
            affinity = 'euclidean'
        if clustering_algorithm == 1:
            affinity = 'rbf'

    # do clustering for every location we have in the dev set
    # read locations file
    df_locations = pd.read_csv(DATA_DIR + "poiNameCorrespondences.txt", sep="\t", header=None)
    # remove first column (names)
    locations = np.array(df_locations[1])

    score = []
    firstLoop = True
    for location in locations:
        # read the ground truth file for the images
        df_gt = pd.read_csv(DATA_DIR + GROUND_TRUTH_PATH + location + " dGT.txt",
                            sep=",", header=None)
        # create a dictionary of the form { imageID : clusterID }
        truth = dict(zip(df_gt[0], df_gt[1]))

        # read in the image features
        features_df = []
        # copy features map for every loop
        tmp_features_map = features_map
        if tmp_features_map >= ba.bitarray('1000000000'):
            # CM
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CM.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0111111111')
        if tmp_features_map >= ba.bitarray('0100000000'):
            # CM3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CM3x3.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0011111111')
        if tmp_features_map >= ba.bitarray('0010000000'):
            # CN
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CN.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0001111111')
        if tmp_features_map >= ba.bitarray('0001000000'):
            # CN3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CN3x3.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000111111')
        if tmp_features_map >= ba.bitarray('0000100000'):
            # CSD
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " CSD.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000011111')
        if tmp_features_map >= ba.bitarray('0000010000'):
            # GLRLM
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " GLRLM.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000001111')
        if tmp_features_map >= ba.bitarray('0000001000'):
            # GLRLM3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " GLRLM3x3.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000111')
        if tmp_features_map >= ba.bitarray('0000000100'):
            # HOG
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " HOG.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000011')
        if tmp_features_map >= ba.bitarray('0000000010'):
            # LBP
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " LBP.csv",
                                           sep=",", header=None))
            tmp_features_map = tmp_features_map & ba.bitarray('0000000001')
        if tmp_features_map >= ba.bitarray('0000000001'):
            # LBP3x3
            features_df.append(pd.read_csv(DATA_DIR + FEATURE_PATH + location + " LBP3x3.csv",
                                           sep=",", header=None))

        # read the ids into an array
        ids = np.array(features_df[0][0])

        first = True
        for df_feature in features_df:
            # remove the first column with the image ids
            df_feature = df_feature.drop([0], axis=1)
            # build an array of all features [f1, f2, ....]
            if first:
                features = np.array(df_feature)
            else:
                features = np.concatenate((features, np.array(df_feature)), axis=1)
            first = False

        # normalize every feature column (axis=0; the original reduced along
        # axis=1, i.e. per row, which contradicted this comment)
        features = (features - features.min(axis=0)) / (features.max(axis=0) - features.min(axis=0))

        # calculate pca components which are used instead of the real features
        pca = PCA(n_components=15)
        data = pca.fit_transform(features)
        # normalize the component columns
        data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))

        # use feature array and number of clusters from above
        # use DBSCAN because it does not need the number of clusters
        if clustering_algorithm < 0 or clustering_algorithm > 3:
            print("\n\nInvalid clustering algorithm: " + str(clustering_algorithm) + "!\n\n")
            return
        if clustering_algorithm == 0:
            model = AgglomerativeClustering(n_clusters=n_clusters, affinity=affinity,
                                            compute_full_tree=compute_full_tree, linkage=linkage)
        if clustering_algorithm == 1:
            model = SpectralClustering(n_clusters=n_clusters, eigen_solver=eigen_solver,
                                       random_state=random_state, n_init=n_init, gamma=gamma,
                                       affinity=affinity, n_neighbors=n_neighbors,
                                       eigen_tol=eigen_tol, assign_labels=assign_labels, n_jobs=1)
        if clustering_algorithm == 2:
            model = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algorithm, p=p, n_jobs=1)
        if clustering_algorithm == 3:
            model = GaussianMixture(n_components=n_clusters)

        # create dictionary { imageID, predictedCluster }
        if clustering_algorithm == 3:
            model.fit(data)
            prediction = dict(zip(ids, model.predict(data)))
        else:
            prediction = dict(zip(ids, model.fit_predict(data)))

        # there isn't a ground truth for each image, so we can use the subset for comparison
        # additionally the predictions are now in the same order as the truth values
        prediction_subset = {x: prediction[x] for x in truth.keys() if x in prediction}

        # calculate performance using adjusted rand score:
        ars = adjusted_rand_score(list(truth.values()), list(prediction_subset.values()))
        # move score from [-1;1] to [0;1] and add to score array
        score.append(ars / 2 + 0.5)

    # calculate statistics over all scores
    return {'min': min(score),
            'mean': (sum(score) / len(score)),
            'sd': np.std(score),
            'median': st.median(score),
            'max': max(score)}
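# Hedged aside (not part of the original): column-wise min-max scaling reduces over
# samples with axis=0; axis=1 would rescale each row (each image) instead.
import numpy as np

M = np.array([[1., 10.], [2., 20.], [3., 30.]])
col_scaled = (M - M.min(axis=0)) / (M.max(axis=0) - M.min(axis=0))
print(col_scaled)  # every column now spans exactly [0, 1]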
        Y_test = np.array([score]).reshape(cnt + 1, 1)
    else:
        X_test = np.concatenate(
            (X_test, np.array(weight[idx[doc]]).reshape(1, d)),
            axis=0).reshape(cnt + 1, d)
        Y_test = np.concatenate(
            (Y_test, np.array([score]).reshape(1, 1)),
            axis=0).reshape(cnt + 1, 1)
    cnt += 1
    line = next(f)

#call sklearn.Lasso()
#clflasso = Lasso().fit(X_train, Y_train)
print('predicting...')
Y_hat = clfRand.predict(X_test)  #predict
Y_hat = Y_hat[:, np.newaxis]
MAE = np.mean(np.abs(Y_hat - Y_test))
print('MAE: %f' % MAE)
# print(Y_hat)
for idx, doc in enumerate(test_set.keys()):
    if idx >= cnt:
        break
    res.write(QID + ' ' + doc + ' ')
    res.write(str(float(Y_hat[idx])))
    res.write('\n')
MAE_TOTAL += MAE / 50
print(QID + ' MAE: %f' % MAE)
print('===================================\n')
res.write('\n MAE: %f \n' % MAE)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

undersample = RandomUnderSampler(random_state=0)
X_train2, Y_train2 = undersample.fit_resample(X_train, Y_train)
oversample = SMOTE()
X_train3, Y_train3 = oversample.fit_resample(X_train, Y_train)

cls = KNeighborsClassifier(n_neighbors=1, metric='kulsinski')
l = [(X_train, Y_train), (X_train2, Y_train2), (X_train3, Y_train3)]
for i, j in l:
    cls.fit(i, j)
    Y_pred = cls.predict(X_test)
    print(classification_report(Y_test, Y_pred))

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


def plot_decision_boundaries(X, y, model_class, **model_params):
    """
    Function to plot the decision boundaries of a classification model.
    This uses just the first two columns of the data for fitting the model,
    as we need to find the predicted value for every point in the scatter plot.
    Arguments:
        X: Feature data as a NumPy-type array.
        y: Label data as a NumPy-type array.
def plot_silhouette(df, X, n_clusters, model='KM'):
    '''
    Plot silhouette sample scores for input dataframe.

    :param df: dataframe to cluster
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: the clustering algorithm to be applied to the data, default = 'KM' (k-modes)
    :returns: None, saved plot of silhouette sample scores for each cluster
    '''
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    ax.set_xlim([-0.6, 1])
    # Insert blank space between silhouette plots of individual clusters
    ax.set_ylim([0, len(df) + (n_clusters + 1) * 10])

    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine',
                                            linkage='average').fit(X)
        labels = clusterer.labels_
    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=3, init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)
    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters, covariance_type='tied',
                                    max_iter=20, n_init=50, random_state=42, verbose=1).fit(X)
        labels = clusterer.predict(X)

    # Compute the silhouette score (average value for all the samples) and the silhouette score for each sample
    silhouette_avg = silhouette_score(X, labels, metric='hamming')
    sample_silhouette_values = silhouette_samples(X, labels, metric='hamming')

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)  # cm.spectral was removed from matplotlib
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title('The silhouette plot for the various clusters')
    ax.set_xlabel('The silhouette coefficient values')
    ax.set_ylabel('Cluster label')
    # Add a vertical line for the average silhouette score of all values
    ax.axvline(x=silhouette_avg, color='red', linestyle='--')
    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks([-0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.title('Silhouette analysis for {} with {} clusters'.format(
        clusterer.__class__.__name__, n_clusters))
    plt.savefig('sil_{}_{}.png'.format(clusterer.__class__.__name__, n_clusters), dpi=200)
    plt.close()
print(" répartition clusters avec K mean : ", df_kmean['label_cluster'].value_counts()) df_kmean.to_csv('fichier_kmean_7clust.csv',sep=',') '''---------------- Mise en place de l'arbre hiérarchique ascendant--------------- ''' for i in range(2,11): clust_ah=AgglomerativeClustering(n_clusters=i,affinity='euclidean',linkage='ward') clust=clust_ah.predict(matrice_norme) silhouette=silhouette_score(matrice_norme,clust) print(' pour ', i, ' clusters, on obtient un score silhouette de :', silhouette) cah=AgglomerativeClustering(n_clusters=7,affinity='euclidean',linkage='ward') cah.fit_predict(matrice_norme) ''' test du tracé du dendrogramme''' z=linkage(matrice, method='ward',metric='euclidean') plt.figure(figsize=(10, 10)) plt.title("Dendrogramme clustering clients")
data = np.genfromtxt("./winequality-red.csv", dtype=np.float32, delimiter=";", skip_header=1)
X = data[:, 0:11]
cluster = int(input("Input the number of clusters: "))
model = AgglomerativeClustering(n_clusters=cluster)
model.fit(X)
first = int(input("Input the number of the first wine: "))
second = int(input("Input the number of the second wine: "))
if model.labels_[first] == model.labels_[second]:
    print("Result : %d and %d are in the same cluster" % (first, second))
else:
    print("Result : %d and %d are in different clusters" % (first, second))

if sel == 'k':
    from sklearn.cluster import KMeans
    data = np.genfromtxt("./winequality-red.csv", dtype=np.float32, delimiter=";", skip_header=1)
    X = data[:, 0:11]
    cluster = int(input("Input the number of clusters: "))
    model = KMeans(n_clusters=cluster, random_state=0)
    model.fit(X)
    first = int(input("Input the number of the first wine: "))
    second = int(input("Input the number of the second wine: "))
    if model.predict([data[first, 0:11]]) == model.predict([data[second, 0:11]]):
        print("Result : %d and %d are in the same cluster" % (first, second))
    else:
        print("Result : %d and %d are in different clusters" % (first, second))
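# Hedged aside (not part of the original): the two branches above differ because
# AgglomerativeClustering can only label the data it was fitted on (via labels_),
# while KMeans can assign arbitrary new samples via predict(). For rows that were
# in the training set the two approaches agree:
import numpy as np
from sklearn.cluster import KMeans

X_demo = np.random.RandomState(0).rand(30, 11).astype(np.float32)
km = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X_demo)
assert km.predict([X_demo[5]])[0] == km.labels_[5]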