def categorise_dataset(contents):
    iris_setosa = []
    iris_versicolor = []
    iris_virginica = []
    for each_tuple in contents:
        if each_tuple[4] == 'Iris-virginica':
            iris_virginica.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-versicolor':
            iris_versicolor.append(each_tuple[:4])
        elif each_tuple[4] == 'Iris-setosa':
            iris_setosa.append(each_tuple[:4])

    kwargs = {
        'n_init': 5,
        'n_jobs': 3,  # depends on the number of cores in your machine.
        'n_clusters': 3,
    }
    kmeans = KMeans()
    kmeans.set_params(**kwargs)

    # Apply k-means separately to each species and keep the centroids.
    iris_setosa_centroids_indices = kmeans.fit_predict(np.array(iris_setosa))
    iris_setosa_centroids = kmeans.cluster_centers_
    iris_versicolor_centroids_indices = kmeans.fit_predict(
        np.array(iris_versicolor))
    iris_versicolor_centroids = kmeans.cluster_centers_
    iris_virginica_centroids_indices = kmeans.fit_predict(
        np.array(iris_virginica))
    iris_virginica_centroids = kmeans.cluster_centers_
    return (iris_setosa_centroids, iris_versicolor_centroids,
            iris_virginica_centroids)
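# A minimal usage sketch for categorise_dataset, assuming KMeans and np are
# the usual scikit-learn/NumPy imports. The rows below are invented
# iris-style 5-tuples (four measurements plus a species label). Note the
# snippet's n_jobs keyword requires an older scikit-learn release (the
# parameter was removed from KMeans in 0.25).
import numpy as np
from sklearn.cluster import KMeans

contents = [
    (5.1, 3.5, 1.4, 0.2, 'Iris-setosa'),
    (4.9, 3.0, 1.4, 0.2, 'Iris-setosa'),
    (4.7, 3.2, 1.3, 0.2, 'Iris-setosa'),
    (7.0, 3.2, 4.7, 1.4, 'Iris-versicolor'),
    (6.4, 3.2, 4.5, 1.5, 'Iris-versicolor'),
    (6.9, 3.1, 4.9, 1.5, 'Iris-versicolor'),
    (6.3, 3.3, 6.0, 2.5, 'Iris-virginica'),
    (5.8, 2.7, 5.1, 1.9, 'Iris-virginica'),
    (7.1, 3.0, 5.9, 2.1, 'Iris-virginica'),
]
setosa_c, versicolor_c, virginica_c = categorise_dataset(contents)
print(setosa_c.shape)  # (3, 4): three centroids per species in 4-D space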
def run_kmeans_2(X_train, X_test, y_test, dataset):
    # Find k via the elbow method.
    cluster_counts = {
        'wine': 3,
        'wage': 2,
    }
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 100),
                                  metric='calinski_harabasz', timings=True)
    visualizer.fit(X_train)
    visualizer.show()
    plt.tight_layout()
    plt.savefig('plots/km_sl_' + dataset + '.png')

    # Validation against the held-out labels.
    model.set_params(n_clusters=cluster_counts[dataset])
    model.fit(X_train)
    score_fns = [
        v_measure_score,
        homogeneity_score,
        completeness_score,
    ]
    cluster_validation_df = pd.DataFrame()
    for score in score_fns:
        cluster_validation_df.loc[score.__name__, 'score'] = score(
            y_test[y_test.columns[0]], model.predict(X_test))
    print(cluster_validation_df)
def partition_cells_by_kmeans(data: AnnData, rep: str, n_jobs: int,
                              n_clusters: int, n_clusters2: int,
                              n_init: int, random_state: int) -> List[int]:
    start = time.time()

    n_jobs = effective_n_jobs(n_jobs)
    rep_key = "X_" + rep
    X = data.obsm[rep_key].astype("float64")

    # Coarse pass: one k-means over all cells.
    km = KMeans(n_clusters=n_clusters, n_jobs=n_jobs, n_init=n_init,
                random_state=random_state)
    km.fit(X)
    coarse = km.labels_.copy()

    # Fine pass: re-cluster each coarse cluster, one init each.
    km.set_params(n_init=1)
    labels = coarse.copy()
    base_sum = 0
    for i in range(n_clusters):
        idx = coarse == i
        nc = min(n_clusters2, idx.sum())
        km.set_params(n_clusters=nc)
        km.fit(X[idx, :])
        labels[idx] = base_sum + km.labels_
        base_sum += nc

    end = time.time()
    logger.info("partition_cells_by_kmeans finished in {:.2f}s.".format(
        end - start))

    return labels
def cluster(self, X, num_clusters, k_means=None):
    if k_means is None:
        k_means = KMeans()
    k_means.set_params(n_clusters=num_clusters)
    X_cluster_space = k_means.fit_transform(X)
    I = k_means.predict(X)
    return k_means, X_cluster_space, I
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`

    """
    random_state = check_random_state(self.random_state)

    # Compute the number of clusters needed
    if self.ratio == 'auto':
        num_samples = self.stats_c_[self.min_c_]
    else:
        num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

    # Create the clustering object
    kmeans = KMeans(n_clusters=num_samples, random_state=random_state)
    kmeans.set_params(**self.kwargs)

    # Start with the minority class
    X_min = X[y == self.min_c_]
    y_min = y[y == self.min_c_]

    # All the minority class samples will be preserved
    X_resampled = X_min.copy()
    y_resampled = y_min.copy()

    # Loop over the other classes, reducing each one to its centroids
    for key in self.stats_c_.keys():
        # If the minority class is up, skip it.
        if key == self.min_c_:
            continue

        # Find the centroids via k-means
        kmeans.fit(X[y == key])
        centroids = kmeans.cluster_centers_

        # Concatenate to the minority class
        X_resampled = np.concatenate((X_resampled, centroids), axis=0)
        y_resampled = np.concatenate(
            (y_resampled, np.array([key] * num_samples)), axis=0)

    self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

    return X_resampled, y_resampled
def kMeansScore(data, allKs=[1]):
    # K-MEANS
    km = KMeans()
    ks = []
    kmScore = []
    for k in allKs:
        km.set_params(n_clusters=k)
        ks.append(k)
        km.fit(data)
        kmScore.append(-km.inertia_)
    return ks, kmScore
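# A short usage sketch for kMeansScore, plotting the negated-inertia elbow
# curve; the blob data and the matplotlib plotting are illustrative
# assumptions, not part of the original snippet.
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=4, random_state=0)

ks, km_score = kMeansScore(data, allKs=range(1, 11))
plt.plot(ks, km_score, marker='o')
plt.xlabel('k')
plt.ylabel('-inertia')
plt.title('Elbow curve')
plt.show()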
def cluster_pieces(array, k_start=3, k_stop=15):
    '''Cluster pieces of the puzzle.'''
    km = KMeans()
    param_grid_km = {'n_clusters': np.arange(k_start, k_stop),
                     'algorithm': ['full', 'elkan']}
    km_cv = GridSearchCV(km, param_grid_km, cv=5).fit(array)
    print(f'Best parameters for KMeans model: {km_cv.best_params_}')
    # set_params returns the estimator itself, so the fit call can be chained.
    km.set_params(**km_cv.best_params_).fit(array)
    return km
def evaluate_kmeans(X, y, problem, out='./results/Clustering/'):
    """Evaluate both k-means and EM (GM) clustering on the resampled data."""
    sm = SMOTE()
    X_res, y_res = sm.fit_sample(X, y)
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    distort_km = []
    distort_gm = []
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = KMeans(random_state=5)
    gm = GM(random_state=5)

    st = clock()
    clusters = [2, 3, 4, 5, 6]
    for k in clusters:
        print('now doing k=' + str(k))
        km.set_params(n_clusters=k)
        gm.set_params(n_components=k)
        km.fit(X_res)
        gm.fit(X_res)
        # distort_km.append(sum(np.min(cdist(X, km.cluster_centers_,
        #     'euclidean'), axis=1)) / X.shape[0])
        # distort_gm.append(sum(np.min(cdist(X, gm.cluster_centers_,
        #     'euclidean'), axis=1)) / X.shape[0])
        SSE[k][problem] = km.score(X_res)
        ll[k][problem] = gm.score(X_res)
        print('km score:', SSE[k][problem])
        print('gm score:', ll[k][problem])
        acc[k][problem]['Kmeans'] = cluster_acc(y_res, km.predict(X_res))
        acc[k][problem]['GM'] = cluster_acc(y_res, gm.predict(X_res))
        adjMI[k][problem]['Kmeans'] = metrics.adjusted_mutual_info_score(
            y_res, km.predict(X_res))
        adjMI[k][problem]['GM'] = metrics.adjusted_mutual_info_score(
            y_res, gm.predict(X_res))
        print(k, clock() - st)

    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    # Note: pd.Panel and .ix require pandas < 1.0.
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)
    SSE.to_csv(out + problem + ' SSE.csv')
    ll.to_csv(out + problem + ' logliklihood.csv')
    acc.ix[:, :, problem].to_csv(out + problem + ' acc.csv')
    adjMI.ix[:, :, problem].to_csv(out + problem + ' adjMI.csv')
    return SSE, ll, acc, adjMI, km, gm
def test_cluster_purity(X, Y, X_test=None, Y_test=None, label_names=None,
                        mnist=False):
    num_clusters = 4
    num_dims = None
    if mnist and num_dims is not None:
        pca = PCA(n_components=num_dims)
        X_orig = X
        X_pca = pca.fit_transform(X)
        X = X_pca
        print('PCA dims=' + str(num_dims))
    k_means = KMeans()
    k_means.set_params(n_clusters=num_clusters)
    print_cluster_purity(k_means, X, Y)
def cluster_sweep(features, labels, seed, save_plot, show_plot,
                  n_clusters=20, step=1):
    """
    Performs a sweep across different numbers of clusters to determine the
    optimal number of clusters for classification on this data set.
    """
    cluster_analysis = []
    model = KMeans()

    # Sweeps through the range of cluster counts, starting at 10 because of
    # the 10 initial classes.
    print("Performing sweep of clusters...")
    with click.progressbar(range(10, n_clusters + 1, step)) as cluster_range:
        for cluster_size in cluster_range:
            model.set_params(n_clusters=cluster_size)
            # `seed` is expected to be a full RNG state tuple (as returned
            # by numpy.random.get_state()), not a plain integer seed.
            numpy.random.set_state(seed)
            predictions = model.fit_predict(features)
            cluster_analysis.append(
                (cluster_size, score_clustering(labels, predictions)))

    # Plots the results from the cluster sweep
    data = list(zip(*[(x, *y.values()) for x, y in cluster_analysis]))
    if show_plot or save_plot:
        name = "Cluster"
        handles = plt.plot(data[0], data[3], '-b', label=name + " V Score")
        handles += plt.plot(data[0], data[4], '--b', label=name + " Rand")
        plt.legend(handles=handles, loc="lower left")
        plt.xlabel("Number of Clusters")
        plt.title("Performance Comparison with K-Clusters")
        if save_plot is not None:
            path = os.path.join(save_plot, "cluster_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Returns the best-scoring number of clusters based only on the
    # adjusted_random_score, because it is more reliable for high numbers
    # of clusters.
    score = [
        rand for _, _, _, rand in
        [scores.values() for (_, scores) in cluster_analysis]
    ]
    score = numpy.argmax(score) + 10
    print(f"Best performance out of {n_clusters} clusters: {score}")
    return score
def kmeans_training(self, X, num_clusters):
    """
    in : - X < [studentID, x, y] >
         - num_clusters
    out: - < [studentID, x, y, clusterID] >
    """
    model = KMeans()
    model.set_params(n_clusters=num_clusters)
    # No need for the studentID column during training
    model.fit(X.T[1:].T)
    students_clusters = model.labels_
    # Add a final column with the cluster labels
    output = np.zeros((len(X), len(X[0]) + 1))
    output[:, :-1] = X
    output.T[-1] = students_clusters
    return output
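# A toy invocation sketch for kmeans_training. The method lives on some
# class, so `helper` below is a hypothetical instance, and the student
# coordinates are invented for illustration.
import numpy as np

X = np.array([
    [101, 0.0, 0.1],
    [102, 0.2, 0.0],
    [201, 5.0, 5.1],
    [202, 5.2, 4.9],
])
labelled = helper.kmeans_training(X, num_clusters=2)
print(labelled)  # each row now ends with its clusterID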
def tester(self):
    meandist = []
    homogeneity_scores = []
    completeness_scores = []
    accuracy_scores = []
    silhouette_scores = []
    km = KMeans(max_iter=500, random_state=rand_state, init='k-means++')
    for k in self.num_clusters:
        km = km.set_params(n_clusters=k)
        km.fit_transform(self.data)
        predicts = km.labels_
        # Squared distance from each point to its nearest centroid.
        min_sq_dist = np.min(np.square(
            cdist(self.data, km.cluster_centers_, 'euclidean')), axis=1)
        meandist.append(np.mean(min_sq_dist))
        homogeneity_scores.append(
            metrics.homogeneity_score(self.target, predicts))
        completeness_scores.append(
            metrics.completeness_score(self.target, predicts))
        silhouette_scores.append(
            metrics.silhouette_score(self.data, predicts))
        y_pred = cluster_predictions(self.target, predicts)
        accuracy_scores.append(metrics.accuracy_score(self.target, y_pred))

    df_sil = pd.DataFrame(silhouette_scores)
    df_acc = pd.DataFrame(accuracy_scores)
    df_sil.to_csv('../data/plots/' + self.title + '-KM-silhouette_scores.csv')
    df_acc.to_csv('../data/plots/' + self.title + '-KM-accuracy_scores.csv')

    if self.gen_plot:
        self.plot(meandist, homogeneity_scores, completeness_scores)
def resample(self):
    '''
    :param ratio: number of majority-class cluster centroids to keep,
        expressed as a ratio of the number of minority-class samples
    :param n_jobs: number of threads to use if possible
    :param kargs: extra parameters forwarded to the KMeans estimator
    :return: the under-sampled data and labels
    '''
    # Create the clustering object
    from sklearn.cluster import KMeans
    kmeans = KMeans(random_state=self.rs)
    kmeans.set_params(**self.kargs)

    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Loop over the other classes, reducing each one to its centroids
    print('Finding cluster centroids...', end="")
    for key in self.ucd.keys():
        # If the minority class is up, skip it.
        if key == self.minc:
            continue

        # Cap the number of clusters at the number of samples in the class
        if self.ratio * self.ucd[self.minc] > self.ucd[key]:
            nclusters = self.ucd[key]
        else:
            nclusters = int(self.ratio * self.ucd[self.minc])

        # Set the number of clusters and find the centroids
        kmeans.set_params(n_clusters=nclusters)
        kmeans.fit(self.x[self.y == key])
        centroids = kmeans.cluster_centers_

        # Concatenate to the minority class
        underx = concatenate((underx, centroids), axis=0)
        undery = concatenate((undery, ones(nclusters) * key), axis=0)
        print(".", end="")
    print("done!")

    return underx, undery
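# To make the ratio concrete: with ratio=1.0 and 100 minority samples, a
# majority class of 900 samples is replaced by int(1.0 * 100) = 100 k-means
# centroids, while a class of only 60 samples is capped at 60 clusters,
# i.e. it is left essentially unchanged.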
def run_kmeans(X_train, X_test, y_train, y_test):
    LOGGER.info('kmeans, train: {}, test: {}'.format(X_train.shape[0],
                                                     X_test.shape[0]))
    # max_clusters = 7
    # clusters = [2**x for x in range(1, max_clusters)]
    clusters = [x for x in range(1, 100)]
    # split_ratio = 0.33
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=split_ratio, random_state=0)
    # LOGGER.debug('train test split: {}'.format(split_ratio))

    model = KMeans(random_state=0)
    # kms = [KMeans(n_clusters=i) for i in clusters]
    # choose_scores = [kms[i].fit(X_train).score(X) for i in range(len(kms))]

    # Validation
    score_fns = [
        # mutual_info_score,
        v_measure_score,
        homogeneity_score,
        completeness_score,
        # adjusted_mutual_info_score,
        # calinski_harabasz_score,
    ]
    # validation_score = pd.DataFrame(
    #     index=clusters,
    #     columns=sum([[score.__name__ + '_train', score.__name__ + '_test']
    #                  for score in score_fns], []))
    validation_score = pd.DataFrame(index=clusters)
    choose_score = pd.DataFrame(index=clusters, columns=['score'])
    for k in clusters:
        # model = KMeans(random_state=0)
        LOGGER.debug('clusters: {}'.format(k))
        model.set_params(n_clusters=k)
        model.fit(X_train)
        sse_score = model.score(X_train)
        choose_score.loc[k, 'score'] = sse_score
        for score in score_fns:
            LOGGER.debug('evaluation: {}'.format(score.__name__))
            # validation_score.loc[k, score.__name__ + '_train'] = score(
            #     y_train[y_train.columns[0]], model.predict(X_train))
            validation_score.loc[k, score.__name__ + '_test'] = score(
                y_test[y_test.columns[0]], model.predict(X_test))
        # validation_score.loc[k, 'k'] = k
    return validation_score, choose_score, model
def resample(self):
    """Under-sample the majority classes by replacing them with k-means
    cluster centroids.

    :return: the under-sampled data and labels
    """
    # Compute the ratio if it is auto
    if self.ratio == 'auto':
        self.ratio = 1.

    # Create the clustering object
    from sklearn.cluster import KMeans
    kmeans = KMeans(random_state=self.rs)
    kmeans.set_params(**self.kwargs)

    # Start with the minority class
    underx = self.x[self.y == self.minc]
    undery = self.y[self.y == self.minc]

    # Loop over the other classes, reducing each one to its centroids
    for key in self.ucd.keys():
        # If the minority class is up, skip it.
        if key == self.minc:
            continue

        # Set the number of clusters to be no more than the number of
        # samples
        if self.ratio * self.ucd[self.minc] > self.ucd[key]:
            n_clusters = self.ucd[key]
        else:
            n_clusters = int(self.ratio * self.ucd[self.minc])

        # Set the number of clusters and find the centroids
        kmeans.set_params(n_clusters=n_clusters)
        kmeans.fit(self.x[self.y == key])
        centroids = kmeans.cluster_centers_

        # Concatenate to the minority class
        underx = concatenate((underx, centroids), axis=0)
        undery = concatenate((undery, ones(n_clusters) * key), axis=0)

    if self.verbose:
        print("Under-sampling performed: " + str(Counter(undery)))

    return underx, undery
def partition_cells_by_kmeans(
    X: np.ndarray,
    n_clusters: int,
    n_clusters2: int,
    n_init: int,
    n_jobs: int,
    random_state: int,
    min_avg_cells_per_final_cluster: Optional[int] = 10,
) -> List[int]:
    n_clusters = min(n_clusters,
                     max(X.shape[0] // min_avg_cells_per_final_cluster, 1))
    if n_clusters == 1:
        return np.zeros(X.shape[0], dtype=np.int32)

    n_jobs = eff_n_jobs(n_jobs)

    kmeans_params = {
        'n_clusters': n_clusters,
        'n_init': n_init,
        'random_state': random_state,
    }
    km = KMeans(**kmeans_params)
    with threadpool_limits(limits=n_jobs):
        km.fit(X)
        coarse = km.labels_.copy()

        # Second pass: split each coarse cluster further, one init each.
        km.set_params(n_init=1)
        labels = coarse.copy()
        base_sum = 0
        for i in range(n_clusters):
            idx = coarse == i
            nc = min(n_clusters2,
                     max(idx.sum() // min_avg_cells_per_final_cluster, 1))
            if nc == 1:
                labels[idx] = base_sum
            else:
                km.set_params(n_clusters=nc)
                km.fit(X[idx, :])
                labels[idx] = base_sum + km.labels_
            base_sum += nc

    return labels
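# A quick sketch of calling this second variant on random data. The helpers
# eff_n_jobs and threadpool_limits come from the snippet's own context
# (threadpoolctl and its host package); the shapes below are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 20)).astype(np.float64)  # 500 "cells", 20 features

labels = partition_cells_by_kmeans(
    X, n_clusters=10, n_clusters2=5, n_init=10, n_jobs=2, random_state=0)
print(len(np.unique(labels)))  # at most 10 * 5 fine clusters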
def plot_selected_clusternumber_silhouette_scores(data, k_min, k_max):
    print("Tuning Silhouette: ")
    # Hand-picked candidate cluster counts, superseded by the range below:
    # parameters = [18, 19, 20, 23, 25, 28, 19, 30, 31, 32, 33, 34, 35, 40,
    #               41, 45, 46]
    parameters = range(k_min, k_max)
    # Instantiate ParameterGrid, passing the number of clusters as input
    parameter_grid = ParameterGrid({'n_clusters': parameters})
    best_score = -1
    kmeans_model = KMeans(random_state=42)
    silhouette_scores = []
    # Evaluation based on silhouette_score
    for p in parameter_grid:
        kmeans_model.set_params(**p)  # set current hyperparameter
        # Fit on the data set; this finds clusters based on parameter p
        kmeans_model.fit(data)
        ss = silhouette_score(data, kmeans_model.labels_)
        silhouette_scores += [ss]
        # print('Parameter:', p, 'Score', ss)
        # Keep the p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
    # Plot the silhouette scores
    plt.figure(figsize=(10, 8))
    plt.bar(range(len(silhouette_scores)), list(silhouette_scores),
            align='center', color='#722f59', width=0.4)
    plt.xticks(range(len(silhouette_scores)), list(parameters))
    plt.title('Silhouette Score', fontweight='bold')
    plt.xlabel('Number of Clusters', fontsize=14)
    plt.show()
def elbow_method(self, X, max_num_clusters):
    """
    in : - X < [studentID, x, y] >
         - max_num_clusters
    out: - scores plotted for different numbers of clusters
    """
    # Remove the studentID column
    X = X.T[1:].T
    model = KMeans()
    k_clusters = np.arange(1, max_num_clusters)
    scores = []
    for k in k_clusters:
        model.set_params(n_clusters=k)
        model.fit(X)
        scores.append(model.score(X))
    plt.plot(k_clusters, scores)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Scores')
    plt.show()
def kMeansScore(data, allKs=[1], datasetType=None, target=None):
    # K-MEANS
    km = KMeans()
    ks = []
    kmScore = []
    f = open('plots/carPlots/KMeanClusterStats_' + datasetType + '.txt', 'w')
    targetLabels, targetStats = np.unique(target, return_counts=True)
    f.write("Data Target Labels: " + str(targetLabels) + "\n")
    f.write("Data Target Stats: " + str(targetStats) + "\n\n\n")
    for k in allKs:
        km.set_params(n_clusters=k)
        ks.append(k)
        km.fit(data)
        kmScore.append(-km.inertia_)
        labels, stats = np.unique(km.labels_, return_counts=True)
        f.write("Cluster Stats For K = " + str(k) + "\n")
        f.write("Unique Labels: " + str(labels) + "\n")
        f.write("Stats Corresponding To The Labels: " + str(stats) + "\n\n")
    f.close()
    return ks, kmScore
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under-samples the majority class by replacing a cluster of
    majority samples by the cluster centroid of a KMeans algorithm. This
    algorithm keeps N majority samples by fitting the KMeans algorithm with
    N clusters to the majority class and using the coordinates of the N
    cluster centroids as the new majority samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using
          the clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default on ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be removed
           in 0.6.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})

    """

    def __init__(self,
                 sampling_strategy='auto',
                 random_state=None,
                 estimator=None,
                 voting='auto',
                 n_jobs=1,
                 ratio=None):
        super().__init__(sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = clone(self.estimator)
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(
                centroids, return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)

        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)

    def _more_tags(self):
        return {'sample_indices': False}
from sklearn.neighbors.nearest_centroid import NearestCentroid

km = KMeans(random_state=6, n_init=10)
gmm = GaussianMixture(random_state=6, n_init=1)


def plot_sil(X, mod, t):
    mod.fit(X)
    mod.score(X)
    pred = mod.predict(X)
    clf = NearestCentroid()
    clf.fit(X, pred)
    plot_silhouettes(X, pred, clf.centroids_, title=t)


km.set_params(n_clusters=5)
t = "Silhouette Analysis, Titanic k-Means with n_clusters = %d" % km.n_clusters
X = datasetsPCA['Titanic']['X_train']
plot_sil(X, km, t)

gmm.set_params(n_components=15)
t = "Post-PCA Silhouette Analysis, Titanic GMM with %d components" % gmm.n_components
plot_sil(X, gmm, t)
# plt.gca().set_xlim([-0.6, 1.5])

km.set_params(n_clusters=6)
t = "Silhouette Analysis, Wilt k-Means with n_clusters = %d" % km.n_clusters
X = datasetsPCA['Wilt']['X_train']
plot_sil(X, km, t)

gmm.set_params(n_components=6)
t = "Post-PCA Silhouette Analysis, Wilt GMM with %d components" % gmm.n_components
class ClusterCentroids(BaseMulticlassSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under-samples the majority class by replacing a cluster of
    majority samples by the cluster centroid of a KMeans algorithm. This
    algorithm keeps N majority samples by fitting the KMeans algorithm with
    N clusters to the majority class and using the coordinates of the N
    cluster centroids as the new majority samples.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number of
        samples in the minority class over the number of samples in the
        majority class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    estimator : object, optional(default=KMeans())
        Pass a `sklearn.cluster.KMeans` estimator.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Attributes
    ----------
    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    This class supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 100, 1: 100})

    """

    def __init__(self, ratio='auto', random_state=None, estimator=None,
                 n_jobs=1):
        super(ClusterCentroids, self).__init__(
            ratio=ratio, random_state=random_state)
        self.estimator = estimator
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def fit(self, X, y):
        """Find the class statistics before performing sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        super(ClusterCentroids, self).fit(X, y)
        self._validate_estimator()

        return self

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Compute the number of clusters needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Set the number of clusters for the estimator
        self.estimator_.set_params(**{'n_clusters': num_samples})

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes, reducing each one to its centroids
        for key in self.stats_c_.keys():
            # If the minority class is up, skip it.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            self.estimator_.fit(X[y == key])
            centroids = self.estimator_.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate(
                (y_resampled, np.array([key] * num_samples)), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        return X_resampled, y_resampled
class KMeans(ModelBase):
    """
    KMeans: Fits an Sklearn KMeans model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters, K

    cluster_inertia_ : float
        Sum of squared distances of samples to their closest cluster center

    cluster_labels_ : array, [n_clusters_]
        Labels indicating the membership of each point

    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers

    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X

    sample_distances_ : array, [n_samples]
        The distance between each sample point and its cluster's center

    Constants
    ---------
    SAMPLE_CUTOFF_ : int
        If n_samples > SAMPLE_CUTOFF_ then sample distances are NOT recorded
    """
    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a Sklearn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        K : int
            The number of clusters.

        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.

        estimator_params : dict, optional
            The parameters to pass to the KMeans estimators.

        Returns
        -------
        self
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)

        # Compute the KMeans model
        self.model_.fit(X)

        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K

        # Record each sample's label and distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[
                self.cluster_labels_ == cluster_label]
            centroid = np.expand_dims(
                self.cluster_centers_[cluster_label], axis=0)

            # "All clusters must have at least 1 member!"
            if member_rows.shape[0] == 0:
                return None

            # Calculate the distance between each member row and the current
            # cluster centroid
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid,
                                 "euclidean").squeeze()
                dist_labels.append(label)

            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]

            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }

        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located
        closest to their cluster's center.

        Returns
        -------
        closest_samples : list
            A list of the sample labels that are located the closest to
            their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")

        return [
            samples['sample_labels'][0]
            for samples in list(self.sample_distances_.values())
        ]

    def get_memberships(self):
        '''Return the memberships in each cluster'''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict([
                (l, d) for l, d in
                zip(samples["sample_labels"], samples["distances"])
            ])
        return json.dumps(memberships, indent=4)
@pytest.mark.parametrize(
    'X,y,oversampler',
    [
        (
            np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0),
                      (4.0, 4.0)]),
            np.array([0, 0, 1, 1, 1]),
            ClusterOverSampler(
                oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE)),
        ),
        (
            np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0),
                      (4.0, 4.0)]),
            np.array([0, 0, 1, 1, 1]),
            ClusterOverSampler(
                oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE),
                clusterer=CLUSTERER.set_params(n_clusters=3),
                random_state=RANDOM_STATE,
            ),
        ),
    ],
)
def test_fit_resample_intra_corner_cases(X, y, oversampler):
    """Test the fit_resample method for various corner cases and
    oversamplers."""
    X_res, y_res = oversampler.fit_resample(X, y)
    y_count = Counter(y_res)
    assert y_count[0] == y_count[1]
    assert X.item(0, 0) <= X_res.item(-1, 0) <= X.item(1, 0)
    assert X.item(0, 1) <= X_res.item(-1, 1) <= X.item(1, 1)
def fit(self, X, y=None, **kwargs):
    """Fit the encoder on a collection of data, e.g. image patches.

    Parameters
    ----------
    X: array-like, shape: n_samples, n_features
        the patch data to be fitted

    Returns
    -------
    self: object
        Returns the object itself
    """
    X = np.atleast_2d(X)
    n_samples, n_features = X.shape

    # normalize each patch individually
    if self.local_contrast:
        if self.verbose:
            print("Local contrast normalization of the data")
        X = self.local_contrast_normalization(X)

    # kmeans model to find the filters
    if self.verbose:
        print("About to extract atoms from %d samples" % n_samples)
    kmeans = KMeans(n_clusters=self.n_atoms, init='k-means++',
                    max_iter=self.max_iter, n_init=self.n_init,
                    tol=self.tol, verbose=self.verbose)

    if self.whiten:
        if self.verbose:
            print("Whitening PCA of the samples")
        self.pca = pca = PCA(whiten=True, n_components=self.n_components)
        pca.fit(X)
        X = pca.transform(X)

        # compute the KMeans centers
        if 0 < self.n_prefit < pca.n_components:
            if self.verbose:
                print("First KMeans in simplified curriculum space")
            # start the kmeans on a projection to the first singular
            # components: curriculum learning trick by Andrej Karpathy
            kmeans.fit(X[:, :self.n_prefit])

            # warm restart by padding previous centroids with zeros,
            # with full dimensionality this time
            kmeans.init = np.zeros((self.n_atoms, pca.n_components),
                                   dtype=kmeans.cluster_centers_.dtype)
            kmeans.init[:, :self.n_prefit] = kmeans.cluster_centers_
            if self.verbose:
                print("Second KMeans in full whitened sample space")
            kmeans.set_params(n_init=1).fit(X)
        else:
            if self.verbose:
                print("KMeans in full original sample space")
            # regular kmeans fit (without the curriculum trick)
            kmeans.fit(X)

        # project the centers back into the original, non-whitened space
        # (useful for qualitative inspection of the filters)
        self.components_ = self.pca.inverse_transform(kmeans.cluster_centers_)
    else:
        # find the kernel in the raw original dimensional space
        # TODO: experiment with component wise scaling too
        self.pca = None
        kmeans.fit(X)
        self.components_ = kmeans.cluster_centers_

    self.kmeans = kmeans
    self.inertia_ = kmeans.inertia_
    return self
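# The "curriculum" warm start above can be reproduced in isolation. A
# minimal sketch, assuming scikit-learn's KMeans, which accepts an ndarray
# as init when its shape matches (n_clusters, n_features); the data and
# dimensions are invented.
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 64))
n_atoms, n_prefit = 16, 8

# First pass: cluster on the leading 8 dimensions only.
km = KMeans(n_clusters=n_atoms, n_init=10,
            random_state=0).fit(X[:, :n_prefit])

# Warm start: pad the low-dimensional centroids with zeros and refit once.
init = np.zeros((n_atoms, X.shape[1]))
init[:, :n_prefit] = km.cluster_centers_
km_full = KMeans(n_clusters=n_atoms, init=init, n_init=1).fit(X)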
class ClusterCentroids(BaseMulticlassSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Experimental method that under-samples the majority class by replacing a
    cluster of majority samples by the cluster centroid of a KMeans
    algorithm. This algorithm keeps N majority samples by fitting the KMeans
    algorithm with N clusters to the majority class and using the coordinates
    of the N cluster centroids as the new majority samples.

    Parameters
    ----------
    ratio : str or float, optional (default='auto')
        If 'auto', the ratio will be defined automatically to balance
        the dataset. Otherwise, the ratio is defined as the number of
        samples in the minority class over the number of samples in the
        majority class.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by np.random.

    estimator : object, optional(default=KMeans())
        Pass a `sklearn.cluster.KMeans` estimator.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Attributes
    ----------
    min_c_ : str or int
        The identifier of the minority class.

    max_c_ : str or int
        The identifier of the majority class.

    stats_c_ : dict of str/int : int
        A dictionary in which the number of occurrences of each class is
        reported.

    X_shape_ : tuple of int
        Shape of the data `X` during fitting.

    Notes
    -----
    This class supports multi-class resampling.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    Resampled dataset shape Counter({0: 100, 1: 100})

    """

    def __init__(self, ratio='auto', random_state=None, estimator=None,
                 n_jobs=1):
        super(ClusterCentroids, self).__init__(
            ratio=ratio, random_state=random_state)
        self.estimator = estimator
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.')

    def fit(self, X, y):
        """Find the class statistics before performing sampling.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        self : object,
            Return self.

        """
        super(ClusterCentroids, self).fit(X, y)
        self._validate_estimator()

        return self

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new)
            The corresponding label of `X_resampled`

        """
        # Compute the number of clusters needed
        if self.ratio == 'auto':
            num_samples = self.stats_c_[self.min_c_]
        else:
            num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

        # Set the number of clusters for the estimator
        self.estimator_.set_params(**{'n_clusters': num_samples})

        # Start with the minority class
        X_min = X[y == self.min_c_]
        y_min = y[y == self.min_c_]

        # All the minority class samples will be preserved
        X_resampled = X_min.copy()
        y_resampled = y_min.copy()

        # Loop over the other classes, reducing each one to its centroids
        for key in self.stats_c_.keys():
            # If the minority class is up, skip it.
            if key == self.min_c_:
                continue

            # Find the centroids via k-means
            self.estimator_.fit(X[y == key])
            centroids = self.estimator_.cluster_centers_

            # Concatenate to the minority class
            X_resampled = np.concatenate((X_resampled, centroids), axis=0)
            y_resampled = np.concatenate(
                (y_resampled, np.array([key] * num_samples)), axis=0)

        self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

        return X_resampled, y_resampled
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on
    clustering methods.

    Method that under-samples the majority class by replacing a cluster of
    majority samples by the cluster centroid of a KMeans algorithm. This
    algorithm keeps N majority samples by fitting the KMeans algorithm with
    N clusters to the majority class and using the coordinates of the N
    cluster centroids as the new majority samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    ratio : str, dict, or callable, optional (default='auto')
        Ratio to use for resampling the data set.

        - If ``str``, has to be one of: (i) ``'minority'``: resample the
          minority class; (ii) ``'majority'``: resample the majority class,
          (iii) ``'not minority'``: resample all classes apart of the minority
          class, (iv) ``'all'``: resample all classes, and (v) ``'auto'``:
          equivalent to ``'all'`` for over-sampling methods and ``'not
          minority'`` for under-sampling methods. The classes targeted will
          be over-sampled or under-sampled to achieve an equal number of
          samples with the majority or minority class.
        - If ``dict``, the keys correspond to the targeted classes. The
          values correspond to the desired number of samples.
        - If callable, function taking ``y`` and returns a ``dict``. The keys
          correspond to the targeted classes. The values correspond to the
          desired number of samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, ``random_state`` is the seed used by the random number
        generator; If ``RandomState`` instance, random_state is the random
        number generator; If ``None``, the random number generator is the
        ``RandomState`` instance used by ``np.random``.

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found using
          the clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm will
          be used.
        - If ``'auto'``, if the input is sparse, it will default on ``'hard'``
          otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    See
    :ref:`sphx_glr_auto_examples_under-sampling_plot_cluster_centroids.py`.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape {}'.format(Counter(y)))
    Original dataset shape Counter({1: 900, 0: 100})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_sample(X, y)
    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({...})

    """

    def __init__(self, ratio='auto', random_state=None, estimator=None,
                 voting='auto', n_jobs=1):
        super(ClusterCentroids, self).__init__(ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = self.estimator
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(
                centroids, return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0])

        return X_new, y_new

    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                n_samples = self.ratio_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled)
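# A note on the voting design: 'soft' emits the synthetic k-means centroids
# themselves, which would densify a sparse X, so sparse inputs default to
# 'hard', which keeps the real sample nearest each centroid and therefore
# preserves sparsity and the original feature values. A small sketch of the
# two modes side by side, using the fit_sample API shown in this snippet's
# docstring (renamed fit_resample in later imblearn releases); the data is
# invented.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=200, random_state=0)
for voting in ('soft', 'hard'):
    cc = ClusterCentroids(random_state=42, voting=voting)
    X_res, y_res = cc.fit_sample(X, y)
    print(voting, Counter(y_res))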
dim = [2, 3, 4, 5]
km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = PCA(n_components=i,
                    random_state=42).fit_transform(X_train_scaled)
    k = 30
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
    gmm.fit(reduced_X)
    S_homog['km'].append(
        metrics.homogeneity_score(labels, km.predict(reduced_X)))
    S_homog['gmm'].append(
        metrics.homogeneity_score(labels, gmm.predict(reduced_X)))
    S_adjMI['km'].append(
        metrics.adjusted_mutual_info_score(labels, km.predict(reduced_X)))
    S_adjMI['gmm'].append(
        metrics.adjusted_mutual_info_score(labels, gmm.predict(reduced_X)))
    S_vm['km'].append(metrics.v_measure_score(labels, km.predict(reduced_X)))
    S_vm['gmm'].append(metrics.v_measure_score(labels, gmm.predict(reduced_X)))

# plt.legend(['Train', 'Test'], loc='lower right')
class assignment4:

    def __init__(self):
        # data processing
        self.dataSetPath = './data_set/'
        self.dataSetName = ""
        self.csv_delimiter = ','
        self.data = None
        self.allFeatures = []
        self.allTarget = []
        # not used
        self.XTrain = None
        self.XTest = None
        self.YTrain = None
        self.YTest = None
        # k-means clustering
        self.kNum = range(1, 21)
        self.kmean = None
        self.kmeanRD = None
        # expectation maximization
        self.em = None
        self.emRD = None
        # PCA
        self.pca = None
        self.pcaDims = range(1, 21)
        # ICA
        self.icaDims = range(1, 21)
        self.ica = None
        # RP
        self.rp = None
        self.rpDims = range(1, 21)
        # TSVD
        self.tsvd = None
        self.tsvdDims = range(1, 10)

    def read_data_voice(self, dataName):
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=self.csv_delimiter)
            self.data = list(reader)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.data)))
        print('Number of attributes: {}'.format(len(self.data[0]) - 1))

    def read_data_haptX(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        print(len(self.data))
        for elim in self.data:
            feature = []
            for i in elim:
                feature.append(i)
            self.allFeatures.append(feature)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allFeatures)))
        print('Number of attributes: {}'.format(len(self.allFeatures[0])))

    def read_data_haptY(self, dataName):
        self.data = None
        with open(self.dataSetPath + dataName, 'r', encoding="utf8") as file:
            reader = csv.reader(file, delimiter=',')
            self.data = list(reader)
        for elim in self.data:
            self.allTarget.append(elim)
        print("Reading data set: '{}'".format(self.dataSetPath + dataName))
        print('Number of instances: {}'.format(len(self.allTarget)))
        print('Number of attributes: {}'.format(len(self.allTarget[0])))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.allTarget = self.allTarget.ravel()

    def split_data_to_train_test(self, testSize=0.3):
        # in case the data sets are very different in format
        sample_len = len(self.data[0])
        for elem in self.data:
            feature = elem[0:sample_len - 1]
            feature_vector = []
            for f in feature:
                feature_vector.append(float(f))
            self.allFeatures.append(feature_vector)
            if elem[-1] == '0':
                val = 0
            else:
                val = 1
            self.allTarget.append(float(val))
        self.allFeatures = np.asarray(self.allFeatures, dtype=np.float32)
        self.allTarget = np.asarray(self.allTarget, dtype=np.float32)
        self.XTrain, self.XTest, self.YTrain, self.YTest = train_test_split(
            self.allFeatures, self.allTarget, test_size=testSize,
            random_state=42)
        print('Total X train data -> {}%'.format(
            int((len(self.XTrain) / len(self.data)) * 100)),
            'Size:', len(self.XTrain))
        print('Total X test data -> {}%'.format(
            int((len(self.XTest) / len(self.data)) * 100)),
            'Size:', len(self.XTest))
        print('Total Y train data -> {}%'.format(
            int((len(self.YTrain) / len(self.data)) * 100)),
            'Size:', len(self.YTrain))
        print('Total Y test data -> {}%'.format(
            int((len(self.YTest) / len(self.data)) * 100)),
            'Size:', len(self.YTest))

    def get_max_idx(self, input):
        maxVal = input[0]
        maxIdx = 0
        for i in range(1, len(input)):
            if input[i] > maxVal:
                maxIdx = i
                maxVal = input[i]
        return maxIdx

    def pairwiseDistCorr(self, X1, X2):
        assert X1.shape[0] == X2.shape[0]
        d1 = pairwise_distances(X1)
        d2 = pairwise_distances(X2)
        return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]

    def k_mean_cluster(self):
        print("-" * 50)
        print('{}: K-means clustering'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.kmean = KMeans(random_state=5, max_iter=1000)
        for i in self.kNum:
            self.kmean.set_params(n_clusters=i)
            self.kmean.fit(dataX)
            scores.append(
                sm.accuracy_score(self.allTarget, self.kmean.labels_))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, self.kmean.labels_))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:", confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('K-means Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_KMEAN.png'.format(self.dataSetName))
        print("-" * 50)

    def k_mean_cluster_reduced(self, n_clusters, reduced_data, name):
        print("-" * 50)
        print('{}: K-means clustering {}'.format(self.dataSetName, name))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.kmeanRD = KMeans(n_clusters=n_clusters, random_state=5,
                              max_iter=1000)
        self.kmeanRD.fit(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, self.kmeanRD.labels_)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, self.kmeanRD.labels_))
        print("-" * 50)

    def expectation_maximization_reduced(self, n_components, reduced_data,
                                         name):
        print("-" * 50)
        print('{}: Expectation maximization {}'.format(self.dataSetName,
                                                       name))
        self.emRD = GaussianMixture(n_components=n_components, random_state=5)
        self.emRD.fit(reduced_data)
        y_predict = self.emRD.predict(reduced_data)
        print("Accuracy score:{0:.2f}".format(
            sm.accuracy_score(self.allTarget, y_predict)))
        print("Confusion Matrix:")
        print(sm.confusion_matrix(self.allTarget, y_predict))
        print("-" * 50)

    def expectation_maximization(self):
        print("-" * 50)
        print('{}: Expectation maximization'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        scores = []
        confusionMatrix = []
        self.em = GaussianMixture(random_state=5)
        for i in self.kNum:
            self.em.set_params(n_components=i)
            self.em.fit(dataX)
            y_predict = self.em.predict(dataX)
            scores.append(sm.accuracy_score(self.allTarget, y_predict))
            confusionMatrix.append(
                sm.confusion_matrix(self.allTarget, y_predict))
        bestScoreIdx = self.get_max_idx(scores)
        print("Accuracy score:{0:.2f}".format(scores[bestScoreIdx]))
        print("Confusion Matrix:")
        print(confusionMatrix[bestScoreIdx])
        plt.figure()
        plt.ylabel('Accuracy')
        plt.xlabel('# of Clusters')
        plt.title('Expectation Maximum Cluster ({})'.format(self.dataSetName))
        plt.style.context('seaborn-whitegrid')
        plt.xticks(self.kNum)
        plt.plot(self.kNum, scores)
        plt.grid()
        plt.draw()
        plt.savefig('./{}_EM.png'.format(self.dataSetName))
        print("-" * 50)

    def PCA(self):
        print("-" * 50)
        print('{}: Principal component analysis'.format(self.dataSetName))
        dataX = StandardScaler().fit_transform(self.allFeatures)
        self.pca = PCA(random_state=5)
        grid = {'pca__n_components': self.pcaDims}
        mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False,
                            random_state=5, hidden_layer_sizes=[17] * 11)
        pipe = Pipeline([('pca', self.pca), ('NN', mlp)])
        search = GridSearchCV(pipe, grid, verbose=2, cv=5)
        search.fit(dataX, self.allTarget)
        print("Best number PCA components:", search.best_params_)
        self.pca.fit(dataX)
        var = np.cumsum(
            np.round(self.pca.explained_variance_ratio_, decimals=3) * 100)
plt.figure() plt.ylabel('% Variance Explained') plt.xlabel('# of Features') plt.title('PCA Analysis ({})'.format(self.dataSetName)) plt.xticks(self.pcaDims) plt.style.context('seaborn-whitegrid') plt.plot(var) plt.grid() plt.draw() plt.savefig('./{}_PCA_VA.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('PCA Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.pcaDims) plt.ylim([0, 1]) plt.style.context('seaborn-whitegrid') plt.plot(self.pcaDims, search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_PCA_GS.png'.format(self.dataSetName)) print("-" * 50) def ICA(self): print("-" * 50) print('{}: Independent component analysis '.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) self.ica = FastICA(random_state=5, max_iter=6000) # kurtosis kurt = [] for dim in self.icaDims: self.ica.set_params(n_components=dim) tmp = self.ica.fit_transform(dataX) tmp = pd.DataFrame(tmp) tmp = tmp.kurt(axis=0) kurt.append(tmp.abs().mean()) # grid search grid = {'ica__n_components': self.icaDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('ica', self.ica), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number ICA components:", search.best_params_) plt.figure() plt.ylabel('Kurtosis') plt.xlabel('# of Features') plt.title('ICA Analysis ({})'.format(self.dataSetName)) plt.xticks(self.icaDims) plt.style.context('seaborn-whitegrid') plt.plot(kurt) plt.grid() plt.draw() plt.savefig('./{}_kurtosis.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('ICA Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.icaDims) plt.style.context('seaborn-whitegrid') plt.plot(self.icaDims, search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_ICA_GS.png'.format(self.dataSetName)) print("-" * 50) def RP(self): print("-" * 50) print('{}: Random Projection'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) disCorr = [] self.rp = SparseRandomProjection(random_state=5) for dim in self.rpDims: self.rp.set_params(n_components=dim) disCorr.append( self.pairwiseDistCorr(self.rp.fit_transform(dataX), dataX)) print(disCorr) # grid search grid = {'rp__n_components': self.rpDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('rp', self.rp), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number RP components:", search.best_params_) plt.figure() plt.ylabel('Distance') plt.xlabel('# of Features') plt.title('RP Analysis ({})'.format(self.dataSetName)) plt.xticks(self.rpDims) plt.style.context('seaborn-whitegrid') plt.plot(disCorr) plt.grid() plt.draw() plt.savefig('./{}_distance.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('RP Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.rpDims) plt.style.context('seaborn-whitegrid') plt.plot(search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_RP_GS.png'.format(self.dataSetName)) print("-" * 50) def TSVD(self): print("-" * 50) print('{}: TruncatedSVD'.format(self.dataSetName)) dataX = StandardScaler().fit_transform(self.allFeatures) self.tsvd = 
TruncatedSVD(random_state=5) # grid search grid = {'tsvd__n_components': self.tsvdDims} mlp = MLPClassifier(max_iter=2000, alpha=1e-5, early_stopping=False, random_state=5, hidden_layer_sizes=[17] * 11) pipe = Pipeline([('tsvd', self.tsvd), ('NN', mlp)]) search = GridSearchCV(pipe, grid, verbose=2, cv=5) search.fit(dataX, self.allTarget) print("Best number TSVD components:", search.best_params_) self.tsvd.fit(dataX) var = np.cumsum( np.round(self.tsvd.explained_variance_ratio_, decimals=3) * 100) plt.figure() plt.ylabel('% Variance Explained') plt.xlabel('# of Features') plt.title('TSVD Analysis ({})'.format(self.dataSetName)) plt.xticks(self.tsvdDims) plt.style.context('seaborn-whitegrid') plt.plot(var) plt.grid() plt.draw() plt.savefig('./{}_TSD_VA.png'.format(self.dataSetName)) plt.figure() plt.ylabel('Score') plt.xlabel('# of Features') plt.title('TSVD Analysis Grid Search ({})'.format(self.dataSetName)) plt.xticks(self.tsvdDims) plt.style.context('seaborn-whitegrid') plt.plot(search.cv_results_['mean_test_score']) plt.grid() plt.draw() plt.savefig('./{}_TSVD_GS.png'.format(self.dataSetName)) print("-" * 50)
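# A small, hedged aside on the accuracy numbers above: scoring raw k-means
# labels with accuracy_score implicitly assumes that cluster i happens to
# coincide with class i. A fairer sketch first aligns cluster labels to
# classes with the Hungarian algorithm over the confusion matrix.
# `align_cluster_labels` is a hypothetical helper, not part of the class
# above; it assumes classes and clusters are both labeled 0..K-1 with equal K.
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, confusion_matrix

def align_cluster_labels(y_true, cluster_labels):
    """Relabel clusters so they best match the true classes."""
    cm = confusion_matrix(y_true, cluster_labels)
    row_ind, col_ind = linear_sum_assignment(-cm)  # maximize the diagonal
    mapping = {cluster: cls for cls, cluster in zip(row_ind, col_ind)}
    return np.array([mapping[c] for c in cluster_labels])

# e.g. accuracy_score(self.allTarget,
#                     align_cluster_labels(self.allTarget, self.kmean.labels_))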
class KMeans(ModelBase):
    """
    KMeans: Fits a scikit-learn KMeans model to X.

    See also
    --------
    http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Attributes
    ----------
    n_clusters_ : int
        The number of clusters, K
    cluster_inertia_ : float
        Sum of squared distances of samples to their closest cluster center
    cluster_labels_ : array, [n_clusters_]
        Labels indicating the membership of each point
    cluster_centers_ : array, [n_clusters, n_features]
        Coordinates of cluster centers
    sample_labels_ : array, [n_samples]
        Labels for each of the samples in X
    sample_distances_ : array, [n_samples]
        The distance between each sample point and its cluster's center

    Constants
    ---------
    SAMPLE_CUTOFF_ : int
        If n_samples > SAMPLE_CUTOFF_ then sample distances are NOT recorded
    """
    SAMPLE_CUTOFF_ = 1000

    def __init__(self):
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    @property
    def cluster_inertia_(self):
        # Sum of squared distances of samples to their closest cluster center
        return None if self.model_ is None else \
            self.model_.inertia_

    @property
    def cluster_labels_(self):
        # Cluster membership labels for each point
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.labels_)

    @property
    def cluster_centers_(self):
        # Coordinates of the cluster centers
        return None if self.model_ is None else \
            copy.deepcopy(self.model_.cluster_centers_)

    def _reset(self):
        """Resets all attributes (erases the model)"""
        self.model_ = None
        self.n_clusters_ = None
        self.sample_labels_ = None
        self.sample_distances_ = None

    def fit(self, X, K, sample_labels=None, estimator_params=None):
        """Fits a scikit-learn KMeans model to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.
        K : int
            The number of clusters.
        sample_labels : array-like, shape (n_samples), optional
            Labels for each of the samples in X.
        estimator_params : dict, optional
            The parameters to pass to the KMeans estimator.

        Returns
        -------
        self
        """
        self._reset()
        # Note: previously set n_init=50
        self.model_ = SklearnKMeans(K)
        if estimator_params is not None:
            assert isinstance(estimator_params, dict)
            self.model_.set_params(**estimator_params)
        # Compute the KMeans model
        self.model_.fit(X)
        if sample_labels is None:
            sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
        assert len(sample_labels) == X.shape[0]
        self.sample_labels_ = np.array(sample_labels)
        self.n_clusters_ = K
        # Record each sample's label and distance from its cluster center
        self.sample_distances_ = OrderedDict()
        for cluster_label in range(self.n_clusters_):
            assert cluster_label not in self.sample_distances_
            member_rows = X[self.cluster_labels_ == cluster_label, :]
            member_labels = self.sample_labels_[self.cluster_labels_ ==
                                                cluster_label]
            centroid = np.expand_dims(self.cluster_centers_[cluster_label],
                                      axis=0)
            # All clusters must have at least 1 member; otherwise bail out
            if member_rows.shape[0] == 0:
                return None
            # Calculate the distance between each member row and the current
            # cluster center
            dists = np.empty(member_rows.shape[0])
            dist_labels = []
            for j, (row, label) in enumerate(zip(member_rows, member_labels)):
                dists[j] = cdist(np.expand_dims(row, axis=0), centroid,
                                 "euclidean").squeeze()
                dist_labels.append(label)
            # Sort the distances/labels in ascending order
            sort_order = np.argsort(dists)
            dists = dists[sort_order]
            dist_labels = np.array(dist_labels)[sort_order]
            self.sample_distances_[cluster_label] = {
                "sample_labels": dist_labels,
                "distances": dists,
            }
        return self

    def get_closest_samples(self):
        """Returns a list of the labels of the samples that are located
        closest to their cluster's center.

        Returns
        -------
        closest_samples : list
            A list of the sample labels that are located the closest to
            their cluster's center.
        """
        if self.sample_distances_ is None:
            raise Exception("No model has been fit yet!")
        return [samples['sample_labels'][0]
                for samples in list(self.sample_distances_.values())]

    def get_memberships(self):
        '''
        Return the memberships in each cluster
        '''
        memberships = OrderedDict()
        for cluster_label, samples in list(self.sample_distances_.items()):
            memberships[cluster_label] = OrderedDict(
                [(l, d) for l, d in zip(samples["sample_labels"],
                                        samples["distances"])])
        return json.dumps(memberships, indent=4)
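# Hypothetical usage sketch for the wrapper above; it assumes the module
# already provides the imports the class relies on (numpy as np, copy, json,
# OrderedDict, scipy's cdist, and SklearnKMeans aliased from sklearn).
import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 4))

km = KMeans()
km.fit(X, K=3, estimator_params={"n_init": 10, "random_state": 0})
print(km.cluster_inertia_)       # sum of squared distances to the centers
print(km.get_closest_samples())  # one representative sample label per cluster
print(km.get_memberships())      # JSON dump of per-cluster distances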
def k_clustering(ctx, sweep_features, sweep_variance, sweep_clusters):
    """
    K-means clustering function to be run on the dataset.
    Includes a simple analysis of the results.
    """
    save_plot = ctx.obj["save_plot"]
    show_plot = ctx.obj["show_plot"]
    print("loading data...")
    features, boolean_labels, labels = load_data(ctx.obj["data_folder"],
                                                 shuffle_seed=ctx.obj["seed"])
    n_samples, n_features = features.shape
    features_with_labels = features.copy()
    features_with_labels[n_features] = labels
    # Save seed for consistent runs for data analysis
    seed = numpy.random.get_state()
    # Run k-means clustering excluding the class attribute
    model = KMeans(n_clusters=10)
    print("running k-means clustering on all features except class...")
    numpy.random.set_state(seed)
    base_predictions = model.fit_predict(features)
    score_clustering(labels, base_predictions, print_score=True)
    # Run k-means clustering including the class attribute
    print("running k-means clustering on all features including class...")
    numpy.random.set_state(seed)
    score_clustering(labels, model.fit_predict(features_with_labels),
                     print_score=True)
    # Perform analytical sweeps of features, variance and clusters
    best_feature_n = None
    if sweep_features:
        best_feature_n = feature_sweep(features, boolean_labels, labels, seed,
                                       save_plot, show_plot, n_features=20)
    if sweep_variance:
        variance_sweep(features, labels, seed, save_plot, show_plot, step=500)
    best_cluster_n = None
    if sweep_clusters:
        best_cluster_n = cluster_sweep(features, labels, seed, save_plot,
                                       show_plot, n_clusters=50, step=1)
    # Plot the contingency matrix for the base prediction
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                base_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Base Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "base_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()
    # Run k-means clustering with the results from the sweep analysis
    print("Running k-means clustering with optimal settings")
    model.set_params(n_clusters=28)
    selector = SelectKBest(k=122)
    numpy.random.set_state(seed)
    optimal_predictions = model.fit_predict(
        selector.fit_transform(features, column_or_1d(labels)))
    score_clustering(labels, optimal_predictions, print_score=True)
    # Plot the contingency matrix for the optimal predictions
    matrix = metrics.cluster.contingency_matrix(column_or_1d(labels),
                                                optimal_predictions)
    plt.imshow(matrix, cmap="hot")
    plt.title("Optimal Prediction mapping centroids against class labels")
    plt.xlabel("Cluster Centroid Label")
    plt.ylabel("Actual Label")
    if save_plot is not None:
        path = os.path.join(save_plot, "optimal_prediction_matrix.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()
    # Print out the optimal results from the sweep analysis
    if best_feature_n:
        print(f"Ideal number of k-best features is {best_feature_n}.")
    if best_cluster_n:
        print(f"Ideal number of clusters is {best_cluster_n}.")
    print("Analysis Completed.")
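# `score_clustering` is defined elsewhere in this project; as a hedged
# stand-in, a minimal version built only on sklearn's label-based clustering
# metrics might look like the sketch below (the exact metrics the original
# reports are an assumption):
from sklearn import metrics
from sklearn.utils import column_or_1d

def score_clustering(labels, predictions, print_score=False):
    y = column_or_1d(labels)
    scores = {
        "adjusted_rand": metrics.adjusted_rand_score(y, predictions),
        "v_measure": metrics.v_measure_score(y, predictions),
        "homogeneity": metrics.homogeneity_score(y, predictions),
    }
    if print_score:
        for name, value in scores.items():
            print(f"{name}: {value:.3f}")
    return scores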
def plot_loss_vs_cluster_number(X: numpy.ndarray, k_min: int, k_max: int,
                                distance_function: Callable[
                                    [numpy.ndarray, numpy.ndarray], float],
                                *,
                                algorithm_parameters: Dict[str, Any] = None,
                                ax: Optional[axes.Axes] = None,
                                **kwargs) -> axes.Axes:
    """
    k-means requires you to decide the number of clusters ``k`` beforehand.
    This method runs the KMeans algorithm with an increasing number of
    clusters and uses the total magnitude (the sum of point-to-centroid
    distances) as the loss. Right now the method only works with
    ``sklearn.cluster.KMeans``.

    :param X: Training instances.
    :param k_min: The minimum cluster number.
    :param k_max: The maximum cluster number.
    :param distance_function: The function used to calculate the distance
           between an instance and its cluster center. The function receives
           two ndarrays, the first being the instance and the second the
           center, and returns a float representing the distance between
           them.
    :param algorithm_parameters: Parameters to use for the algorithm. If
           None, the default parameters of ``KMeans`` will be used.
    :param ax: Axes object to draw the plot onto, otherwise uses the current
           Axes.
    :param kwargs: Other keyword arguments. All other keyword arguments are
           passed to ``pandas.DataFrame.plot()``.
    :return: Returns the Axes object with the plot drawn onto it.
    """
    if algorithm_parameters is None:
        algorithm_parameters = KMeans().get_params()
    if "n_clusters" in algorithm_parameters:
        del algorithm_parameters["n_clusters"]
    if ax is None:
        pyplot.figure()
        ax = pyplot.gca()
    result = []
    for k in range(k_min, k_max + 1):
        estimator = KMeans(n_clusters=k)
        estimator.set_params(**algorithm_parameters)
        estimator.fit(X)
        magnitude = pandas.DataFrame(
            _extract_magnitude(X, estimator.labels_,
                               estimator.cluster_centers_,
                               distance_function))
        result.append({"k": k, "magnitude": magnitude["distance"].sum()})
    pandas.DataFrame(result).plot("k", "magnitude", kind="scatter", ax=ax,
                                  **kwargs)
    pyplot.xticks(range(max(0, k_min - 1), k_max + 2), rotation=0)
    ax.set_xlabel("Number of clusters")
    ax.set_ylabel("Total Point-to-Centroid Distance")
    ax.set_title("Loss vs Clusters Used")
    return ax
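# Usage sketch for the elbow plot above, assuming the module-level imports it
# relies on (numpy, pandas, pyplot, KMeans, _extract_magnitude) are in place;
# a plain Euclidean distance from scipy serves as the distance_function.
from scipy.spatial import distance
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
ax = plot_loss_vs_cluster_number(X_demo, k_min=2, k_max=8,
                                 distance_function=distance.euclidean)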
class CodeBook(BaseEstimator, ClusterMixin, TransformerMixin):
    """Code book creation and manipulation for Bag-of-(visual)-Features.

    Parameters
    ----------
    n_words : int, optional, default: 36
        The number of clusters to form as well as the number of words
        (centroids) to generate.

    cluster_core : sklearn.cluster, default: KMeans
        Clustering technique used to quantize the feature space to
        generate the code book.
        #TODO: its default should be described by _default_clustering()

    max_iter : int, default: 300
        Maximum number of iterations of the k-means algorithm for a
        single run.

    n_init : int, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random' or an ndarray}
        Method for initialization, defaults to 'k-means++':
        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.
        'random': choose k observations (rows) at random from data for
        the initial centroids.
        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).
        'auto' : do not precompute distances if
        n_samples * n_words > 12 million. This corresponds to about
        100MB overhead per job using double precision.
        True : always precompute distances
        False : never precompute distances

    tol : float, default: 1e-4
        Relative tolerance with regards to inertia to declare convergence

    n_jobs : int
        The number of jobs to use for the computation. This works by
        computing each of the n_init runs in parallel.
        If -1 all CPUs are used. If 1 is given, no parallel computing
        code is used at all, which is useful for debugging. For n_jobs
        below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2,
        all CPUs but one are used.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    verbose : int, default 0
        Verbosity mode.

    copy_x : boolean, default True
        When pre-computing distances it is more numerically accurate to
        center the data first. If copy_x is True, then the original data
        is not modified. If False, the original data is modified, and put
        back before the function returns, but small numerical differences
        may be introduced by subtracting and then adding the data mean.

    Attributes
    ----------
    cook_book_ : array, [n_words, n_features]
        Coordinates of cluster centers

    labels_ :
        Labels of each point

    inertia_ : float
        Sum of distances of samples to their closest cluster center.

    Notes
    -----
    The k-means problem is solved using Lloyd's algorithm.
    The average complexity is given by O(k n T), where n is the number of
    samples and T is the number of iterations.
    The worst case complexity is given by O(n^(k+2/p)) with
    n = n_samples, p = n_features.
    (D. Arthur and S. Vassilvitskii, 'How slow is the k-means method?'
    SoCG2006)
    In practice, the k-means algorithm is very fast (one of the fastest
    clustering algorithms available), but it falls in local minima.
    That's why it can be useful to restart it several times.

    See also
    --------
    dictionary_code
    """
    #TODO: test n_words default = 36
    #TODO: do these parameters make sense: max_iter, n_init
    #TODO: change cluster_core from cluster_core=None to
    #      cluster_core=_default_cluster(), making all the appropriate
    #      changes. Check that BaseEstimator asks for strict declaration
    #
    # def _default_cluster(self, n_words=36,
    #                      init='k-means++', n_init=10, max_iter=300,
    #                      tol=1e-4, precompute_distances='auto',
    #                      verbose=0, random_state=None, copy_x=True,
    #                      n_jobs=1):
    #     """Default space clustering strategy to determine the code book"""
    #     from sklearn.cluster import KMeans
    #     return KMeans(n_clusters=n_words, ...)
    #
    # Then self.set_params can also be used to set up the parameters for
    # the current classification methodology

    def __init__(self, n_words=36, cluster_core=None, init='k-means++',
                 n_init=10, max_iter=300, tol=1e-4,
                 precompute_distances='auto', verbose=0, random_state=None,
                 copy_x=True, n_jobs=1):
        if hasattr(init, '__array__'):
            n_words = init.shape[0]
            init = np.asarray(init, dtype=np.float64)

        self.n_words = n_words
        self.cluster_core_name = cluster_core
        self.init = init
        self.max_iter = max_iter
        self.tol = tol
        self.precompute_distances = precompute_distances
        self.n_init = n_init
        self.verbose = verbose
        self.random_state = random_state
        self.copy_x = copy_x
        self.n_jobs = n_jobs

        if self.cluster_core_name == 'random-words':
            self.n_init = 1
            self.max_iter = 1
            print('The number of iterations and tries has been fixed to 1.')

        if ((self.cluster_core_name is None) or
                (self.cluster_core_name == 'random-words')):
            from sklearn.cluster import KMeans
            self.cluster_core = KMeans(
                n_clusters=self.n_words,
                init=self.init,
                max_iter=self.max_iter,
                tol=self.tol,
                precompute_distances=self.precompute_distances,
                n_init=self.n_init,
                verbose=self.verbose,
                random_state=self.random_state,
                copy_x=self.copy_x,
                n_jobs=self.n_jobs)

    def _check_fit_data(self, X):
        """Verify that the number of samples given is larger than n_words"""
        X = check_array(X, accept_sparse='csr', dtype=np.float64)
        if X.shape[0] < self.n_words:
            raise ValueError("n_samples=%d should be >= n_words=%d" % (
                X.shape[0], self.n_words))
        return X

    def _check_test_data(self, X):
        X = check_array(X, accept_sparse='csr')
        n_samples, n_features = X.shape
        expected_n_features = self.cook_book_.shape[1]
        if not n_features == expected_n_features:
            raise ValueError("Incorrect number of features. "
                             "Got %d features, expected %d" % (
                                 n_features, expected_n_features))
        if X.dtype.kind != 'f':
            warnings.warn("Got data type %s, converted to float "
                          "to avoid overflows" % X.dtype,
                          RuntimeWarning, stacklevel=2)
            X = X.astype(np.float64)
        return X

    def fit(self, X, y=None):
        """Compute the clustering of the space.

        #TODO: right now this only works for KMeans; a dispatcher is
        needed so that other clustering strategies can be called
        interchangeably

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
        """
        self.cluster_core = self.cluster_core.fit(X, y)
        return self

    def fit_predict(self, X, y=None):
        """Compute cluster centers and predict cluster index for each sample.

        Convenience method; equivalent to calling fit(X) followed by
        predict(X).
        """
        #return self.fit(X).labels_
        raise NotImplementedError

    def fit_transform(self, X, y=None):
        """Compute clustering and transform X to cluster-distance space.

        Equivalent to fit(X).transform(X), but more efficiently implemented.
        """
        # Currently, this just skips a copy of the data if it is not in
        # np.array or CSR format already.
        # XXX This skips _check_test_data, which may change the dtype;
        # we should refactor the input validation.
        #
        # X = self._check_fit_data(X)
        # return self.fit(X)._transform(X)
        raise NotImplementedError

    def transform(self, X, y=None):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster
        centers. Note that even if X is sparse, the array returned by
        `transform` will typically be dense.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to transform.

        Returns
        -------
        X_new : array, shape [n_samples, k]
            X transformed in the new space.
        """
        # check_is_fitted(self, 'cook_book_')
        # X = self._check_test_data(X)
        # return self._transform(X)
        raise NotImplementedError

    def _transform(self, X):
        """Guts of the transform method; no input validation."""
        # return euclidean_distances(X, self.cook_book_)
        raise NotImplementedError

    def predict(self, X):
        """Predict the index of the closest word within the code book.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            New data to predict.

        Returns
        -------
        labels : array, shape [n_samples,]
            Index of the closest word within the code book.
        """
        return self.cluster_core.predict(X)

    def get_dictionary(self):
        """Retrieve the words forming the code book.

        Returns
        -------
        dictionary : array, shape [n_words, n_features]
            Code book elements (words of the dictionary) represented in
            the feature space
        """
        #TODO: check that the code book is fitted
        return self.cluster_core.cluster_centers_

    def get_BoF_descriptor(self, X):
        # norm = lambda x: x.astype(float)/np.linalg.norm(x)
        # return norm(np.bincount(self.predict(X)))
        return np.histogram(self.predict(X), bins=range(self.n_words + 1),
                            density=True)

    def get_BoF_pramide_descriptor(self, X):
        """Split the image (or volume) in a pyramid manner and get a
        descriptor for each level (and part). Concatenate the output.

        TODO: build proper documentation
        """
        def split_data_by2(X):
            # TODO: rewrite this in a nicer manner that uses len(X.shape)
            # TODO: this can raise an ERROR if the length of X is odd
            parts = [np.split(x, 2, axis=2)
                     for x in [np.split(x, 2, axis=1)
                               for x in np.split(X, 2, axis=0)]]
            return parts

        def get_occurrences(X):
            return np.histogram(X, bins=range(self.n_words + 1))

        def build_piramide(X, level=2):
            if level == 0:
                return get_occurrences(X)
            else:
                return ([get_occurrences(X)] +
                        [build_piramide(Xpart, level - 1)
                         for Xpart in split_data_by2(X)])

        return build_piramide(self.predict(X))

    def get_params(self, deep=True):
        return self.cluster_core.get_params()

    def set_params(self, **params):
        self.cluster_core.set_params(**params)
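# Hedged usage sketch for CodeBook: quantize synthetic "descriptors" into a
# 16-word vocabulary and pull a bag-of-features histogram. Note the class
# forwards n_jobs/precompute_distances to KMeans, so this assumes an older
# scikit-learn release where KMeans still accepts those parameters.
import numpy as np

rng = np.random.RandomState(42)
descriptors = rng.rand(500, 64)  # stand-in for e.g. SIFT descriptors

book = CodeBook(n_words=16, random_state=42)
book.fit(descriptors)
hist, bin_edges = book.get_BoF_descriptor(descriptors)
print(hist.shape)  # (16,) normalized word frequencies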
def _sample(self, X, y):
    """Resample the dataset.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Matrix containing the data which have to be sampled.

    y : ndarray, shape (n_samples, )
        Corresponding label for each sample in X.

    Returns
    -------
    X_resampled : ndarray, shape (n_samples_new, n_features)
        The array containing the resampled data.

    y_resampled : ndarray, shape (n_samples_new)
        The corresponding label of `X_resampled`
    """
    random_state = check_random_state(self.random_state)

    # Compute the number of clusters needed
    if self.ratio == 'auto':
        num_samples = self.stats_c_[self.min_c_]
    else:
        num_samples = int(self.stats_c_[self.min_c_] / self.ratio)

    # Create the clustering object
    kmeans = KMeans(n_clusters=num_samples, random_state=random_state)
    kmeans.set_params(**self.kwargs)

    # Start with the minority class
    X_min = X[y == self.min_c_]
    y_min = y[y == self.min_c_]

    # All the minority class samples will be preserved
    X_resampled = X_min.copy()
    y_resampled = y_min.copy()

    # Loop over the other classes, under-sampling each via its centroids
    for key in self.stats_c_.keys():
        # Skip the minority class
        if key == self.min_c_:
            continue

        # Find the centroids via k-means
        kmeans.fit(X[y == key])
        centroids = kmeans.cluster_centers_

        # Concatenate to the minority class
        X_resampled = np.concatenate((X_resampled, centroids), axis=0)
        y_resampled = np.concatenate((y_resampled,
                                      np.array([key] * num_samples)),
                                     axis=0)

    self.logger.info('Under-sampling performed: %s', Counter(y_resampled))

    return X_resampled, y_resampled
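# The core idea of the resampler above, in isolation: compress the majority
# class down to n k-means centroids so it matches the minority class size
# (a self-contained sketch with synthetic data, not the class's own API).
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X_majority = rng.normal(size=(500, 2))  # pretend majority-class samples
n = 50                                  # pretend minority-class size

km = KMeans(n_clusters=n, random_state=0).fit(X_majority)
X_majority_reduced = km.cluster_centers_  # 50 synthetic "samples"
print(X_majority_reduced.shape)           # (50, 2)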
class ClusterCentroids(BaseUnderSampler):
    """Perform under-sampling by generating centroids based on clustering
    methods.

    Method that under-samples the majority class by replacing a cluster of
    majority samples with the cluster centroid of a KMeans algorithm. This
    algorithm keeps N majority samples by fitting the KMeans algorithm with
    N clusters to the majority class and using the coordinates of the N
    cluster centroids as the new majority samples.

    Read more in the :ref:`User Guide <cluster_centroids>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    estimator : object, optional(default=KMeans())
        Pass a :class:`sklearn.cluster.KMeans` estimator.

    voting : str, optional (default='auto')
        Voting strategy to generate the new samples:

        - If ``'hard'``, the nearest-neighbors of the centroids found
          using the clustering algorithm will be used.
        - If ``'soft'``, the centroids found by the clustering algorithm
          will be used.
        - If ``'auto'``, if the input is sparse, it will default on
          ``'hard'`` otherwise, ``'soft'`` will be used.

        .. versionadded:: 0.3.0

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    ratio : str, dict, or callable
        .. deprecated:: 0.4
           Use the parameter ``sampling_strategy`` instead. It will be
           removed in 0.6.

    Notes
    -----
    Supports multi-class resampling by sampling each class independently.

    Examples
    --------

    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.under_sampling import \
ClusterCentroids # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> cc = ClusterCentroids(random_state=42)
    >>> X_res, y_res = cc.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    ... # doctest: +ELLIPSIS
    Resampled dataset shape Counter({{...}})

    """

    def __init__(self, sampling_strategy='auto', random_state=None,
                 estimator=None, voting='auto', n_jobs=1, ratio=None):
        super(ClusterCentroids, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Private function to create the KMeans estimator"""
        if self.estimator is None:
            self.estimator_ = KMeans(
                random_state=self.random_state, n_jobs=self.n_jobs)
        elif isinstance(self.estimator, KMeans):
            self.estimator_ = clone(self.estimator)
        else:
            raise ValueError('`estimator` has to be a KMeans clustering.'
                             ' Got {} instead.'.format(type(self.estimator)))

    def _generate_sample(self, X, y, centroids, target_class):
        if self.voting_ == 'hard':
            nearest_neighbors = NearestNeighbors(n_neighbors=1)
            nearest_neighbors.fit(X, y)
            indices = nearest_neighbors.kneighbors(
                centroids, return_distance=False)
            X_new = safe_indexing(X, np.squeeze(indices))
        else:
            if sparse.issparse(X):
                X_new = sparse.csr_matrix(centroids, dtype=X.dtype)
            else:
                X_new = centroids
        y_new = np.array([target_class] * centroids.shape[0], dtype=y.dtype)
        return X_new, y_new

    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND,
                                                    self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)
def Mahalanobis_stratified_CV(data, y, pca_var, nfolds):
    import numpy as np
    y = np.array(y)

    # Get numerical and categorical column indices
    def IsNumeric(data):
        data_types = data.dtypes
        index = 0
        for i in data_types:
            if (i == "int64" or i == "float64"):
                data_types[index] = True
            else:
                data_types[index] = False
            index += 1
        return data_types

    # Obtain a centered/scaled matrix with numerical values
    # n factor levels -> (0, ..., n-1)
    numerical_data = np.empty(shape=data.shape)
    data_type_index = IsNumeric(data)
    for i in range(0, data.shape[1]):
        if data_type_index[i]:
            numerical_data[:, i] = data.iloc[:, i]
        else:
            levels = data.iloc[:, i].unique()
            dummy_value = 0
            for j in levels:
                index = np.where(data.iloc[:, i] == j)
                numerical_data[index, i] = dummy_value
                dummy_value += 1
    numerical_data = (numerical_data - np.mean(
        numerical_data, axis=0)) / np.std(numerical_data, axis=0)

    # PCA decomposition
    from sklearn.decomposition import PCA
    pca = PCA(n_components=pca_var, svd_solver='full')
    numerical_data_PCA = pca.fit_transform(numerical_data)
    numerical_data_PCA = (numerical_data_PCA - np.mean(
        numerical_data_PCA, axis=0)) / np.std(numerical_data_PCA, axis=0)

    from sklearn.cluster import KMeans
    import multiprocessing

    # Choose the best number of clusters based on the best silhouette
    # score, subject to: minimum cluster group size >= nfolds
    kmeans = KMeans()
    clusters = kmeans.fit_predict(numerical_data)
    ncore = max(multiprocessing.cpu_count() - 2, 1)

    def min_cluster_size(n_clusters, kmeans, X):
        from sklearn.metrics import silhouette_score
        kmeans.set_params(n_clusters=n_clusters, n_jobs=ncore, n_init=30)
        clusters = kmeans.fit_predict(X)
        score = silhouette_score(X, clusters)
        ids = np.unique(clusters)
        min_size = np.empty(shape=len(ids))
        count = 0
        for i in ids:
            min_size[count] = np.shape(np.where(clusters == i))[1]
            count += 1
        min_size = min_size.astype(int)
        return [score, np.min(min_size)]

    results = np.empty(shape=(nfolds, 2))
    for i in range(2, nfolds + 2):
        results[(i - 2), :] = min_cluster_size(i, kmeans, numerical_data_PCA)

    threshold = int(len(y) / nfolds)
    accepted_n_clusters = np.where(results[:, 1] >= threshold)[0]
    if len(accepted_n_clusters) == 0:
        accepted_n_clusters = np.array(list(range(2, nfolds + 2)))
    best_accepted_n_clusters = int(accepted_n_clusters[np.argsort(
        -results[accepted_n_clusters, 0])[0]]) + 2
    kmeans.set_params(n_clusters=best_accepted_n_clusters, n_jobs=ncore)
    clusters = kmeans.fit_predict(numerical_data_PCA)

    # Quantile-based k-fold stratification
    def stratified_sample(y, nfolds, index):
        y = np.array(y[index])
        n = max(int(len(y) / nfolds), 1)
        q = np.linspace(0, 1, num=(n + 1))
        q = np.quantile(y, q)
        q[0] = q[0] - 1
        q[len(q) - 1] = q[len(q) - 1] + 1
        out = [[] for i in range(nfolds)]
        for i in range(0, n):
            index_temp = np.where((y >= q[i]) & (y < q[i + 1]))[0]
            index_temp = index[0][index_temp]
            np.random.shuffle(index_temp)
            folds = np.array_split(index_temp, nfolds)
            for j in range(0, nfolds):
                out[j].append(folds[j])
        for i in range(0, nfolds):
            out[i] = np.concatenate(out[i])
        return out

    # Fuse two lists of indices together
    def fuse(f1, f2):
        f3 = []
        for i in range(0, len(f1)):
            f3.append(np.concatenate([f1[i], f2[i]]))
        return np.array(f3)

    # Initiate the fold list, then fill it sequentially by looping through
    # the clusters
    folds = np.array([[] for i in range(0, nfolds)])
    for i in range(0, best_accepted_n_clusters):
        index = np.where(clusters == i)
        folds = fuse(folds, stratified_sample(y, nfolds, index))
    np.random.shuffle(folds)
    for i in range(0, nfolds):
        folds[i] = folds[i].astype(int)

    y_vecs = []
    for i in folds:
        y_vecs.append(y[i])

    metrics = np.empty(shape=(len(y_vecs), 4))
    count = 0
    from scipy.stats import kurtosis
    from scipy.stats import skew
    for i in y_vecs:
        metrics[count, 0] = np.mean(np.array(i))
        metrics[count, 1] = np.std(np.array(i))
        metrics[count, 2] = skew(np.array(i))
        metrics[count, 3] = kurtosis(np.array(i))
        count += 1
    metrics_final = np.empty(shape=4)
    for i in range(0, 4):
        metrics_final[i] = np.std(metrics[:, i]) / np.mean(metrics[:, i])
    print("Coefficient of variation (MEAN): " + str(round(metrics_final[0], 5)))
    print("Coefficient of variation (SD): " + str(round(metrics_final[1], 5)))
    print("Coefficient of variation (SKEW): " + str(round(metrics_final[2], 5)))
    print("Coefficient of variation (KURT): " + str(round(metrics_final[3], 5)))
    return folds
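# Usage sketch for the fold generator above: a tiny synthetic frame and a
# manual CV loop over the returned index arrays. This assumes an older
# scikit-learn where KMeans still accepts n_jobs (which the function passes);
# the estimator lines are left commented since none is implied here.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({"x1": rng.normal(size=200),
                   "x2": rng.normal(size=200),
                   "cat": rng.choice(["a", "b"], size=200)})
y_demo = rng.normal(size=200)

folds = Mahalanobis_stratified_CV(df, y_demo, pca_var=0.95, nfolds=5)
for k in range(len(folds)):
    test_idx = folds[k]
    train_idx = np.concatenate([folds[j] for j in range(len(folds)) if j != k])
    # model.fit(df.iloc[train_idx], y_demo[train_idx])
    # model.score(df.iloc[test_idx], y_demo[test_idx])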
class DataCluster():
    def __init__(self, nCluster, minDist, nQuatCluster, minQuatDist):
        print('Init DataCluster.')
        self.set_params(nCluster, minDist, nQuatCluster, minQuatDist)

    def set_params(self, nCluster, minDist, nQuatCluster, minQuatDist):
        self.nCluster = nCluster
        self.fMinDist = minDist
        self.nQuatCluster = nQuatCluster
        self.fMinQuatDist = minQuatDist
        self.ml = KMeans(n_clusters=nCluster, max_iter=300, n_jobs=6)

    def readData(self):
        print('Read data manually.')
        data_start = 0
        data_finish = 1000  # 'end'
        model = 'bed'
        subject = 'sub6_shaver'
        print('Starting to convert data!')
        self.runData = dr.DataReader(subject=subject, data_start=data_start,
                                     data_finish=data_finish, model=model)
        #dr_obs = dr.DataReader(subject=subject, data_start=data_start,
        #                       data_finish=data_finish, model=model)
        #self.runData = dr_obs.get_raw_data(self)

    def mat_to_pos_quat(self, raw_data):
        raw_pos = np.zeros((len(raw_data), 3))
        raw_quat = np.zeros((len(raw_data), 4))

        #-----------------------------------------------------------#
        ## Decompose data into pos,quat pairs
        for i in range(len(raw_data)):
            raw_pos[i, :] = np.array([raw_data[i][0, 3], raw_data[i][1, 3],
                                      raw_data[i][2, 3]])
            # order should be xyzw because ROS uses xyzw order.
            raw_quat[i, :] = tft.quaternion_from_matrix(raw_data[i])

        return raw_pos, raw_quat

    def pos_clustering(self, raw_pos):
        while True:
            dict_params = {}
            dict_params['n_clusters'] = self.nCluster
            self.ml.set_params(**dict_params)
            self.ml.fit(raw_pos)

            # co-distance matrix
            bReFit = False
            co_pos_mat = np.zeros((self.nCluster, self.nCluster))
            for i in range(self.nCluster):
                # For refitting
                if bReFit == True:
                    break
                for j in range(i, self.nCluster):
                    if i == j:
                        co_pos_mat[i, j] = 1000000  # to avoid minimum check
                        continue
                    co_pos_mat[i, j] = co_pos_mat[j, i] = np.linalg.norm(
                        self.ml.cluster_centers_[i] -
                        self.ml.cluster_centers_[j])
                    if co_pos_mat[i, j] < self.fMinDist:
                        bReFit = True
                        break

            if bReFit == True:
                self.nCluster -= 1
                print("New # of clusters: ", self.nCluster)
                continue
            else:
                break

        raw_pos_index = self.ml.fit_predict(raw_pos)
        return raw_pos_index  # Return a list of clustered indices.

    def grouping(self, raw_data):
        print('Start clustering.')
        print(raw_data.shape)

        #-----------------------------------------------------------#
        ## Initialization
        raw_pos, raw_quat = self.mat_to_pos_quat(raw_data)

        #-----------------------------------------------------------#
        ## K-means clustering by position
        raw_pos_index = self.pos_clustering(raw_pos)
        return raw_pos_index

    def clustering(self, raw_data):
        print('Start clustering.')
        print(raw_data.shape)

        #-----------------------------------------------------------#
        ## Initialization
        raw_pos, raw_quat = self.mat_to_pos_quat(raw_data)

        #-----------------------------------------------------------#
        ## K-means clustering by position
        raw_pos_index = self.pos_clustering(raw_pos)

        pos_clustered_group = []
        for i in range(self.nCluster):
            raw_group = []
            for j in range(len(raw_data)):
                if raw_pos_index[j] == i:
                    if len(raw_group) == 0:
                        raw_group = np.array(
                            [np.hstack([raw_pos[j], raw_quat[j]])])
                    else:
                        raw_group = np.vstack(
                            [raw_group, np.hstack([raw_pos[j], raw_quat[j]])])
            pos_clustered_group.append(raw_group)
        print("Number of pos groups: ", len(pos_clustered_group))

        #-----------------------------------------------------------#
        ## Grouping by orientation
        clustered_group = []
        for group in pos_clustered_group:
            # samples
            X = group[:, 3:]
            ## print("Total X: ", X.shape[0], len(X))

            # Clustering parameters
            nQuatCluster = self.nQuatCluster
            kmsample = nQuatCluster  # 0: random centres, > 0: kmeanssample
            kmdelta = .001
            kmiter = 10
            metric = "quaternion"  # "chebyshev" = max, "cityblock" L1, Lqmetric

            # the number of clusters should be smaller than the number of
            # samples
            if nQuatCluster > len(X):
                nQuatCluster = len(X)
                kmsample = len(X)

            # Clustering
            while True:
                centres, xtoc, dist = km.kmeanssample(
                    X, nQuatCluster, nsample=kmsample, delta=kmdelta,
                    maxiter=kmiter, metric=metric, verbose=0)

                # co-distance matrix
                bReFit = False
                co_pos_mat = np.zeros((nQuatCluster, nQuatCluster))
                for i in range(nQuatCluster):
                    # For refitting
                    if bReFit == True:
                        break
                    for j in range(i, nQuatCluster):
                        if i == j:
                            co_pos_mat[i, j] = 1000000  # to avoid minimum check
                            continue
                        co_pos_mat[i, j] = co_pos_mat[j, i] = ut.quat_angle(
                            centres[i], centres[j])
                        if co_pos_mat[i, j] < self.fMinQuatDist:
                            bReFit = True
                            break

                if bReFit == True:
                    nQuatCluster -= 1
                    ## print("New # of clusters ", nQuatCluster, " in a sub group ")
                    continue
                else:
                    break

            for i in range(nQuatCluster):
                raw_group = []
                for j in range(len(group)):
                    if xtoc[j] == i:
                        if len(raw_group) == 0:
                            raw_group = np.array([group[j, :]])
                        else:
                            raw_group = np.vstack([raw_group, group[j, :]])
                clustered_group.append(raw_group)
        print("Number of pos+quat groups: ", len(clustered_group))

        #-----------------------------------------------------------#
        ## Averaging
        avg_clustered_data = []
        num_clustered_data = []
        count = 0
        for i, g in enumerate(clustered_group):
            if len(g) == 0:
                continue
            count += len(g)
            ## print("Number of sub samples: ", len(g))

            # Position
            pos_sum = np.array([0., 0., 0.])
            for j, s in enumerate(g):
                pos_sum += s[0:3]
                if j == 0:
                    quat_array = np.array([s[3:]])
                else:
                    quat_array = np.vstack([quat_array, s[3:]])
            pos_avg = pos_sum / float(len(g))

            # Quaternion
            quat_avg = qt.quat_avg(quat_array)

            avg_clustered_data.append([pos_avg, quat_avg])
            num_clustered_data.append([len(g)])
        ## print("total: ", count)

        # Reshape the pairs into transformation matrices
        for i, g in enumerate(avg_clustered_data):
            mat = tft.quaternion_matrix(g[1])
            mat[0, 3] = g[0][0]
            mat[1, 3] = g[0][1]
            mat[2, 3] = g[0][2]
            if i == 0:
                clustered_data = np.array([mat])
            else:
                clustered_data = np.vstack([clustered_data, np.array([mat])])

        print("Final clustered data: ", clustered_data.shape,
              len(num_clustered_data))
        return clustered_data, num_clustered_data, len(pos_clustered_group)

    # X is a set of quaternions
    def q_image_axis_angle(self, X):
        print("Number of data: ", X.shape[0])
        angle_array = np.zeros((X.shape[0], 1))
        direc_array = np.zeros((X.shape[0], 3))
        for i in range(len(X)):
            angle, direc = qt.quat_to_angle_and_axis(X[i, :])
            angle_array[i, 0] = angle
            direc_array[i, :] = direc

        # Convert angles from radians to degrees
        angle_array = (angle_array) / np.pi * 180.0

        # matplotlib setup
        fig = plt.figure(figsize=(12, 12))
        plt.rc('text', usetex=True)
        plt.rc('font', family='serif')
        ax = fig.add_subplot(111, projection='3d')

        # Plot a sphere
        r = 0.999
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ax.plot_surface(x * r, y * r, z * r, rstride=1, cstride=1, color='c',
                        alpha=0.4, linewidth=0)

        # Plot quaternions
        cmap = plt.cm.hsv
        sc = ax.scatter(direc_array[:, 0], direc_array[:, 1],
                        direc_array[:, 2], c=angle_array, cmap=cmap,
                        vmin=-180.0, vmax=180.0, s=100)  # edgecolor='none'
        cbar = plt.colorbar(sc, ticks=np.arange(-180, 180 + 30, 30))
        ax.set_aspect("equal")
        ax.set_xlim([-1.0, 1.0])
        ax.set_ylim([-1.0, 1.0])
        ax.set_zlim([-1.0, 1.0])
        font_dict = {'fontsize': 30, 'family': 'serif'}
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20, 80)
        plt.ion()
        plt.show()
        #ax.mouse_init()
        ut.get_keystroke('Hit a key to proceed next')
        return

    # X is a set of quaternions
    # Y is a set of labels
    def q_image_axis_cluster(self, X, Y):
        print("Number of data: ", X.shape[0])
        angle_array = np.zeros((X.shape[0], 1))
        direc_array = np.zeros((X.shape[0], 3))
        for i in range(len(X)):
            angle, direc = qt.quat_to_angle_and_axis(X[i, :])
            angle_array[i, 0] = angle
            direc_array[i, :] = direc

        # Convert angles from radians to degrees
        angle_array = (angle_array) / np.pi * 180.0

        # Normalize labels
        max_label = float(np.max(Y))
        fY = np.zeros((len(Y), 1))
        if max_label != 0:
            for i in range(len(Y)):
                fY[i] = float(Y[i]) / max_label

        # matplotlib setup
        fig = plt.figure(figsize=(24, 12))
        plt.rc('text', usetex=True)
        plt.rc('font', family='serif')

        #-------------- matplot 1 --------------
        ax = fig.add_subplot(121, projection='3d')
        font_dict = {'fontsize': 45, 'family': 'serif'}
        ax.set_title("QuTEM distribution", fontdict=font_dict)

        # Plot a sphere
        r = 1.0
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ax.plot_surface(x * r, y * r, z * r, rstride=1, cstride=1, color='c',
                        alpha=0.4, linewidth=0)

        # Plot quaternions
        cmap = plt.cm.hsv
        sc = ax.scatter(direc_array[:, 0], direc_array[:, 1],
                        direc_array[:, 2], c=angle_array, cmap=cmap,
                        vmin=-180.0, vmax=180.0, s=100)  # edgecolor='none'
        cbar = plt.colorbar(sc, ticks=np.arange(-180, 180 + 30, 30))
        ## cbar.set_clim(-180.0, 180.0)
        ax.set_aspect("equal")
        ax.set_xlim([-1.0, 1.0])
        ax.set_ylim([-1.0, 1.0])
        ax.set_zlim([-1.0, 1.0])
        font_dict = {'fontsize': 30, 'family': 'serif'}
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20, 40)

        #-------------- matplot 2 --------------
        ax = fig.add_subplot(122, projection='3d')
        font_dict = {'fontsize': 45, 'family': 'serif'}
        ax.set_title("Clustering", fontdict=font_dict)

        # Plot a sphere
        r = 0.92
        u = np.linspace(0, 2 * np.pi, 120)
        v = np.linspace(0, np.pi, 60)
        x = np.outer(np.cos(u), np.sin(v))
        y = np.outer(np.sin(u), np.sin(v))
        z = np.outer(np.ones(np.size(u)), np.cos(v))
        ## ax.plot_surface(x*r, y*r, z*r, rstride=1, cstride=1, color='c',
        ##                 alpha=1.0, linewidth=0)

        # Plot quaternions
        cmap = plt.cm.jet
        sc = ax.scatter(direc_array[:, 0], direc_array[:, 1],
                        direc_array[:, 2], c=Y, vmin=0, vmax=abs(Y).max(),
                        s=100)
        ## plt.colorbar(sc)
        ax.set_aspect("equal")
        ax.set_xlim([-1.0, 1.0])
        ax.set_ylim([-1.0, 1.0])
        ax.set_zlim([-1.0, 1.0])
        font_dict = {'fontsize': 30, 'family': 'serif'}
        ax.set_xlabel('x', fontdict=font_dict)
        ax.set_ylabel('y', fontdict=font_dict)
        ax.set_zlabel('z', fontdict=font_dict)
        ax.view_init(20, 40)

        plt.ion()
        plt.show()
        #ax.mouse_init()
        ut.get_keystroke('Hit a key to proceed next')
        return

    def test(self, raw_data):
        print('Start clustering.')
        print(raw_data.shape)
        N = 1000

        #-----------------------------------------------------------#
        ## Initialization
        raw_pos = np.zeros((N, 3))
        raw_quat = np.zeros((N, 4))

        #-----------------------------------------------------------#
        ## Decompose data into pos,quat pairs
        for i in range(N):
            raw_pos[i, :] = np.array([0, 0, 0])
        ## raw_quat = qt.quat_random(N)
        quat_mean = np.array([1., 0., 0., 1.5])
        raw_quat = qt.quat_QuTem(quat_mean / np.linalg.norm(quat_mean), N,
                                 [0.03, 0.3, 0.3, 1.0])
        ## quat_mean = np.array([0., 1., 0., -1.5])
        ## raw_quat2 = qt.quat_QuTem(quat_mean/np.linalg.norm(quat_mean),
        ##                           N/2.0, [0.1, 1.0, 0.1, 1.0])
        ## raw_quat = np.vstack([raw_quat1, raw_quat2])
        ## raw_quat1 = np.array([[1., 0., 0., 0.],
        ##                       [1., 0.1, 0., 0.],
        ##                       [1., 0., 0.1, 0.],
        ##                       [1., 0., 0., 0.1],
        ##                       [1., 0.2, 0., 0.],
        ##                       [1., 0., 0.2, 0.],
        ##                       [1., 0., 0., 0.2],
        ##                       [1.1, 0.1, 0., 0.],
        ##                       [1.1, 0., 0.1, 0.],
        ##                       [1.1, 0., 0., 0.1]])
        ## raw_quat2 = np.array([[0., 0., 1., 0.],
        ##                       [0.1, 0., 1.1, 0.],
        ##                       [0.1, 0.1, 1., 0.],
        ##                       [0., 1., 0., 0.],
        ##                       [0.1, 1., 0.1, 0.],
        ##                       [0.1, 1.1, 0., 0.],
        ##                       [0.1, 1., 0.4, 0.],
        ##                       [0.1, 1., 1.1, 0.2],
        ##                       [0.1, 1., 1.4, 0.],
        ##                       [1.1, 1., 0.1, 0.2],
        ##                       [1.1, 1.1, 0.1, 0.1]])
        ## raw_quat = np.vstack([raw_quat1, raw_quat2])
        for i in range(len(raw_quat)):
            raw_quat[i, :] /= np.linalg.norm(raw_quat[i, :])

        #-----------------------------------------------------------#
        pos_clustered_group = []
        raw_group = np.hstack([raw_pos, raw_quat])
        pos_clustered_group.append(raw_group)
        print("Number of pos groups: ", len(pos_clustered_group))

        #-----------------------------------------------------------#
        ## Grouping by orientation
        clustered_group = []
        for group in pos_clustered_group:
            # samples
            X = group[:, 3:]

            # Clustering parameters
            nQuatCluster = self.nQuatCluster
            kmsample = nQuatCluster  # 0: random centres, > 0: kmeanssample
            kmdelta = .001
            kmiter = 10
            metric = "quaternion"  # "chebyshev" = max, "cityblock" L1, Lqmetric

            # the number of clusters should be smaller than the number of
            # samples
            if nQuatCluster > len(X):
                nQuatCluster = len(X)
                kmsample = len(X)

            # Clustering
            while True:
                centres, xtoc, dist = km.kmeanssample(
                    X, nQuatCluster, nsample=kmsample, delta=kmdelta,
                    maxiter=kmiter, metric=metric, verbose=0)

                # co-distance matrix
                bReFit = False
                co_pos_mat = np.zeros((nQuatCluster, nQuatCluster))
                for i in range(nQuatCluster):
                    # For refitting
                    if bReFit == True:
                        break
                    for j in range(i, nQuatCluster):
                        if i == j:
                            co_pos_mat[i, j] = 1000000  # to avoid minimum check
                            continue
                        co_pos_mat[i, j] = co_pos_mat[j, i] = ut.quat_angle(
                            centres[i], centres[j])
                        if co_pos_mat[i, j] < self.fMinQuatDist:
                            bReFit = True
                            break

                if bReFit == True:
                    nQuatCluster -= 1
                    ## print("New # of clusters ", nQuatCluster, " in a sub group ")
                    continue
                else:
                    break

            ## for i in range(nQuatCluster):
            ##     raw_group = []
            ##     for j in range(len(group)):
            ##         if xtoc[j] == i:
            ##             if len(raw_group) == 0:
            ##                 raw_group = np.array([group[j, :]])
            ##             else:
            ##                 raw_group = np.vstack([raw_group, group[j, :]])
            ##     clustered_group.append(raw_group)
        print("Number of pos+quat groups: ", len(clustered_group))
        self.q_image_axis_cluster(X, xtoc)
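# The class above leans on a qt.quat_avg helper for quaternion means. One
# standard way to average unit quaternions is the eigenvector method
# (Markley et al.): take the eigenvector of the summed outer products that
# has the largest eigenvalue. A self-contained sketch, not the project's own
# implementation:
import numpy as np

def average_quaternions(quats):
    """quats: (N, 4) array of unit quaternions in a consistent component order."""
    A = np.zeros((4, 4))
    for q in quats:
        q = q / np.linalg.norm(q)
        A += np.outer(q, q)  # sign-invariant: q and -q contribute equally
    eigvals, eigvecs = np.linalg.eigh(A)
    return eigvecs[:, -1]  # eigenvector of the largest eigenvalue

qs = np.array([[0., 0., 0., 1.], [0.02, 0., 0., 1.]])
print(average_quaternions(qs))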