def test_normalize_transform(self):
    """Check that the CLR-transformed OTU table is not equal to the original one."""
    # Directory of the massoc package with its last 7 characters removed and
    # backslashes replaced by forward slashes.
    package_dir = os.path.dirname(massoc.__file__)
    base_path = package_dir[:-7].replace('\\', '/')
    settings = dict(
        biom_file=None,
        cluster=['Affinity'],
        nclust=['4'],
        otu_meta=None,
        otu_table=['otu_bananas.txt'],
        prefix=None,
        sample_data=None,
        split='Rocket Science',
        tax_table=['tax_bananas.txt'],
        name=['test'],
        fp=base_path,
    )
    original = Batch(testbiom, settings)
    transformed = original.normalize_transform(mode="clr")
    # The test passes when the two 'test' tables compare as unequal.
    tables_match = original.otu['test'] == transformed.otu['test']
    self.assertFalse(tables_match)
def generate_cluster_figures(self):
    """Cluster the selected BIOM file and plot the result on the diagnostics canvas.

    The file selected in ``self.file_list`` is loaded, CLR-transformed with
    ``Batch.normalize_transform`` and projected onto two principal
    components. The algorithm selected in ``self.cluster_choice`` is then
    run on that projection:

    * 'K-means', 'Gaussian' and 'Spectral' are fitted with 2, 3 and 4
      clusters; the count with the highest silhouette score is refitted.
    * 'DBSCAN' and 'Affinity' are fitted once with default settings and
      their result is scored when the number of labels allows it.

    Each clustered row gets a ``cluster`` entry in the sample metadata of the
    normalized table, and the projection is drawn on ``self.prev`` with one
    colour per cluster label.

    Returns nothing. Every exception raised while clustering or plotting,
    including the ``ValueError`` raised when the best silhouette score is
    below 0.25, is logged through ``logger.error`` and not re-raised.
    """
    from massoc.scripts.batch import Batch
    from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AffinityPropagation
    from sklearn.mixture import GaussianMixture
    from sklearn.metrics import silhouette_score
    from sklearn.decomposition import PCA
    nums = list(range(2, 5))
    try:
        file_name = self.file_list.GetString(self.file_list.GetSelection())
        name = 'init'
        biomfile = {name: biom.load_table(file_name)}
        algo = self.cluster_choice.GetString(self.cluster_choice.GetSelection())
        inputs = {'biom_file': [file_name], 'cluster': [algo]}
        normbatch = Batch(biomfile, inputs).normalize_transform(mode='clr')
        norm_table = normbatch.otu[name]

        # Defaults used when the selected algorithm matches none of the
        # branches below.
        topscore = 0
        bestcluster = [1] * len(norm_table.ids())

        # Densify and transpose the table, then keep two principal components.
        # Row indices of `data` are later used to index the sample metadata,
        # so rows are presumably samples.
        data = csr_matrix.todense(norm_table.matrix_data)
        data = np.matrix.transpose(data)
        data = PCA(n_components=2).fit_transform(data)

        # Baseline: silhouette score of a random two-label assignment.
        randomclust = np.random.randint(2, size=len(data))
        sh_score = [silhouette_score(data, randomclust)]

        def sweep(predict):
            """Score 2-4 clusters; return the best count and its refitted labels."""
            for count in nums:
                sh_score.append(silhouette_score(data, predict(count)))
            # sh_score[0] is the random baseline and sh_score[k] belongs to
            # k + 1 clusters, so argmax + 1 is the winning cluster count.
            best = int(np.argmax(sh_score) + 1)
            return best, predict(best)

        # Algorithms that need the number of clusters up front.
        swept = {
            'K-means': lambda count: KMeans(count).fit_predict(data),
            'Gaussian': lambda count: GaussianMixture(count).fit(data).predict(data),
            'Spectral': lambda count: SpectralClustering(count).fit_predict(data),
        }
        # Algorithms that determine the number of clusters themselves.
        one_shot = {'DBSCAN': DBSCAN, 'Affinity': AffinityPropagation}

        if algo in swept:
            topscore, bestcluster = sweep(swept[algo])
        elif algo in one_shot:
            # The original compared the whole `inputs['cluster']` list with
            # 'Affinity', so the Affinity Propagation branch could never run.
            bestcluster = one_shot[algo]().fit_predict(data)
            labels = set(bestcluster)
            # Label -1 (DBSCAN's noise label) is not counted as a cluster.
            topscore = len(labels) - (1 if -1 in bestcluster else 0)
            # Score the clustering that was actually found; without this the
            # quality gate below would only ever see the random baseline.
            # silhouette_score needs more than one label and fewer labels
            # than rows.
            if 1 < len(labels) < len(data):
                sh_score.append(silhouette_score(data, bestcluster))

        if max(sh_score) < 0.25:
            raise ValueError("Silhouette score too low: please try a different algorithm. "
                             "Your data may not be suitable for clustering.")

        # NOTE(review): this writes to the private `_sample_metadata` attribute
        # and fails if the table carries no sample metadata - confirm that the
        # input always has it.
        for i in range(topscore):
            mask, = np.where(bestcluster == i)
            for j in mask:
                norm_table._sample_metadata[j]['cluster'] = inputs['cluster'][0] + '_' + str(i)

        x, y = zip(*data)
        # Pass the labels as colours; positionally the third argument of
        # matplotlib's scatter is the marker size. Assumes self.prev is a
        # matplotlib Axes - TODO confirm.
        self.prev.scatter(x, y, c=bestcluster)
        self.canvas1.draw()
    except Exception:
        logger.error("Failed to generate figures. ", exc_info=True)