def sort_bacteria_in_cluster(self):
    """Group the biofilm's bacteria by their OPTICS cluster label.

    ``self.position_matrix`` is transposed before it is handed to OPTICS
    (the original comment describes it as a 3xN matrix, N being the
    number of bacteria).

    Returns:
        A list of lists of bacteria, one inner list per distinct label.
        scikit-learn's OPTICS marks noise samples with ``-1``; that label
        indexes the *last* inner list, so noise ends up grouped there.

    Raises:
        ValueError: If the grouped bacteria do not add up to
            ``len(self.bacteria)``.
    """
    samples = self.position_matrix.transpose()
    model = OPTICS(min_samples=2, metric='euclidean')
    model.fit_predict(samples)
    labels = model.labels_

    # One bucket per distinct label value.
    clusters = [[] for _ in np.unique(labels)]
    for bacterium, label in zip(self.bacteria, labels):
        clusters[label].append(bacterium)

    # Sanity check: every bacterium must have landed in some bucket.
    n_sorted = sum(len(group) for group in clusters)
    n_expected = len(self.bacteria)
    if n_sorted != n_expected:
        raise ValueError(f"{abs(n_sorted - n_expected)} bacteria where not sorted in a cluster.")
    return clusters
def setUp(self):
    """Fit OPTICS on six synthetic 2-D blobs and attach the result to a TBH model."""
    n_points_per_cluster = 250
    np.random.seed(0)

    # (centre, spread) of each blob; the order fixes the sequence of random draws.
    blob_specs = (
        ([-5, -2], .8),
        ([4, -1], .1),
        ([0, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
    blobs = []
    for centre, spread in blob_specs:
        blob = np.zeros((n_points_per_cluster, 3))
        # Column 0 stays zero; only columns 1 and 2 carry sampled coordinates.
        blob[:, 1:3] = centre + spread * np.random.randn(n_points_per_cluster, 2)
        blobs.append(blob)

    # OPTICS is fitted on the two sampled columns only.
    X = np.vstack([blob[:, 1:3] for blob in blobs])
    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(X)

    self.tbhg = modeling.TBH()
    self.tbhg.optics = clust
    self.tbhg.locH = tuple(blobs)
def find_cluster_indices(output_seqs, batch_size, datatype="train_y"):
    """Cluster the output sequences with OPTICS and index them by label.

    Args:
        output_seqs: Sequences to cluster; converted with ``convert_to_array``.
        batch_size: Not used by this function.
        datatype: Tag used in the progress message and the CSV file name.

    Returns:
        A tuple ``(cluster_labels, cluster_indices_dict, scatter_df)`` where
        ``cluster_indices_dict`` maps each label to the positions carrying it
        and ``scatter_df`` pairs every sequence with its label. The frame is
        also written to ``data/generated_files/``.
    """
    print("Clustering {}".format(datatype))
    features = convert_to_array(output_seqs)
    # NOTE(review): DBSCAN is imported here but never used in this function.
    from sklearn.cluster import DBSCAN
    clusterer = OPTICS(min_samples=2, min_cluster_size=2)
    cluster_labels = clusterer.fit_predict(features)
    # The count includes every distinct label value that OPTICS produced.
    print("Number of clusters: {}".format(str(len(set(cluster_labels)))))

    cluster_indices_dict = dict()
    rows = []
    for position, label in enumerate(cluster_labels):
        rows.append((output_seqs[position], label))
        cluster_indices_dict.setdefault(label, list()).append(position)

    scatter_df = pd.DataFrame(rows, columns=["output_seqs", "clusters"])
    csv_path = "data/generated_files/clustered_output_seqs_data_{}.csv".format(datatype)
    scatter_df.to_csv(csv_path)
    return cluster_labels, cluster_indices_dict, scatter_df
def visual(c, X, y):
    """Cluster ``X`` with OPTICS, print evaluation output and plot both groupings.

    Two figures are shown: the samples grouped by the reference labels ``y``
    and the samples grouped by the predicted OPTICS labels.

    Args:
        c: Unused; kept so existing callers keep working.
        X: 2-D array; columns 0 and 1 are plotted.
        y: Reference label per sample.
    """
    from sklearn.cluster import OPTICS

    cluster_object = OPTICS(min_cluster_size=100)
    y_pred = cluster_object.fit_predict(X)

    clusters = np.unique(y_pred)
    print("Cluster Labels")
    print(clusters)
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    # The reference plot must iterate over the reference labels themselves;
    # iterating over predicted ids would drop or mis-group true classes.
    _scatter_by_label(X, y, 'Dataset')
    plt.show()

    plt.figure()
    _scatter_by_label(X, y_pred, 'Cluster')
    plt.show()


def _scatter_by_label(points, labels, title):
    """Draw one labelled scatter series per distinct value in ``labels``."""
    labels = np.asarray(labels)
    for value in np.unique(labels):
        mask = labels == value
        # A label per series gives plt.legend() something to display.
        plt.scatter(points[mask, 0], points[mask, 1], label=str(value))
    plt.title(title)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.legend()
def perform_optics_clustering(data, program_options: Options) -> ClusteredData:
    """Run OPTICS over ``data`` and package the outcome as ``ClusteredData``.

    Every label from 0 up to the largest one becomes a FULL_CLUSTER whose
    centre is the mean of its members (OPTICS does not report centres).
    Each sample labelled ``-1`` is added on its own as an
    UNCLASSIFIED_NODE_CLUSTER.

    Args:
        data: Array of samples; indexed with boolean masks below.
        program_options: Supplies ``OPTICS_MIN_SAMPLES`` and is forwarded to
            every ``Cluster`` and to the ``ClusteredData`` container.

    Returns:
        The populated ``ClusteredData`` instance.
    """
    result = ClusteredData(data, list(), program_options=program_options)

    optics_model = OPTICS(min_samples=program_options.OPTICS_MIN_SAMPLES, n_jobs=-1)
    optics_model.fit(data)
    labels = optics_model.labels_

    for label in range(labels.max() + 1):
        members = data[labels == label]
        result.add_cluster(
            Cluster(cluster_centre=members.mean(axis=0),
                    nodes=members,
                    cluster_type=ClusterType.FULL_CLUSTER,
                    program_options=program_options))

    if labels.min() == -1:
        # Unclassified samples: each one becomes its own single-node cluster.
        for node in data[labels == -1]:
            result.add_unclassified_node(
                Cluster(node, [node],
                        cluster_type=ClusterType.UNCLASSIFIED_NODE_CLUSTER,
                        program_options=program_options))

    return result
def get_clustered_data(data_matrix, clustering_algorithm=model_constants.KMEANS, distance_metric='euclidean', num_clusters=3):
    """Fit the requested clustering estimator on ``data_matrix``.

    Args:
        data_matrix: Samples passed to the estimator's ``fit``.
        clustering_algorithm: Name compared (lower-cased) against the
            ``model_constants`` identifiers; anything unrecognised falls back
            to KMeans.
        distance_metric: Forwarded to the estimators that take a metric or
            affinity argument.
        num_clusters: Forwarded to Birch, AgglomerativeClustering and KMeans.

    Returns:
        ``(labels, fitted_estimator)``.
    """
    algorithm = clustering_algorithm.lower()

    if algorithm == model_constants.AFFINITY_PROP:
        model = AffinityPropagation(affinity=distance_metric)
    elif algorithm == model_constants.DBSCAN:
        model = DBSCAN(metric=distance_metric)
    elif algorithm == model_constants.OPTICS:
        model = OPTICS(metric=distance_metric)
    elif algorithm == model_constants.MEANSHIFT:
        model = MeanShift()
    elif algorithm == model_constants.BIRCH:
        model = Birch(n_clusters=num_clusters)
    elif algorithm == model_constants.AGGLOMERATIVE:
        # NOTE(review): newer scikit-learn releases rename this `affinity`
        # argument to `metric` - confirm against the pinned version.
        model = AgglomerativeClustering(n_clusters=num_clusters, affinity=distance_metric)
    else:
        model = KMeans(n_clusters=num_clusters, random_state=42)

    model.fit(data_matrix)
    return model.labels_, model
def cameras():
    """Render the cameras page with one sample picture per person seen today.

    Today's occurrences are clustered with OPTICS on their ``e1``..``e128``
    columns. For every cluster with at least four members one occurrence is
    drawn at random and its ``human_picture`` path (first path component
    replaced by a leading ``/``) is passed to the template.

    Returns:
        The rendered ``cameras.html`` template.
    """
    conn, cursor = connect()
    try:
        query = "SELECT title, x, y, rstp, F, current_frame FROM cameras"
        cursor.execute(query)
        camera_rows = cursor.fetchall()
        query = "SELECT * FROM occurrences WHERE DATE(`timestamp`)=CURDATE() AND e1 IS NOT NULL"
        cursor.execute(query)
        today_occurrences = cursor.fetchall()
    finally:
        # Release the connection even when a query fails.
        conn.close()

    result_occurrences = []
    model = OPTICS()
    # OPTICS rejects inputs with fewer samples than min_samples, so a quiet
    # day would otherwise turn into a server error; show no pictures instead.
    if len(today_occurrences) >= model.min_samples:
        embeddings = np.array(
            [[row["e%i" % i] for i in range(1, 129)] for row in today_occurrences])
        model.fit(embeddings)
        indices = np.arange(len(today_occurrences))
        for label in range(np.max(model.labels_) + 1):
            person_indices = indices[model.labels_ == label]
            print(person_indices)
            if len(person_indices) < 4:
                continue
            index = np.random.choice(person_indices)
            picture = today_occurrences[index]['human_picture']
            result_occurrences.append('/' + '/'.join(picture.split('/')[1:]))

    print(len(result_occurrences))
    return render_template('cameras.html', cameras=camera_rows, today_occurrences=result_occurrences)
def optics_clustering(principal_components, principal_df):
    """Cluster the principal components with OPTICS and plot each cluster.

    Args:
        principal_components: 2-D array; columns 0 and 1 are plotted.
        principal_df: Frame copied into the result before a ``Segment``
            column with the cluster labels is added.

    Returns:
        The copied frame with the ``Segment`` column and with columns
        ``0``/``1``/``2``/``'y'`` renamed to ``PC1``/``PC2``/``PC3``/``Race``.
    """
    final_df = pd.concat([principal_df], axis=1)

    # NOTE(review): scikit-learn documents `eps` as only used when
    # cluster_method='dbscan'; with the default method it may have no
    # effect - confirm this is intended.
    model = OPTICS(eps=5, min_samples=2)
    yhat = model.fit_predict(principal_components)
    clusters = unique(yhat)
    final_df['Segment'] = model.labels_

    # One scatter series per distinct label.
    for label in clusters:
        row_ix = where(yhat == label)
        xs = principal_components[row_ix, 0]
        ys = principal_components[row_ix, 1]
        plt.scatter(xs, ys, s=75)

    column_names = {0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'}
    final_df.rename(column_names, axis=1, inplace=True)
    print(final_df)

    plt.title("OPTICS Clustering")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=yhat,
                    n_clusters=len(clusters))
    return final_df
def cluster_proteins_by_sim(prot_graph_fname):
    """Cluster a pickled protein graph with OPTICS and save two 2-D scatter plots.

    The pickle is expected to hold ``(nodes, adj_mat)``. The adjacency matrix
    is clustered, then projected once with eGTM and once with t-SNE; each
    projection is coloured by cluster label and written next to the input
    file as ``protein_egtm_clusters.png`` / ``protein_tsne_clusters.png``.

    Args:
        prot_graph_fname: Path of the pickle file to read.
    """
    print('here')
    # NOTE(review): unpickling executes code embedded in the file - only
    # feed this function files from a trusted source.
    with open(prot_graph_fname, 'rb') as fd:
        nodes, adj_mat = pkl.load(fd)

    model = OPTICS(min_cluster_size=5, n_jobs=-1)
    clusters = model.fit_predict(adj_mat)
    print(Counter(clusters))

    out_dir = os.path.dirname(prot_graph_fname)
    # Factories rather than instances, so each transformer is only built
    # once the previous projection has been plotted and saved.
    projections = (
        (eGTM, 'protein_egtm_clusters.png'),
        (lambda: TSNE(n_components=2, n_iter_without_progress=10),
         'protein_tsne_clusters.png'),
    )
    for build_transformer, png_name in projections:
        x, y = build_transformer().fit_transform(adj_mat).T
        cmap = plt.get_cmap('jet', np.max(clusters) + 2)
        cmap.set_under('gray')
        fig, ax = plt.subplots()
        ax.scatter(x, y, c=clusters, s=10, cmap=cmap)
        plt.savefig(os.path.join(out_dir, png_name))
        plt.close()
def optics(params):
    """Cluster a precomputed distance matrix with OPTICS.

    Args:
        params: Mapping whose ``"distance_path"`` entry names a text file
            readable by ``np.loadtxt``.

    Returns:
        The cluster label of every sample as a tuple of ints.
    """
    # Concatenating onto '' keeps the original behaviour of rejecting
    # non-string path values.
    distance_path = '' + params["distance_path"]
    print(distance_path)
    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)

    # The matrix already holds distances, hence metric='precomputed'.
    # NOTE(review): scikit-learn documents `eps` as only used when
    # cluster_method='dbscan' - confirm it has the intended effect here.
    op = OPTICS(eps=0.03, min_samples=10, metric='precomputed')
    print(op)
    op.fit(distance)

    labels = op.labels_
    print(labels, labels.shape)

    # Count distinct labels, leaving out the -1 label when it is present.
    distinct = set(labels)
    no_clusters = len(distinct) - 1 if -1 in distinct else len(distinct)
    print(no_clusters, "no_clusters")

    return tuple(labels.tolist())
def clustering(self, min_cluster_size=5, min_samples=3, eps=1, cpu_threads=-1):
    """Cluster the residues of column ``self.col`` and score the split.

    For every aligned molecule that has a residue in this column with both
    ``start`` and ``end`` set, the two are concatenated into one feature row.
    The rows are clustered with the estimator selected by ``self.method``.

    Sets ``self.mols_and_aa``, ``self.lab`` and, when the labelling qualifies
    (see the note in the body), ``self.sil``, ``self.diam`` and
    ``self.score``; otherwise those three are set to ``None``.

    Args:
        min_cluster_size: Used by HDBSCAN only.
        min_samples: Passed to all three estimators.
        eps: Used by DBSCAN only.
        cpu_threads: ``n_jobs`` for OPTICS and DBSCAN.

    Raises:
        ValueError: If ``self.method`` is not ``'optics'``, ``'hdbscan'`` or
            ``'dbscan'``.
    """
    clust_matr = []
    self.mols_and_aa = []
    # NOTE(review): `alignment` is resolved from the enclosing scope, not
    # from `self` - confirm that is intended.
    for i in range(len(alignment.alignment)):
        num_of_aa = alignment.alignment[i][self.col]
        if num_of_aa == '-':
            continue
        molecule = alignment.molecules[i]
        residue = molecule.amino_acids[num_of_aa]
        if residue.start is None or residue.end is None:
            continue
        self.mols_and_aa.append((molecule, num_of_aa))
        clust_matr.append(np.hstack((residue.start, residue.end)))
    clust_matr = np.array(clust_matr)

    if self.method == 'optics':
        clusterer = OPTICS(metric='euclidean', n_jobs=cpu_threads, min_samples=min_samples)
    elif self.method == 'hdbscan':
        clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=min_cluster_size, min_samples=min_samples)
    elif self.method == 'dbscan':
        clusterer = DBSCAN(metric='euclidean', n_jobs=cpu_threads, eps=eps, min_samples=min_samples)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(f"Unsupported clustering method: {self.method!r}")

    self.lab = clusterer.fit(clust_matr).labels_
    labels = self.lab
    n_noise = list(labels).count(-1)
    n_distinct = len(set(labels))

    # NOTE(review): scoring only happens for exactly two clusters with either
    # no noise or exactly ONE noise sample; two or more noise samples skip
    # scoring. Kept as in the original - confirm this is intended.
    if (n_noise == 0 and n_distinct == 2) or (n_noise == 1 and n_distinct == 3):
        clustered = labels != -1
        self.sil = silhouette_score(clust_matr[clustered], labels[clustered], metric='euclidean')
        # Pairwise Euclidean distances between all rows, via broadcasting.
        diffs = clust_matr[:, np.newaxis, :] - clust_matr[np.newaxis, :, :]
        dist_matr = np.linalg.norm(diffs, axis=-1)
        cluster_ids = [label for label in set(labels) if label != -1]
        # Mean distance between the members of every ordered cluster pair.
        mean_diams_clusters = [
            dist_matr[labels == i].T[labels == j].mean()
            for i in cluster_ids for j in cluster_ids if i != j
        ]
        self.diam = max(mean_diams_clusters)
        self.score = self.sil * self.diam
    else:
        self.sil = None
        self.diam = None
        self.score = None
def plot_bacteria_as_clusters(data: pd.DataFrame, save_path: Path, save_fig: bool = False, time_point=None):
    """Plot bacteria positions in 3-D, first plain and then coloured by OPTICS cluster.

    Args:
        data: Frame with a ``position`` column; each entry is indexed as
            ``entry[time_point][0..2]`` to obtain x, y and z.
        save_path: The clustered plot is written as ``cluster_plot.png`` into
            the parent directory of this path.
        save_fig: Write the clustered plot to disk when true.
        time_point: Index of the time step to plot; defaults to the last one.
    """
    import numpy as np

    if time_point is None:
        time_point = -1  # last recorded time step

    # Build the (N, 3) coordinate array that plotting and clustering use.
    # Indexing the DataFrame itself with `data[:, 0]` is not valid.
    positions = np.array([
        [track[time_point][0], track[time_point][1], track[time_point][2]]
        for track in data['position']
    ]).reshape(-1, 3)

    fig = plt.figure()
    # add_subplot attaches the 3-D axes to the figure; a bare Axes3D(fig)
    # is no longer added automatically by current matplotlib.
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], s=30)
    ax.view_init(azim=200)
    plt.show()

    model = OPTICS(min_samples=2, metric='euclidean')
    model.fit_predict(positions)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(positions[:, 0], positions[:, 1], positions[:, 2], c=model.labels_, s=30)
    ax.view_init(azim=200)

    if save_fig:
        # Save before showing: after plt.show() returns the figure may
        # already be gone, which would produce an empty image.
        fig.savefig(Path(save_path).parent / 'cluster_plot.png')
    plt.show()
    plt.close(fig)
def test_collect_Loactions(self):
    """collect_locations yields the expected clusters for levels 1 to 4."""
    # TODO: mock optics or use namedtuple
    class OPTICS:
        def __init__(self):
            pass

    mockClusters = [
        [1, 7], [8, 15], [16, 20], [0, 25], [32, 40],
        [30, 49], [0, 50], [85, 98], [80, 99], [0, 99],
    ]
    optics = OPTICS()
    optics.cluster_hierarchy_ = mockClusters
    root = util.build_tree(optics)

    def clusters_of(cnodes):
        return [cnode.cluster for cnode in cnodes]

    # Collect all levels up front, then compare level by level.
    actual = [
        clusters_of(inferring.collect_locations(level, root))
        for level in (1, 2, 3, 4)
    ]
    expected = [
        [[0, 50], [80, 99]],
        [[0, 25], [30, 49], [85, 98]],
        [[1, 7], [8, 15], [16, 20], [32, 40]],
        [],
    ]

    for i, (got, want) in enumerate(zip(actual, expected)):
        with self.subTest(i=i):
            self.assertCountEqual(got, want, "i")
def optics_fit_predict(X, min_samples=50, cluster_method='dbscan', eps=2):
    """Fit OPTICS on ``X`` and return one cluster label per sample.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples)
        Data handed to ``OPTICS.fit``.
    min_samples :
        The number of samples in a neighborhood for a point to be
        considered as a core point.
    cluster_method : 'dbscan' by default. Other available: 'xi'
        How labels are extracted from the fitted ordering. With 'dbscan'
        the labels come from a DBSCAN-style cut at ``eps``; with any other
        accepted value the labels OPTICS computed itself are returned.
    eps :
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. Only used for 'dbscan'.

    Returns
    -------
    labels : Prediction/labels
    """
    method = str(cluster_method)
    opt = OPTICS(min_samples=min_samples, cluster_method=method)
    opt.fit(X)

    if method != 'dbscan':
        # Previously the result was always re-extracted with DBSCAN, which
        # silently ignored a request for 'xi'.
        return opt.labels_

    return cluster_optics_dbscan(reachability=opt.reachability_,
                                 core_distances=opt.core_distances_,
                                 ordering=opt.ordering_,
                                 eps=eps)
def fit_model(self):
    '''
    Fit model and save if not pretrained

    When ``self.pretrained`` is set the model is loaded from
    ``trained_models/<method>_model.pkl`` instead. Otherwise the estimator
    matching ``self.method`` is built (and fitted, except for the linkage
    result) and, if ``self.save`` is set, pickled to that same path.

    :return: None
    '''
    if self.show_plots:
        self.elbow_method()

    if self.pretrained:
        # NOTE(review): unpickling executes code embedded in the file - only
        # load model files from a trusted source.
        with open("trained_models/%s_model.pkl" % self.method, "rb") as model_file:
            self.model = pickle.load(model_file)
        return

    if self.method == "kmeans":
        self.model = KMeans(n_clusters=self.n_clusters)
        self.model.fit(self.data)
    elif self.method == "dbscan":
        self.model = DBSCAN(metric=self.metric, eps=0.15)
        self.model.fit(self.data)
    elif self.method == "optics":
        self.model = OPTICS(metric=self.metric)
        self.model.fit(self.data)
    elif self.method == "hierarichal":
        self.model = linkage(self.data, metric=self.metric)

    if self.save:
        model = self.model
        # The context manager guarantees the file is flushed and closed.
        with open("trained_models/%s_model.pkl" % self.method, "wb") as model_file:
            pickle.dump(model, model_file)
def find_pairs(self):
    """
    Uses the OPTICS algorithm to find clusters of similar securities
    within PCA component space. Once cluster labels are assigned, every
    unique pair of securities sharing a cluster is generated.

    Sets ``self.pairs`` (a Series of 2-tuples) and ``self.cluster_labels_``.

    Raises:
        ValueError: If ``self.returns_reduced`` is ``None``.
    """
    if self.returns_reduced is None:
        # Single-line literal: a backslash continuation inside the string
        # would embed the source indentation in the message.
        raise ValueError("returns_reduced not found: must run "
                         ".reduce_PCA() before this function")

    # Initialize and fit OPTICS cluster to PCA components
    clustering = OPTICS()
    clustering.fit(self.components_.T)

    clusters = pd.DataFrame({
        'security': self.securities,
        'cluster': clustering.labels_
    })
    # clusters with label == -1 are 'noise'
    clusters = clusters[clusters['cluster'] != -1]

    # All 2-combinations of the securities inside each cluster, flattened.
    pairs = [
        pair
        for _, members in clusters.groupby('cluster')['security']
        for pair in combinations(members, 2)
    ]
    print(f"Found {len(pairs)} potential pairs")

    self.pairs = pd.Series(pairs)
    self.cluster_labels_ = clustering.labels_
def __init__(self, algorithm: str, n_clusters: int = 5, verbose=False):
    """
    Initialize the classifier

    :param algorithm: The name of the clustering algorithm: "KMeans",
        "OPTICS", or "AgglomerativeClustering_<linkage>" where the part
        after the underscore is used as the linkage method
    :param n_clusters: Number of clusters. Ignored for density based algorithms
    :param verbose: Print more...
    """
    # No data is attached yet.
    self.data = None
    self.verbose = verbose

    if algorithm == "KMeans":
        estimator = KMeans(verbose=verbose, n_clusters=n_clusters)
    elif algorithm.startswith("AgglomerativeClustering"):
        # The name must contain exactly one underscore.
        _, linkage_method = algorithm.split("_")
        estimator = AgglomerativeClustering(linkage=linkage_method,
                                            n_clusters=n_clusters)
    elif algorithm == "OPTICS":
        estimator = OPTICS(min_samples=5)
    else:
        raise Exception(
            "Unsupported clustering type {0}. Use one of {1}".format(
                algorithm, self.supported_algos))

    self.sklearn_clustering = estimator
    self.algorithm = algorithm
    self.count_vectorizer = None
    self.tfidf_transformer = None
def make_autoencoder(data, lr=0.001, enc_dim=100):
    """Build an autoencoder, its encoder, and a cluster-probability head.

    The number of outputs of the probability head equals the number of
    distinct labels OPTICS assigns to the min-max scaled ``data``.
    Only the autoencoder model is compiled here.

    Args:
        data: Samples that are scaled with ``minmax`` and clustered.
        lr: Learning rate of the Adam optimizer.
        enc_dim: Width of the encoding layer.

    Returns:
        ``(autoencoder, encoder, probability_model, fitted_optics)``.
    """
    # Autoencoder: input -> encoding -> reconstruction.
    ae0 = Input(shape=products_shape, name='FeaturesInput')
    encoding_layer = Dense(enc_dim,
                           activation='relu',
                           kernel_initializer=he_normal(1),
                           name='AE_feature_reduction')
    encode = encoding_layer(ae0)
    decoding_layer = Dense(products_shape[0], activation='relu', name='AE_3')
    decode = decoding_layer(encode)

    # inspired by https://www.frontiersin.org/articles/10.3389/fgene.2018.00585/full
    # OPTICS decides how many classes the probability head distinguishes.
    opt = OPTICS()
    opt.fit(minmax.fit_transform(data))
    clusters = len(np.unique(opt.labels_))
    print('Optimal number of cluster:', clusters)

    # Probability head: chance of a product belonging to each found cluster.
    hidden_layer = Dense(enc_dim // 2,
                         activation='relu',
                         kernel_initializer=he_normal(1))
    prob0 = hidden_layer(encode)
    prob1 = BatchNormalization()(prob0)
    output_layer = Dense(clusters, activation='softmax', name='Probability_Product')
    prob = output_layer(prob1)

    autoencoder_ = Model(inputs=ae0, outputs=decode)
    encoder_ = Model(inputs=ae0, outputs=encode)
    p_prob = Model(inputs=ae0, outputs=prob)

    optimizer = Adam(learning_rate=lr)
    autoencoder_.compile(optimizer=optimizer, loss='mae', metrics=['mse'])
    return autoencoder_, encoder_, p_prob, opt
def exploratory_analysis(dataset: str, samples=0.1, eps=np.inf) -> None:
    """Standardise a CSV data set, cluster it with OPTICS and print the cluster count.

    Args:
        dataset: Path of a comma-separated, UTF-8 encoded numeric file.
        samples: Passed to OPTICS as ``min_samples``.
        eps: Passed to OPTICS as ``max_eps``.
    """
    X = np.genfromtxt(dataset, delimiter=',', encoding='utf8')
    scaler = StandardScaler(copy=False)
    X_transformed = scaler.fit_transform(X)

    clust = OPTICS(min_samples=samples, max_eps=eps, n_jobs=2)
    # Cluster the scaler's return value explicitly rather than relying on
    # copy=False having modified X in place.
    labels = clust.fit_predict(X_transformed)

    # -1 marks noise in scikit-learn's OPTICS labels; it is not a cluster.
    n_clusters = len(set(labels) - {-1})
    print("# clusters: {0}".format(n_clusters))
def cluster_embedded_maps_optics(aligned_maps):
    """Cluster maps with OPTICS, using each flattened map as one sample.

    Args:
        aligned_maps: Iterable of arrays exposing ``flatten()``; the
            flattened arrays are stacked row-wise.

    Returns:
        The OPTICS label of every map.
    """
    flattened = [xmap.flatten() for xmap in aligned_maps]
    embedding = np.vstack(flattened).astype(np.float64)

    model = OPTICS()
    model.fit(embedding)
    return model.labels_
def create_clusters(self, min_samples):
    """Cluster the performance features with OPTICS and derive cluster-based pairs.

    The labels are stored in ``self._clusters`` as a Series indexed by
    ``self.performance.columns``; afterwards
    ``self._create_cluster_based_pairs()`` is invoked.

    Args:
        min_samples: Passed to OPTICS as ``min_samples``.
    """
    clustering = OPTICS(min_samples=min_samples).fit(self.performance_features)
    # The share of samples labelled -1 used to be computed here and thrown
    # away; that no-op statement has been removed.
    self._clusters = pd.Series(clustering.labels_,
                               index=self.performance.columns)
    self._create_cluster_based_pairs()
import matplotlib.pyplot as plt

# Generate sample data: six Gaussian blobs given as (centre, spread).
# The generator is consumed left to right, so the random draws happen in
# the order C1..C6.
np.random.seed(0)
n_points_per_cluster = 250
C1, C2, C3, C4, C5, C6 = (
    centre + spread * np.random.randn(n_points_per_cluster, 2)
    for centre, spread in (
        ([-5, -2], .8),
        ([4, -1], .1),
        ([1, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
)
X = np.vstack((C1, C2, C3, C4, C5, C6))

# NOTE(review): `rejection_ratio` and `extract_dbscan` do not appear in the
# released scikit-learn OPTICS API - confirm the library version this
# script targets.
clust = OPTICS(min_samples=9, rejection_ratio=0.5)

# Run the fit
clust.fit(X)

# DBSCAN-style labels at two different thresholds.
_, labels_025 = clust.extract_dbscan(0.25)
_, labels_075 = clust.extract_dbscan(0.75)

# Reachability and labels arranged in OPTICS ordering.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# One wide axes on top, one smaller axes in the bottom-left cell.
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
import numpy as np

# Generate sample data: six Gaussian blobs given as (centre, spread).
# The generator is consumed left to right, so the random draws happen in
# the order C1..C6.
np.random.seed(0)
n_points_per_cluster = 250
C1, C2, C3, C4, C5, C6 = (
    centre + spread * np.random.randn(n_points_per_cluster, 2)
    for centre, spread in (
        ([-5, -2], .8),
        ([4, -1], .1),
        ([1, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# Both DBSCAN-style extractions reuse the same fitted ordering and differ
# only in eps.
_extract_kwargs = dict(reachability=clust.reachability_,
                       core_distances=clust.core_distances_,
                       ordering=clust.ordering_)
labels_050 = cluster_optics_dbscan(eps=0.5, **_extract_kwargs)
labels_200 = cluster_optics_dbscan(eps=2, **_extract_kwargs)

# Reachability and labels arranged in OPTICS ordering.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]