def dbscan_score_points(self):
    """
    Score every data point against centroids of DBSCAN clusters.

    The positive and negative data are clustered separately with
    ``learning.dbscan`` (``self.eps[0]`` / ``self.min_samples[0]`` for the
    positive data, index 1 for the negative data). Whenever the largest
    label of an assignment is below 2, that data set is re-clustered with
    ``learning.kmeans`` instead.

    :return: A list with one entry per point; each entry is a
        single-element list holding the ``self.proximity_metric`` value
    """
    pos_labels = learning.dbscan(
        self.positive_data, self.eps[0], self.min_samples[0])
    neg_labels = learning.dbscan(
        self.negative_data, self.eps[1], self.min_samples[1])

    # Fall back to k-means when DBSCAN did not yield enough clusters.
    if max(pos_labels) < 2:
        logger.warning("Clustering positive with k-means instead...")
        pos_labels = learning.kmeans(
            self.positive_data, self.k_clusters_positive)
    if max(neg_labels) < 2:
        logger.warning("Clustering negative with k-means instead...")
        neg_labels = learning.kmeans(
            self.negative_data, self.k_clusters_negative)

    pos_centroids = learning.get_centroids(self.positive_data, pos_labels)
    neg_centroids = learning.get_centroids(self.negative_data, neg_labels)

    def score_of(sample):
        # Compare the sample with its nearest centroid on each side.
        nearest_pos = learning.closest_to(sample, pos_centroids)
        nearest_neg = learning.closest_to(sample, neg_centroids)
        return self.proximity_metric(sample, nearest_pos, nearest_neg)

    # NOTE(review): each score is wrapped in its own list, unlike
    # kmeans_score_points which returns a flat array -- confirm callers
    # expect this shape before changing it.
    return [[score_of(self.data_points[idx])]
            for idx in xrange(self.num_points)]
def silhouette_score_points(self, k_clusters=86):
    """
    Scoring function for the silhouette method.

    The points to score are appended to the positive data and, separately,
    to the negative data. Each combined set is clustered with
    ``learning.kmeans`` and passed to ``learning.silhouettes``. A point's
    score is its silhouette within the positive set minus its silhouette
    within the negative set.

    :param k_clusters: Number of k-means clusters used for both combined
        sets; defaults to 86, the value that used to be hard-coded
    :return: A numpy array of scores corresponding to the points
    """
    positive_appended = np.append(
        self.positive_data, self.data_points, axis=0)
    # Must be built from the negative data; using the positive data here
    # (as the code previously did) makes both silhouette sets describe the
    # same samples, so the difference carries no class information.
    negative_appended = np.append(
        self.negative_data, self.data_points, axis=0)

    positive_assignment = learning.kmeans(positive_appended, k_clusters)
    negative_assignment = learning.kmeans(negative_appended, k_clusters)

    # assumes learning.silhouettes returns one value per input row, in
    # row order -- TODO confirm against learning.py
    pos_sils = np.asarray(
        learning.silhouettes(positive_appended, positive_assignment))
    neg_sils = np.asarray(
        learning.silhouettes(negative_appended, negative_assignment))

    # The scored points are the trailing entries. Slice from an explicit
    # start index because ``arr[-0:]`` would select the whole array when
    # there are no points to score.
    pos_start = max(len(pos_sils) - self.num_points, 0)
    neg_start = max(len(neg_sils) - self.num_points, 0)
    scores = np.array(pos_sils[pos_start:] - neg_sils[neg_start:])
    return scores
def kmeans_score_points(self):
    """
    Score every data point against centroids of k-means clusters.

    The positive and negative data are each clustered with
    ``learning.kmeans`` using ``self.k_clusters`` clusters. Every point is
    then passed to ``self.proximity_metric`` together with its closest
    positive and closest negative centroid.

    :return: A numpy array holding one score per point
    """
    pos_labels = learning.kmeans(self.positive_data, self.k_clusters)
    neg_labels = learning.kmeans(self.negative_data, self.k_clusters)

    pos_centroids = learning.get_centroids(self.positive_data, pos_labels)
    neg_centroids = learning.get_centroids(self.negative_data, neg_labels)

    result = np.zeros(self.num_points)
    for idx in xrange(self.num_points):
        sample = self.data_points[idx]
        # Arguments are evaluated left to right: positive centroid first.
        result[idx] = self.proximity_metric(
            sample,
            learning.closest_to(sample, pos_centroids),
            learning.closest_to(sample, neg_centroids))
    return result
def get_assignment(self):
    """
    Compute and store the cluster assignment of the data.

    Clusters with ``learning.dbscan`` when ``self.dbscan`` is set, otherwise
    with ``learning.kmeans`` when ``self.kmeans`` is set. The input is
    ``self.tsne_data`` if ``self.cluster_on_tsne`` is set and
    ``self.features`` otherwise. The result is stored in
    ``self.assignment`` and the number of distinct labels other than -1 in
    ``self.num_clusters``.

    :return: Whatever data is returned by the dbscan/kmeans wrapper
        function in learning.py
    :raises SystemExit: With status 1 if the data ended up in zero
        clusters, including when no clustering method is enabled
    """
    def clustering_input():
        # Only touch the attribute that is actually selected.
        return self.tsne_data if self.cluster_on_tsne else self.features

    if self.dbscan:
        self.assignment = learning.dbscan(
            clustering_input(),
            eps=self.eps,
            min_samples=self.min_samples,
            sort_by_size=self.order_clusters_by_size)
    elif self.kmeans:
        self.assignment = learning.kmeans(
            clustering_input(),
            self.k_clusters,
            sort_by_size=self.order_clusters_by_size)
    else:
        self.assignment = None

    if self.assignment is None:
        # No clustering was performed; set(None) would raise TypeError,
        # so treat this case as zero clusters.
        self.num_clusters = 0
    else:
        # Label -1 is not counted as a cluster (presumably the DBSCAN
        # noise label -- confirm against learning.py).
        self.num_clusters = len(set(self.assignment) - {-1})

    if self.num_clusters == 0:
        logger.warning("Data was assigned to zero clusters. Exiting.")
        # Raise SystemExit directly: the ``exit`` helper is injected by the
        # ``site`` module and is not guaranteed to exist. A non-zero status
        # signals the failure to the calling process.
        raise SystemExit(1)

    logger.debug("Number of clusters: %d", self.num_clusters)
    return self.assignment
ids, data = fileIO.read_feature_file(args.features_file, normalize=True) k_clusters = np.arange(10, 600, 10) k_clusters = np.array(sorted(list(set(k_clusters)))) sil_scores = np.zeros(k_clusters.shape) sil_score_std = np.zeros(k_clusters.shape) num_repeats = 5 for i in xrange(k_clusters.shape[0]): k = k_clusters[i] means = np.zeros(num_repeats) for j in xrange(num_repeats): sils = learning.silhouette_score( data, learning.kmeans(data, k, verbose=True)) means[j] = np.mean(sils) sil_scores[i] = np.mean(means) sil_score_std[i] = np.std(means) fig = plt.figure() ax = plt.subplot(111) ax.plot(k_clusters, sil_scores) ax.errorbar(k_clusters, sil_scores, yerr=sil_score_std) ax.set_xlabel("Number of Clusters") ax.set_ylabel("Silhouette") ax.set_title(args.features_file) if args.output_file is None: filename = "%s_sil.svg" % os.path.splitext(