예제 #1
0
파일: phamer.py 프로젝트: neptuneyt/PhaMers
    def dbscan_score_points(self):
        """
        Score the data points against centroids of DBSCAN clusters.

        The positive and negative data sets are clustered separately with
        ``learning.dbscan`` (index 0 of ``self.eps`` / ``self.min_samples`` is
        used for the positive set, index 1 for the negative set). If the
        largest label of an assignment is below 2, that set is re-clustered
        with ``learning.kmeans`` instead. Every point is then scored with
        ``self.proximity_metric`` from its closest positive and closest
        negative centroid.

        :return: A list with one entry per point; each entry is a
                 single-element list holding that point's score
        """
        labels_pos = learning.dbscan(self.positive_data, self.eps[0], self.min_samples[0])
        labels_neg = learning.dbscan(self.negative_data, self.eps[1], self.min_samples[1])

        # Fall back to k-means when DBSCAN's largest label is below 2.
        if max(labels_pos) < 2:
            logger.warning("Clustering positive with k-means instead...")
            labels_pos = learning.kmeans(self.positive_data, self.k_clusters_positive)
        if max(labels_neg) < 2:
            logger.warning("Clustering negative with k-means instead...")
            labels_neg = learning.kmeans(self.negative_data, self.k_clusters_negative)

        centroids_pos = learning.get_centroids(self.positive_data, labels_pos)
        centroids_neg = learning.get_centroids(self.negative_data, labels_neg)

        def score_of(sample):
            # Each score is deliberately wrapped in its own one-element list.
            nearest_pos = learning.closest_to(sample, centroids_pos)
            nearest_neg = learning.closest_to(sample, centroids_neg)
            return [self.proximity_metric(sample, nearest_pos, nearest_neg)]

        return [score_of(self.data_points[idx]) for idx in xrange(self.num_points)]
예제 #2
0
파일: phamer.py 프로젝트: neptuneyt/PhaMers
 def silhouette_score_points(self):
     """
     Scoring function for the silhouette method

     The points to score are appended to the positive data set and,
     separately, to the negative data set. Each combined set is clustered
     with k-means and silhouette values are computed for it. A point's score
     is its silhouette value in the positive clustering minus its silhouette
     value in the negative clustering.

     :return: A numpy array of scores corresponding to the points
     """
     # Number of k-means clusters used for both combined data sets.
     num_clusters = 86

     positive_appended = np.append(self.positive_data, self.data_points, axis=0)
     # The negative set must be built from the negative data; building it from
     # the positive data makes both branches cluster identical input.
     negative_appended = np.append(self.negative_data, self.data_points, axis=0)

     positive_assignment = learning.kmeans(positive_appended, num_clusters)
     negative_assignment = learning.kmeans(negative_appended, num_clusters)
     pos_sils = learning.silhouettes(positive_appended, positive_assignment)
     neg_sils = learning.silhouettes(negative_appended, negative_assignment)

     # The scored points were appended last, so they occupy the final
     # num_points entries (assumes learning.silhouettes returns one value per
     # row, in row order -- TODO confirm). An explicit start index is used
     # because a plain [-0:] slice would select the whole array when
     # num_points is 0.
     pos_tail = pos_sils[len(pos_sils) - self.num_points:]
     neg_tail = neg_sils[len(neg_sils) - self.num_points:]
     scores = np.array(pos_tail - neg_tail)
     return scores
예제 #3
0
파일: phamer.py 프로젝트: neptuneyt/PhaMers
    def kmeans_score_points(self):
        """
        Score the data points against centroids of k-means clusters.

        The positive and negative data sets are each clustered into
        ``self.k_clusters`` clusters with ``learning.kmeans``. Every point is
        then scored with ``self.proximity_metric`` from its closest positive
        and closest negative centroid.

        :return: A numpy array holding one score per point
        """
        labels_pos = learning.kmeans(self.positive_data, self.k_clusters)
        labels_neg = learning.kmeans(self.negative_data, self.k_clusters)
        centroids_pos = learning.get_centroids(self.positive_data, labels_pos)
        centroids_neg = learning.get_centroids(self.negative_data, labels_neg)

        result = np.zeros(self.num_points)
        for idx in xrange(self.num_points):
            sample = self.data_points[idx]
            result[idx] = self.proximity_metric(
                sample,
                learning.closest_to(sample, centroids_pos),
                learning.closest_to(sample, centroids_neg))
        return result
예제 #4
0
    def get_assignment(self):
        """
        Compute, store and return the cluster assignment of the data.

        DBSCAN is used when ``self.dbscan`` is set, otherwise k-means when
        ``self.kmeans`` is set. The clustered data is ``self.tsne_data`` when
        ``self.cluster_on_tsne`` is set and ``self.features`` otherwise. The
        result is stored in ``self.assignment``, and the number of distinct
        labels other than -1 is stored in ``self.num_clusters``.

        If no clustering method is enabled, or no clusters were found, a
        warning is logged and the program exits by raising ``SystemExit``.

        :return: Whatever data is returned by the dbscan / kmeans wrapper
                 function in learning.py
        """
        if self.dbscan:
            data = self.tsne_data if self.cluster_on_tsne else self.features
            self.assignment = learning.dbscan(
                data,
                eps=self.eps,
                min_samples=self.min_samples,
                sort_by_size=self.order_clusters_by_size)
        elif self.kmeans:
            data = self.tsne_data if self.cluster_on_tsne else self.features
            self.assignment = learning.kmeans(
                data,
                self.k_clusters,
                sort_by_size=self.order_clusters_by_size)
        else:
            logger.warning("Neither dbscan nor kmeans clustering is enabled.")
            self.assignment = None

        # A missing assignment counts as zero clusters, so it takes the normal
        # "zero clusters" exit below instead of failing with a TypeError.
        labels = () if self.assignment is None else self.assignment
        # Label -1 is not counted as a cluster (presumably the DBSCAN noise
        # label -- confirm against learning.dbscan).
        self.num_clusters = len(set(labels) - {-1})

        if self.num_clusters > 0:
            logger.debug("Number of clusters: %d", self.num_clusters)
        else:
            logger.warning("Data was assigned to zero clusters. Exiting.")
            # Same effect as exit(), without relying on the builtin that the
            # site module injects.
            raise SystemExit()
        return self.assignment
예제 #5
0
    ids, data = fileIO.read_feature_file(args.features_file, normalize=True)

    k_clusters = np.arange(10, 600, 10)
    k_clusters = np.array(sorted(list(set(k_clusters))))
    sil_scores = np.zeros(k_clusters.shape)
    sil_score_std = np.zeros(k_clusters.shape)

    num_repeats = 5

    for i in xrange(k_clusters.shape[0]):
        k = k_clusters[i]

        means = np.zeros(num_repeats)
        for j in xrange(num_repeats):
            sils = learning.silhouette_score(
                data, learning.kmeans(data, k, verbose=True))
            means[j] = np.mean(sils)

        sil_scores[i] = np.mean(means)
        sil_score_std[i] = np.std(means)

    fig = plt.figure()
    ax = plt.subplot(111)
    ax.plot(k_clusters, sil_scores)
    ax.errorbar(k_clusters, sil_scores, yerr=sil_score_std)
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Silhouette")
    ax.set_title(args.features_file)

    if args.output_file is None:
        filename = "%s_sil.svg" % os.path.splitext(