示例#1
0
 def _get_sorted_db_keypoint_distances(self, N=None):
     """Find the N largest gaps between the current decision boundary keypoints.

     A minimum spanning tree is built over the pairwise distances of
     ``self.decision_boundary_points_2d``; its longest edges are treated as
     the largest gaps in the line formed by the keypoints.

     Parameters
     ----------
     N : int, optional (default=self.n_interpolated_keypoints)
         Maximum number of gaps (tree edges) to return.

     Returns
     -------
     edges : the selected tree edges, longest first
         (assumes ``minimum_spanning_tree`` returns an array of (u, v)
         index pairs that supports fancy indexing -- TODO confirm).
     gap_distances : squared Euclidean lengths of the selected edges
     gap_probability_scores : ``gap_distances`` divided by their sum
     """
     # Identity check: `N == None` would be evaluated element-wise for
     # array-like N and is not a reliable None test.
     if N is None:
         N = self.n_interpolated_keypoints
     points = self.decision_boundary_points_2d
     edges = minimum_spanning_tree(squareform(pdist(points)))
     edge_lengths = np.array([euclidean(points[u], points[v])
                              for u, v in edges])
     # Descending order of edge length, truncated to the N longest edges.
     gap_edge_idx = np.argsort(edge_lengths)[::-1][:N]
     edges = edges[gap_edge_idx]
     gap_distances = np.square(edge_lengths[gap_edge_idx])
     gap_probability_scores = gap_distances / np.sum(gap_distances)
     return edges, gap_distances, gap_probability_scores
示例#2
0
	def _init_clusters(self):
		"""Seed the initial clusters from the first ``num_instances`` documents.

		Builds a symmetric pairwise similarity matrix (the larger of the two
		directed ``doc_similarity`` values), takes its spanning tree via
		``utils.minimum_spanning_tree`` and removes randomly chosen tree edges
		until no connected component has more than ``num_init`` members. Every
		document of the component selected by ``utils.argmax`` over the
		component sizes is then registered with ``_add_cluster(member=False)``.
		"""
		def symmetric_similarity(a, b):
			# doc_similarity may be asymmetric; use the larger direction.
			return max(self.doc_similarity(a, b), self.doc_similarity(b, a))

		candidates = self.docs[:self.num_instances]
		similarities = utils.pairwise(candidates, symmetric_similarity)
		tree_edges = utils.minimum_spanning_tree(similarities)

		components = utils.get_ccs(range(self.num_instances), tree_edges)
		# Randomly cut tree edges until the largest component is small enough.
		while max(map(len, components)) > self.num_init:
			tree_edges.remove(random.sample(tree_edges, 1)[0])
			components = utils.get_ccs(range(self.num_instances), tree_edges)

		chosen = components[utils.argmax(map(len, components))]
		for doc_index in chosen:
			self._add_cluster(self.docs[doc_index], member=False)
示例#3
0
    def _init_clusters(self):
        """Create the starting clusters from the first ``num_instances`` docs.

        A pairwise similarity matrix is computed using the maximum of both
        directed ``doc_similarity`` scores. The spanning tree returned by
        ``utils.minimum_spanning_tree`` is then thinned by deleting random
        edges until the largest connected component holds at most
        ``num_init`` documents. The component picked by ``utils.argmax`` over
        the component sizes supplies the documents passed to
        ``_add_cluster(..., member=False)``.
        """
        def best_direction_similarity(first, second):
            # Similarity is evaluated in both directions; keep the larger.
            forward = self.doc_similarity(first, second)
            backward = self.doc_similarity(second, first)
            return max(forward, backward)

        node_ids = range(self.num_instances)
        similarity_matrix = utils.pairwise(self.docs[:self.num_instances],
                                           best_direction_similarity)
        remaining_edges = utils.minimum_spanning_tree(similarity_matrix)

        components = utils.get_ccs(node_ids, remaining_edges)
        while max(map(len, components)) > self.num_init:
            # Drop one random edge and recompute the connected components.
            victim = random.sample(remaining_edges, 1)[0]
            remaining_edges.remove(victim)
            components = utils.get_ccs(node_ids, remaining_edges)

        seed_component = components[utils.argmax(map(len, components))]
        for doc_index in seed_component:
            self._add_cluster(self.docs[doc_index], member=False)
    def plot(
        self,
        plt=None,
        generate_testpoints=True,
        generate_background=True,
        tune_background_model=False,
        background_resolution=100,
        scatter_size_scale=1.0,
        legend=True,
    ):
        """Plot the dataset and the identified decision boundary in 2D.

        (If you wish to create custom plots, get the data using
        generate_plot() and plot it manually.)

        Parameters
        ----------
        plt : matplotlib.pyplot or axis object (default=matplotlib.pyplot)
            Object to be plotted on

        generate_testpoints : boolean, optional (default=True)
            Whether to generate and plot points around the estimated decision
            boundary as a sanity check

        generate_background : boolean, optional (default=True)
            Whether to generate a faint background plot (using prediction
            probabilities of a fitted support vector machine, trained on the
            generated points) to aid visualization. The background is only
            drawn when generate_testpoints is also True.

        tune_background_model : boolean, optional (default=False)
            Whether to tune the parameters of the support vector machine
            generating the background

        background_resolution : int, optional (default=100)
            Desired resolution (height and width) of background to be generated

        scatter_size_scale : float, optional (default=1.0)
            Scaling factor for scatter plot marker size

        legend : boolean, optional (default=True)
            Whether to display a legend

        Returns
        -------
        plt : The matplotlib.pyplot or axis object which has been passed in,
        after plotting the data and decision boundary on it. (plt.show() is
        NOT called and will be required)
        """

        def outcome_colors(indices):
            # Marker edge colour encodes the prediction outcome: green/blue
            # for a correctly predicted class 1/0, red for anything else.
            colors = []
            for idx in indices:
                predicted, actual = self.y_pred[idx], self.y[idx]
                if predicted == actual == 1:
                    colors.append("g")
                elif predicted == actual == 0:
                    colors.append("b")
                else:
                    colors.append("r")
            return colors

        if plt is None:
            plt = mplt

        # Plot data is generated lazily, only if no test points exist yet.
        if len(self.X_testpoints) == 0:
            self.generate_plot(
                generate_testpoints=generate_testpoints,
                generate_background=generate_background,
                tune_background_model=tune_background_model,
                background_resolution=background_resolution,
            )

        if generate_background and generate_testpoints:
            try:
                plt.imshow(
                    np.flipud(self.background),
                    extent=[
                        self.X2d_xmin, self.X2d_xmax, self.X2d_ymin,
                        self.X2d_ymax
                    ],
                    cmap="GnBu",
                    alpha=0.33,
                )
            except Exception:
                # The background is a best-effort visual aid; the former
                # `except (Exception, ex)` raised NameError on any failure.
                print("Failed to render image background")

        boundary = self.decision_boundary_points_2d

        # decision boundary
        plt.scatter(
            boundary[:, 0],
            boundary[:, 1],
            600 * scatter_size_scale,
            c="c",
            marker="p",
        )
        # generated demo points
        if generate_testpoints:
            plt.scatter(
                self.X_testpoints_2d[:, 0],
                self.X_testpoints_2d[:, 1],
                20 * scatter_size_scale,
                c=["g" if i else "b" for i in self.y_testpoints],
                alpha=0.6,
            )

        # training data
        plt.scatter(
            self.X2d[self.train_idx, 0],
            self.X2d[self.train_idx, 1],
            40 * scatter_size_scale,
            facecolor=["g" if i else "b" for i in self.y[self.train_idx]],
            edgecolor=outcome_colors(self.train_idx),
            linewidths=5 * scatter_size_scale,
        )
        # testing data
        plt.scatter(
            self.X2d[self.test_idx, 0],
            self.X2d[self.test_idx, 1],
            150 * scatter_size_scale,
            facecolor=["g" if i else "b" for i in self.y[self.test_idx]],
            edgecolor=outcome_colors(self.test_idx),
            linewidths=5 * scatter_size_scale,
            marker="s",
        )

        # label data points with their indices
        x_offset = (self.X2d_xmax - self.X2d_xmin) * 0.5e-2
        y_offset = (self.X2d_ymax - self.X2d_ymin) * 0.5e-2
        for i in range(len(self.X2d)):
            plt.text(
                self.X2d[i, 0] + x_offset,
                self.X2d[i, 1] + y_offset,
                str(i),
                size=8,
            )

        if legend:
            plt.legend(
                [
                    "Estimated decision boundary keypoints",
                    "Generated demo data around decision boundary",
                    "Actual data (training set)",
                    "Actual data (demo set)",
                ],
                loc="lower right",
                prop={"size": 9},
            )

        # decision boundary keypoints, in case not visible in background
        plt.scatter(
            boundary[:, 0],
            boundary[:, 1],
            600 * scatter_size_scale,
            c="c",
            marker="p",
            alpha=0.1,
        )
        plt.scatter(
            boundary[:, 0],
            boundary[:, 1],
            30 * scatter_size_scale,
            c="c",
            marker="p",
            edgecolor="c",
            alpha=0.8,
        )

        # minimum spanning tree through decision boundary keypoints
        D = pdist(boundary)
        edges = minimum_spanning_tree(squareform(D))
        for e in edges:
            xs = [boundary[e[0], 0], boundary[e[1], 0]]
            ys = [boundary[e[0], 1], boundary[e[1], 1]]
            # Thick cyan line with a thin black line drawn on top of it.
            plt.plot(xs, ys, "--c", linewidth=4 * scatter_size_scale)
            plt.plot(xs, ys, "--k", linewidth=1)

        if len(self.test_idx) == 0:
            print(
                "No demo performance calculated, as no testing data was specified"
            )
        else:
            y_true = self.y[self.test_idx]
            y_hat = self.y_pred[self.test_idx]
            # Share of the most frequent class. Taking the max over all
            # counts also works when only one class is present, where
            # indexing a second row raised IndexError.
            _, class_counts = np.unique(y_true, return_counts=True)
            imbalance = np.round(
                class_counts.max() / len(self.test_idx), 3)
            acc_score = np.round(accuracy_score(y_true, y_hat), 3)
            f1 = np.round(f1_score(y_true, y_hat), 3)
            plt.title("Test accuracy: " + str(acc_score) + ", F1 score: " +
                      str(f1) + ". Imbalance (max chance accuracy): " +
                      str(imbalance))

        if self.verbose:
            print(
                "Plot successfully generated! Don't forget to call the show() method to display it"
            )

        return plt
    def plot(self, plt=None, generate_testpoints=True, generate_background=True,
             tune_background_model=False, background_resolution=100,
             scatter_size_scale=1.0, legend=True):
        """Plot the dataset and the identified decision boundary in 2D.

        (If you wish to create custom plots, get the data using
        generate_plot() and plot it manually.)

        Parameters
        ----------
        plt : matplotlib.pyplot or axis object (default=matplotlib.pyplot)
            Object to be plotted on

        generate_testpoints : boolean, optional (default=True)
            Whether to generate and plot points around the estimated decision
            boundary as a sanity check

        generate_background : boolean, optional (default=True)
            Whether to generate a faint background plot (using prediction
            probabilities of a fitted support vector machine, trained on the
            generated points) to aid visualization. The background is only
            drawn when generate_testpoints is also True.

        tune_background_model : boolean, optional (default=False)
            Whether to tune the parameters of the support vector machine
            generating the background

        background_resolution : int, optional (default=100)
            Desired resolution (height and width) of background to be generated

        scatter_size_scale : float, optional (default=1.0)
            Scaling factor for scatter plot marker size

        legend : boolean, optional (default=True)
            Whether to display a legend

        Returns
        -------
        plt : The matplotlib.pyplot or axis object which has been passed in,
        after plotting the data and decision boundary on it. (plt.show() is
        NOT called and will be required)
        """

        def outcome_colors(indices):
            # Marker edge colour encodes the prediction outcome: green/blue
            # for a correctly predicted class 1/0, red for anything else.
            colors = []
            for idx in indices:
                predicted, actual = self.y_pred[idx], self.y[idx]
                if predicted == actual == 1:
                    colors.append('g')
                elif predicted == actual == 0:
                    colors.append('b')
                else:
                    colors.append('r')
            return colors

        if plt is None:
            plt = mplt

        # Plot data is generated lazily, only if no test points exist yet.
        if len(self.X_testpoints) == 0:
            self.generate_plot(generate_testpoints=generate_testpoints,
                               generate_background=generate_background,
                               tune_background_model=tune_background_model,
                               background_resolution=background_resolution)

        if generate_background and generate_testpoints:
            try:
                plt.imshow(np.flipud(self.background),
                           extent=[self.X2d_xmin, self.X2d_xmax,
                                   self.X2d_ymin, self.X2d_ymax],
                           cmap="GnBu", alpha=0.33)
            except Exception:
                # The background is a best-effort visual aid; the former
                # `except (Exception, ex)` raised NameError on any failure.
                print("Failed to render image background")

        boundary = self.decision_boundary_points_2d

        # decision boundary
        plt.scatter(boundary[:, 0], boundary[:, 1], 600 * scatter_size_scale,
                    c='c', marker='p')
        # generated demo points
        if generate_testpoints:
            plt.scatter(self.X_testpoints_2d[:, 0], self.X_testpoints_2d[:, 1],
                        20 * scatter_size_scale,
                        c=['g' if i else 'b' for i in self.y_testpoints],
                        alpha=0.6)

        # training data
        plt.scatter(self.X2d[self.train_idx, 0], self.X2d[self.train_idx, 1],
                    150 * scatter_size_scale,
                    facecolor=['g' if i else 'b' for i in self.y[self.train_idx]],
                    edgecolor=outcome_colors(self.train_idx),
                    linewidths=5 * scatter_size_scale)
        # testing data
        plt.scatter(self.X2d[self.test_idx, 0], self.X2d[self.test_idx, 1],
                    150 * scatter_size_scale,
                    facecolor=['g' if i else 'b' for i in self.y[self.test_idx]],
                    edgecolor=outcome_colors(self.test_idx),
                    linewidths=5 * scatter_size_scale, marker='s')

        # label data points with their indices
        x_offset = (self.X2d_xmax - self.X2d_xmin) * 0.5e-2
        y_offset = (self.X2d_ymax - self.X2d_ymin) * 0.5e-2
        for i in range(len(self.X2d)):
            plt.text(self.X2d[i, 0] + x_offset, self.X2d[i, 1] + y_offset,
                     str(i), size=8)

        if legend:
            plt.legend(["Estimated decision boundary keypoints",
                        "Generated demo data around decision boundary",
                        "Actual data (training set)",
                        "Actual data (demo set)"],
                       loc="lower right", prop={'size': 9})

        # decision boundary keypoints, in case not visible in background
        plt.scatter(boundary[:, 0], boundary[:, 1], 600 * scatter_size_scale,
                    c='c', marker='p', alpha=0.1)
        plt.scatter(boundary[:, 0], boundary[:, 1], 30 * scatter_size_scale,
                    c='c', marker='p', edgecolor='c', alpha=0.8)

        # minimum spanning tree through decision boundary keypoints
        D = pdist(boundary)
        edges = minimum_spanning_tree(squareform(D))
        for e in edges:
            xs = [boundary[e[0], 0], boundary[e[1], 0]]
            ys = [boundary[e[0], 1], boundary[e[1], 1]]
            # Thick cyan line with a thin black line drawn on top of it.
            plt.plot(xs, ys, '--c', linewidth=4 * scatter_size_scale)
            plt.plot(xs, ys, '--k', linewidth=1)

        if len(self.test_idx) == 0:
            print("No demo performance calculated, as no testing data was specified")
        else:
            y_true = self.y[self.test_idx]
            y_hat = self.y_pred[self.test_idx]
            # np.unique replaces the deprecated scipy.stats.itemfreq. Taking
            # the max over all counts also works when only one class is
            # present, where indexing a second row raised IndexError.
            _, class_counts = np.unique(y_true, return_counts=True)
            imbalance = np.round(class_counts.max() / len(self.test_idx), 3)
            acc_score = np.round(accuracy_score(y_true, y_hat), 3)
            f1 = np.round(f1_score(y_true, y_hat), 3)
            plt.title("Test accuracy: " + str(acc_score) + ", F1 score: " +
                      str(f1) + ". Imbalance (max chance accuracy): " + str(imbalance))

        if self.verbose:
            print("Plot successfully generated! Don't forget to call the show() method to display it")

        return plt