def _get_sorted_db_keypoint_distances(self, N=None):
    """Find the N largest gaps along the current decision boundary keypoints.

    A minimum spanning tree over ``self.decision_boundary_points_2d`` is used
    as a heuristic for the line constituted by the keypoints; the longest
    tree edges are reported as the largest gaps.

    Parameters
    ----------
    N : int, optional (default=self.n_interpolated_keypoints)
        Maximum number of gaps (spanning tree edges) to return

    Returns
    -------
    edges : the (at most) N longest spanning tree edges, longest first
    gap_distances : squared Euclidean length of each returned edge
    gap_probability_scores : gap_distances divided by their sum
    """
    # Identity check: `== None` would go through N's __eq__ rather than
    # testing for the "not supplied" sentinel.
    if N is None:
        N = self.n_interpolated_keypoints

    keypoints = self.decision_boundary_points_2d
    # NOTE(review): assumes minimum_spanning_tree returns an array of (u, v)
    # index pairs into the keypoints -- it is unpacked and fancy-indexed below.
    edges = minimum_spanning_tree(squareform(pdist(keypoints)))
    edged = np.array([euclidean(keypoints[u], keypoints[v]) for u, v in edges])

    # Indices of the N longest edges, in descending order of length
    gap_edge_idx = np.argsort(edged)[::-1][:N]
    edges = edges[gap_edge_idx]

    # Squaring emphasises long gaps over short ones in the scores
    gap_distances = np.square(edged[gap_edge_idx])
    # NOTE: if every selected gap has zero length the sum is zero and the
    # scores come out as NaN.
    gap_probability_scores = gap_distances / np.sum(gap_distances)

    return edges, gap_distances, gap_probability_scores
def _init_clusters(self):
    """Create the initial clusters from a randomly pruned spanning tree.

    The first ``self.num_instances`` documents are compared pairwise using
    the larger of the two directed ``self.doc_similarity`` values. A spanning
    tree is built over that matrix, and randomly chosen tree edges are
    removed until the largest connected component has at most
    ``self.num_init`` members. Every document in that largest component is
    then registered via ``self._add_cluster(..., member=False)``.
    """

    def symmetric_similarity(left, right):
        # doc_similarity is queried in both directions; keep the larger value
        return max(self.doc_similarity(left, right),
                   self.doc_similarity(right, left))

    similarity = utils.pairwise(self.docs[:self.num_instances],
                                symmetric_similarity)
    tree_edges = utils.minimum_spanning_tree(similarity)
    components = utils.get_ccs(range(self.num_instances), tree_edges)

    # Drop random edges until no component is larger than num_init
    while max(len(component) for component in components) > self.num_init:
        victim = random.sample(tree_edges, 1)[0]
        tree_edges.remove(victim)
        components = utils.get_ccs(range(self.num_instances), tree_edges)

    largest = components[utils.argmax(map(len, components))]
    for doc_index in largest:
        self._add_cluster(self.docs[doc_index], member=False)
def _init_clusters(self):
    """Pick the initial clusters by randomly thinning a spanning tree.

    Works on ``self.docs[:self.num_instances]``: builds a pairwise matrix
    whose entries are the maximum of ``self.doc_similarity`` taken in both
    argument orders, computes a spanning tree over it, and removes random
    edges until the biggest connected component no longer exceeds
    ``self.num_init`` nodes. Each document of that biggest component is
    passed to ``self._add_cluster`` with ``member=False``.
    """
    candidates = self.docs[:self.num_instances]
    sim_mat = utils.pairwise(
        candidates,
        lambda a, b: max(self.doc_similarity(a, b), self.doc_similarity(b, a)),
    )
    spanning_edges = utils.minimum_spanning_tree(sim_mat)

    while True:
        components = utils.get_ccs(range(self.num_instances), spanning_edges)
        # Stop as soon as the largest component is small enough
        if not max(map(len, components)) > self.num_init:
            break
        # Otherwise cut one randomly chosen edge and re-evaluate
        spanning_edges.remove(random.sample(spanning_edges, 1)[0])

    seed_component = components[utils.argmax(map(len, components))]
    for doc_idx in seed_component:
        self._add_cluster(self.docs[doc_idx], member=False)
def plot(
    self,
    plt=None,
    generate_testpoints=True,
    generate_background=True,
    tune_background_model=False,
    background_resolution=100,
    scatter_size_scale=1.0,
    legend=True,
):
    """Plots the dataset and the identified decision boundary in 2D.

    (If you wish to create custom plots, get the data using generate_plot()
    and plot it manually)

    Parameters
    ----------
    plt : matplotlib.pyplot or axis object (default=matplotlib.pyplot)
        Object to be plotted on

    generate_testpoints : boolean, optional (default=True)
        Whether to generate demo points around the estimated decision
        boundary as a sanity check

    generate_background : boolean, optional (default=True)
        Whether to generate faint background plot (using prediction
        probabilities of a fitted support vector machine, trained on
        generated demo points) to aid visualization

    tune_background_model : boolean, optional (default=False)
        Whether to tune the parameters of the support vector machine
        generating the background

    background_resolution : int, optional (default=100)
        Desired resolution (height and width) of background to be generated

    scatter_size_scale : float, optional (default=1.0)
        Scaling factor for scatter plot marker size

    legend : boolean, optional (default=True)
        Whether to display a legend

    Returns
    -------
    plt : The matplotlib.pyplot or axis object which has been passed in,
        after plotting the data and decision boundary on it.
        (plt.show() is NOT called and will be required)
    """
    if plt is None:
        plt = mplt

    # Plot data is generated lazily, only if no demo points exist yet
    if len(self.X_testpoints) == 0:
        self.generate_plot(
            generate_testpoints=generate_testpoints,
            generate_background=generate_background,
            tune_background_model=tune_background_model,
            background_resolution=background_resolution,
        )

    if generate_background and generate_testpoints:
        try:
            plt.imshow(
                np.flipud(self.background),
                extent=[
                    self.X2d_xmin,
                    self.X2d_xmax,
                    self.X2d_ymin,
                    self.X2d_ymax,
                ],
                cmap="GnBu",
                alpha=0.33,
            )
        except Exception:
            # The background is only a visual aid, so a rendering failure is
            # reported and the rest of the plot is still drawn. (The former
            # `except (Exception, ex)` raised NameError instead of handling.)
            print("Failed to render image background")

    keypoints = self.decision_boundary_points_2d

    def class_colors(labels):
        # green for truthy (positive) labels, blue otherwise
        return ["g" if label else "b" for label in labels]

    def outcome_colors(indices):
        # green / blue for correctly predicted class 1 / class 0, red for
        # everything else (i.e. misclassified points)
        return [
            "g"
            if self.y_pred[idx] == self.y[idx] == 1
            else ("b" if self.y_pred[idx] == self.y[idx] == 0 else "r")
            for idx in indices
        ]

    # decision boundary
    plt.scatter(
        keypoints[:, 0],
        keypoints[:, 1],
        600 * scatter_size_scale,
        c="c",
        marker="p",
    )

    # generated demo points
    if generate_testpoints:
        plt.scatter(
            self.X_testpoints_2d[:, 0],
            self.X_testpoints_2d[:, 1],
            20 * scatter_size_scale,
            c=class_colors(self.y_testpoints),
            alpha=0.6,
        )

    # training data
    plt.scatter(
        self.X2d[self.train_idx, 0],
        self.X2d[self.train_idx, 1],
        40 * scatter_size_scale,
        facecolor=class_colors(self.y[self.train_idx]),
        edgecolor=outcome_colors(self.train_idx),
        linewidths=5 * scatter_size_scale,
    )

    # testing data
    plt.scatter(
        self.X2d[self.test_idx, 0],
        self.X2d[self.test_idx, 1],
        150 * scatter_size_scale,
        facecolor=class_colors(self.y[self.test_idx]),
        edgecolor=outcome_colors(self.test_idx),
        linewidths=5 * scatter_size_scale,
        marker="s",
    )

    # label data points with their indices, offset slightly from the marker
    x_offset = (self.X2d_xmax - self.X2d_xmin) * 0.5e-2
    y_offset = (self.X2d_ymax - self.X2d_ymin) * 0.5e-2
    for i, point in enumerate(self.X2d):
        plt.text(point[0] + x_offset, point[1] + y_offset, str(i), size=8)

    if legend:
        plt.legend(
            [
                "Estimated decision boundary keypoints",
                "Generated demo data around decision boundary",
                "Actual data (training set)",
                "Actual data (demo set)",
            ],
            loc="lower right",
            prop={"size": 9},
        )

    # decision boundary keypoints, in case not visible in background
    plt.scatter(
        keypoints[:, 0],
        keypoints[:, 1],
        600 * scatter_size_scale,
        c="c",
        marker="p",
        alpha=0.1,
    )
    plt.scatter(
        keypoints[:, 0],
        keypoints[:, 1],
        30 * scatter_size_scale,
        c="c",
        marker="p",
        edgecolor="c",
        alpha=0.8,
    )

    # minimum spanning tree through decision boundary keypoints
    D = pdist(keypoints)
    edges = minimum_spanning_tree(squareform(D))
    for e in edges:
        xs = [keypoints[e[0], 0], keypoints[e[1], 0]]
        ys = [keypoints[e[0], 1], keypoints[e[1], 1]]
        # thick cyan line with a thin black line on top of it
        plt.plot(xs, ys, "--c", linewidth=4 * scatter_size_scale)
        plt.plot(xs, ys, "--k", linewidth=1)

    if len(self.test_idx) == 0:
        print(
            "No demo performance calculated, as no testing data was specified"
        )
    else:
        # Share of the most frequent class in the test set. Taking the
        # maximum over all counts also works when only one class is present
        # (indexing two fixed rows raised IndexError in that case).
        _, counts = np.unique(self.y[self.test_idx], return_counts=True)
        imbalance = np.round(
            counts.astype(float).max() / len(self.test_idx), 3)
        acc_score = np.round(
            accuracy_score(self.y[self.test_idx], self.y_pred[self.test_idx]),
            3)
        f1 = np.round(
            f1_score(self.y[self.test_idx], self.y_pred[self.test_idx]), 3)
        plt.title("Test accuracy: " + str(acc_score) + ", F1 score: " +
                  str(f1) + ". Imbalance (max chance accuracy): " +
                  str(imbalance))

    if self.verbose:
        print(
            "Plot successfully generated! Don't forget to call the show() method to display it"
        )

    return plt
def plot(self, plt=None, generate_testpoints=True, generate_background=True,
         tune_background_model=False, background_resolution=100,
         scatter_size_scale=1.0, legend=True):
    """Plots the dataset and the identified decision boundary in 2D.

    (If you wish to create custom plots, get the data using generate_plot()
    and plot it manually)

    Parameters
    ----------
    plt : matplotlib.pyplot or axis object (default=matplotlib.pyplot)
        Object to be plotted on

    generate_testpoints : boolean, optional (default=True)
        Whether to generate demo points around the estimated decision
        boundary as a sanity check

    generate_background : boolean, optional (default=True)
        Whether to generate faint background plot (using prediction
        probabilities of a fitted support vector machine, trained on
        generated demo points) to aid visualization

    tune_background_model : boolean, optional (default=False)
        Whether to tune the parameters of the support vector machine
        generating the background

    background_resolution : int, optional (default=100)
        Desired resolution (height and width) of background to be generated

    scatter_size_scale : float, optional (default=1.0)
        Scaling factor for scatter plot marker size

    legend : boolean, optional (default=True)
        Whether to display a legend

    Returns
    -------
    plt : The matplotlib.pyplot or axis object which has been passed in,
        after plotting the data and decision boundary on it.
        (plt.show() is NOT called and will be required)
    """
    if plt is None:
        plt = mplt

    # Plot data is generated lazily, only if no demo points exist yet
    if len(self.X_testpoints) == 0:
        self.generate_plot(generate_testpoints=generate_testpoints,
                           generate_background=generate_background,
                           tune_background_model=tune_background_model,
                           background_resolution=background_resolution)

    if generate_background and generate_testpoints:
        try:
            plt.imshow(np.flipud(self.background),
                       extent=[self.X2d_xmin, self.X2d_xmax,
                               self.X2d_ymin, self.X2d_ymax],
                       cmap="GnBu", alpha=0.33)
        except Exception:
            # The background is only a visual aid, so a rendering failure is
            # reported and the rest of the plot is still drawn. (The former
            # `except (Exception, ex)` raised NameError instead of handling.)
            print("Failed to render image background")

    keypoints = self.decision_boundary_points_2d

    def class_colors(labels):
        # green for truthy (positive) labels, blue otherwise
        return ['g' if label else 'b' for label in labels]

    def outcome_colors(indices):
        # green / blue for correctly predicted class 1 / class 0, red for
        # everything else (i.e. misclassified points)
        return ['g' if self.y_pred[idx] == self.y[idx] == 1
                else ('b' if self.y_pred[idx] == self.y[idx] == 0 else 'r')
                for idx in indices]

    # decision boundary
    plt.scatter(keypoints[:, 0], keypoints[:, 1],
                600 * scatter_size_scale, c='c', marker='p')

    # generated demo points
    if generate_testpoints:
        plt.scatter(self.X_testpoints_2d[:, 0], self.X_testpoints_2d[:, 1],
                    20 * scatter_size_scale,
                    c=class_colors(self.y_testpoints), alpha=0.6)

    # training data
    plt.scatter(self.X2d[self.train_idx, 0], self.X2d[self.train_idx, 1],
                150 * scatter_size_scale,
                facecolor=class_colors(self.y[self.train_idx]),
                edgecolor=outcome_colors(self.train_idx),
                linewidths=5 * scatter_size_scale)

    # testing data
    plt.scatter(self.X2d[self.test_idx, 0], self.X2d[self.test_idx, 1],
                150 * scatter_size_scale,
                facecolor=class_colors(self.y[self.test_idx]),
                edgecolor=outcome_colors(self.test_idx),
                linewidths=5 * scatter_size_scale, marker='s')

    # label data points with their indices, offset slightly from the marker
    x_offset = (self.X2d_xmax - self.X2d_xmin) * 0.5e-2
    y_offset = (self.X2d_ymax - self.X2d_ymin) * 0.5e-2
    for i, point in enumerate(self.X2d):
        plt.text(point[0] + x_offset, point[1] + y_offset, str(i), size=8)

    if legend:
        plt.legend(["Estimated decision boundary keypoints",
                    "Generated demo data around decision boundary",
                    "Actual data (training set)",
                    "Actual data (demo set)"],
                   loc="lower right", prop={'size': 9})

    # decision boundary keypoints, in case not visible in background
    plt.scatter(keypoints[:, 0], keypoints[:, 1],
                600 * scatter_size_scale, c='c', marker='p', alpha=0.1)
    plt.scatter(keypoints[:, 0], keypoints[:, 1],
                30 * scatter_size_scale, c='c', marker='p', edgecolor='c',
                alpha=0.8)

    # minimum spanning tree through decision boundary keypoints
    D = pdist(keypoints)
    edges = minimum_spanning_tree(squareform(D))
    for e in edges:
        xs = [keypoints[e[0], 0], keypoints[e[1], 0]]
        ys = [keypoints[e[0], 1], keypoints[e[1], 1]]
        # thick cyan line with a thin black line on top of it
        plt.plot(xs, ys, '--c', linewidth=4 * scatter_size_scale)
        plt.plot(xs, ys, '--k', linewidth=1)

    if len(self.test_idx) == 0:
        print("No demo performance calculated, as no testing data was specified")
    else:
        # Share of the most frequent class in the test set. np.unique
        # replaces scipy.stats.itemfreq (removed from SciPy), and taking the
        # maximum over all counts also works when only one class is present.
        _, counts = np.unique(self.y[self.test_idx], return_counts=True)
        imbalance = np.round(
            counts.astype(float).max() / len(self.test_idx), 3)
        acc_score = np.round(accuracy_score(
            self.y[self.test_idx], self.y_pred[self.test_idx]), 3)
        f1 = np.round(f1_score(self.y[self.test_idx],
                               self.y_pred[self.test_idx]), 3)
        plt.title("Test accuracy: " + str(acc_score) + ", F1 score: " +
                  str(f1) + ". Imbalance (max chance accuracy): " +
                  str(imbalance))

    if self.verbose:
        print("Plot successfully generated! Don't forget to call the show() method to display it")

    return plt