def plot_knn(ds: loompy.LoomConnection, out_file: str) -> None:
    """Render the edges of the "MKNN" column graph to a PNG file.

    Only cells whose ``_Valid`` column attribute is truthy are kept. Node
    positions come from the ``_X`` and ``_Y`` column attributes. Nodes
    themselves are not drawn, only the edges.

    Args:
        ds: Dataset providing the column attributes and the "MKNN" edges.
        out_file: Path of the PNG file to write.
    """
    total = ds.shape[1]
    keep = ds.col_attrs["_Valid"].astype('bool')

    # Assemble the adjacency matrix over all cells, then drop the rows and
    # columns that belong to invalid cells.
    (src, dst, weights) = ds.get_edges("MKNN", axis=1)
    adjacency = sparse.coo_matrix((weights, (src, dst)), shape=(total, total)).tocsr()
    adjacency = adjacency[keep, :][:, keep]

    coords = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()
    coords = coords[keep, :]

    fig = plt.figure(figsize=(10, 10))
    graph = nx.from_scipy_sparse_matrix(adjacency)
    ax = fig.add_subplot(111)
    nx.draw_networkx_edges(graph, pos=coords, alpha=0.25, width=0.2, edge_color='gray')
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
def plot_graph(ds: loompy.LoomConnection, out_file: str, tags: List[str] = None) -> None:
    """Plot the 2D embedding colored by cluster and save it as a PNG.

    Positions are read from the ``_X``/``_Y`` column attributes and cluster
    IDs from ``Clusters``. If an "MKNN" column graph exists its edges are
    drawn underneath the points. Each cluster gets a legend entry and,
    unless it is empty or made up entirely of outliers, a text label placed
    at the median position of its cells.

    Args:
        ds: Dataset providing the column attributes and (optionally) the
            "MKNN" edges.
        out_file: Path of the PNG file to write.
        tags: Optional list indexed by cluster ID; when given, ``tags[i]``
            (with newlines replaced by spaces) is appended to the legend
            entry of cluster ``i``.
    """
    logging.info("Loading graph")
    has_edges = False
    if "MKNN" in ds.list_edges(axis=1):
        (a, b, w) = ds.get_edges("MKNN", axis=1)
        has_edges = True
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()
    labels = ds.col_attrs["Clusters"]
    if "Outliers" in ds.col_attrs:
        outliers = ds.col_attrs["Outliers"]
    else:
        outliers = np.zeros(ds.shape[1])
    has_names = "ClusterName" in ds.ca.keys()
    n_labels = max(labels) + 1

    # Compute a good size for the markers, based on local density: the
    # marker area scales with a percentile of the largest distance among
    # each point's min_pts nearest neighbours.
    logging.info("Computing node size")
    min_pts = 50
    eps_pct = 60
    nn = NearestNeighbors(n_neighbors=min_pts, algorithm="ball_tree", n_jobs=4)
    nn.fit(pos)
    knn = nn.kneighbors_graph(mode='distance')
    k_radius = knn.max(axis=1).toarray()
    epsilon = 24 * np.percentile(k_radius, eps_pct)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)

    # Draw edges
    if has_edges:
        logging.info("Drawing edges")
        # Materialize the segments: zip() is a one-shot iterator in Python 3.
        segments = list(zip(pos[a], pos[b]))
        lc = LineCollection(segments, linewidths=0.25, zorder=0, color='grey', alpha=0.1)
        ax.add_collection(lc)

    # Draw nodes
    logging.info("Drawing nodes")
    plots = []
    names = []
    for i in range(n_labels):
        cluster = labels == i
        n_cells = cluster.sum()
        if np.all(outliers[cluster] == 1):
            # Cluster consists only of outliers (np.all is also True for an
            # empty cluster): draw every outlier cell in translucent grey.
            edgecolor = colorConverter.to_rgba('red', alpha=.1)
            plots.append(
                plt.scatter(x=pos[outliers == 1, 0], y=pos[outliers == 1, 1], c='grey', marker='.',
                            edgecolors=edgecolor, alpha=0.1, s=epsilon))
            names.append(f"{i}/n={n_cells} (outliers)")
        else:
            plots.append(
                plt.scatter(x=pos[cluster, 0], y=pos[cluster, 1], c=cg.colors75[np.mod(i, 75)],
                            marker='.', lw=0, s=epsilon, alpha=0.75))
            txt = str(i)
            if has_names:
                txt = ds.ca.ClusterName[ds.ca.Clusters == i][0]
            if tags is not None:
                names.append(f"{txt}/n={n_cells} " + tags[i].replace("\n", " "))
            else:
                names.append(f"{txt}/n={n_cells}")

    logging.info("Drawing legend")
    plt.legend(plots, names, scatterpoints=1, markerscale=2, loc='upper left', bbox_to_anchor=(1, 1),
               fancybox=True, framealpha=0.5, fontsize=10)

    logging.info("Drawing cluster IDs")
    for lbl in range(n_labels):
        members = labels == lbl
        # Skip empty and all-outlier clusters *before* looking up the cluster
        # name; indexing [0] into an empty selection would raise IndexError.
        if not members.any() or np.all(outliers[members] == 1):
            continue
        txt = str(lbl)
        if has_names:
            txt = ds.ca.ClusterName[ds.ca.Clusters == lbl][0]
        (x, y) = np.median(pos[members], axis=0)
        ax.text(x, y, txt, fontsize=12, bbox=dict(facecolor='white', alpha=0.5, ec='none'))

    logging.info("Saving to file")
    fig.savefig(out_file, format="png", dpi=144, bbox_inches='tight')
    plt.close()
def plot_graph_age(ds: loompy.LoomConnection, out_file: str, tags: List[str]) -> None:
    """Plot the "MKNN" graph with nodes colored by age and save it as a PNG.

    Ages are read from the ``Age`` column attribute as strings made of a
    one-letter unit followed by a number. A leading "P" adds 19 to the
    number, any other unit leaves it unchanged, and an empty string maps
    to 0. Only cells with a truthy ``_Valid`` attribute are drawn. Each
    non-empty cluster is annotated with "#<id>" plus its tag, and a color
    legend lists every distinct age.

    Args:
        ds: Dataset providing the column attributes and the "MKNN" edges.
        out_file: Path of the PNG file to write.
        tags: List indexed by cluster ID; non-empty entries are printed
            below the cluster number.
    """
    def parse_age(age: str) -> float:
        # Map an age string to a single numeric axis (postnatal = 19 + days).
        if age == "":
            return 0
        unit, amount = age[0], float(age[1:])
        if unit == "P":
            amount += 19.
        return amount

    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    sfdp = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]

    # The sorting below is to make every circle visible and avoid overlaps
    # in crowded regions.
    orderx = np.argsort(sfdp[:, 0], kind="mergesort")
    ordery = np.argsort(sfdp[:, 1], kind="mergesort")
    orderfin = orderx[ordery]
    # Edge drawing needs positions in the node order of graph `g`, so keep an
    # unsorted copy instead of re-sorting the graph.
    sfdp_original = sfdp.copy()
    sfdp = sfdp[orderfin, :]
    labels = ds.col_attrs["Clusters"][valid][orderfin]
    age = np.fromiter(map(parse_age, ds.col_attrs["Age"]), dtype=float)[valid][orderfin]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)
    ax = fig.add_subplot(111)

    # Draw the KNN graph first, with gray transparent edges
    nx.draw_networkx_edges(g, pos=sfdp_original, alpha=0.1, width=0.1, edge_color='gray')

    # Then draw the nodes, colored by age
    block_colors = plt.cm.nipy_spectral_r((age - 6) / 14.)
    nx.draw_networkx_nodes(g, pos=sfdp, node_color=block_colors, node_size=10, alpha=0.4, linewidths=0)

    for lbl in range(0, max(labels) + 1):
        members = labels == lbl
        if np.sum(members) == 0:
            continue
        (x, y) = np.median(sfdp[np.where(members)[0]], axis=0)
        text = "#" + str(lbl)
        if len(tags[lbl]) > 0:
            text += "\n" + tags[lbl]
        ax.text(x, y, text, fontsize=8, bbox=dict(facecolor='gray', alpha=0.3, ec='none'))
    ax.axis('off')

    # Color legend: one swatch per distinct age value.
    levels = np.unique(age)
    for il, lev in enumerate(levels):
        ax.add_patch(
            plt.Rectangle((0.90, 0.7 + il * 0.016), 0.014, 0.014,
                          color=plt.cm.nipy_spectral_r((lev - 6) / 14.),
                          clip_on=0, transform=ax.transAxes))
        # parse_age() encodes postnatal ages as 19 + days, so every value
        # below 19 (including 18.5) is labeled with the "E" prefix.
        legend_text = "E%.1f" % lev if lev < 19 else "P%.1f" % (lev - 19)
        ax.text(0.93, 0.703 + il * 0.016, legend_text, transform=ax.transAxes)

    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
def plot_classification(ds: loompy.LoomConnection, out_file: str) -> None:
    """Plot the "MKNN" graph with nodes colored by class and save as PNG.

    Each cell is colored according to its ``Class0`` column attribute. For
    the classes listed in ``classes`` the color is taken from a per-class
    colormap evaluated at the cell's value in the matching ``Class_<name>``
    attribute. "Erythrocyte" and "Excluded" cells get fixed colors; all
    other cells keep the grey default. Only cells with a truthy ``_Valid``
    attribute are drawn.

    Args:
        ds: Dataset providing the column attributes and the "MKNN" edges.
        out_file: Path of the PNG file to write.
    """
    n_cells = ds.shape[1]
    valid = ds.col_attrs["_Valid"].astype('bool')
    (a, b, w) = ds.get_edges("MKNN", axis=1)
    mknn = sparse.coo_matrix((w, (a, b)), shape=(n_cells, n_cells)).tocsr()[valid, :][:, valid]
    pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[valid, :]

    fig = plt.figure(figsize=(10, 10))
    g = nx.from_scipy_sparse_matrix(mknn)

    classes = [
        "Neurons", "Astrocyte", "Ependymal", "OEC", "Oligos", "Schwann",
        "Cycling", "Vascular", "Immune"
    ]
    # 'tab20' is the current name of the palette formerly registered as
    # 'Vega20'; the old name is no longer available in recent matplotlib.
    palette = plt.get_cmap('tab20')
    colors = [palette((ix + 0.5) / 20) for ix in range(20)]

    class0 = ds.col_attrs["Class0"]
    combined_colors = np.zeros((n_cells, 4)) + np.array((0.5, 0.5, 0.5, 0))
    for ix, cls in enumerate(classes):
        # Ramp from transparent white to the class color.
        cmap = LinearSegmentedColormap.from_list('custom cmap', [(1, 1, 1, 0), colors[ix]])
        cells = class0 == cls
        if np.sum(cells) > 0:
            combined_colors[cells] = [cmap(x) for x in ds.col_attrs["Class_" + cls][cells]]

    cells = class0 == "Erythrocyte"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([1, 0.71, 0.76, 0])

    cells = class0 == "Excluded"
    if np.sum(cells) > 0:
        combined_colors[cells] = np.array([0.5, 0.5, 0.5, 0])

    ax = fig.add_subplot(1, 1, 1)
    ax.set_title("Class")
    nx.draw_networkx_edges(g, pos=pos, alpha=0.2, width=0.1, edge_color='gray')
    nx.draw_networkx_nodes(g, pos=pos, node_color=combined_colors[valid], node_size=10, alpha=0.6,
                           linewidths=0)
    ax.axis('off')
    plt.tight_layout()
    fig.savefig(out_file, format="png", dpi=300)
    plt.close()
def fit_predict(self, ds: loompy.LoomConnection) -> np.ndarray:
    """Cluster the valid cells of ``ds`` and store the result on the dataset.

    The algorithm is selected by ``self.method``:

    * ``"hdbscan"``: HDBSCAN on the ``_X``/``_Y`` positions.
    * ``"dbscan"``: DBSCAN on the ``_X``/``_Y`` positions, with epsilon taken
      as the ``self.eps_pct`` percentile of the k-neighbour radius. If
      ``self.min_pts`` is None it is set on the instance from the number of
      valid cells. If ``self.outliers`` is falsy, noise points are
      reassigned to the cluster of their nearest clustered point.
    * ``"multilev"`` / ``"wmultilev"``: igraph multilevel communities on the
      "KNN" graph, unweighted / weighted.
    * ``"mknn_louvain"``: weighted multilevel communities on the "MKNN"
      graph with a fixed random seed. If ``self.outliers`` is truthy,
      communities smaller than ``self.min_pts`` are marked as outliers.
    * anything else: ``cg.LouvainJaccard`` on the "KNN" graph of valid cells.

    Labels are shifted so the smallest label becomes 0, which means any
    outliers (label -1) end up in cluster 0. Cells that are not valid keep
    label 0. The result is written to ``ds.ca.Clusters`` and the outlier
    flags to ``ds.ca.Outliers``.

    Args:
        ds: Dataset providing column attributes and column graphs.

    Returns:
        Integer array with one cluster label per column of ``ds``.

    Raises:
        ValueError: If the final cluster IDs are not contiguous from 0.
    """
    n_valid = np.sum(ds.col_attrs["_Valid"] == 1)
    n_total = ds.shape[1]
    logging.info("%d of %d cells were valid", n_valid, n_total)
    logging.info("%d of %d genes were valid", np.sum(ds.row_attrs["_Valid"] == 1), ds.shape[0])
    cells = np.where(ds.col_attrs["_Valid"] == 1)[0]

    if self.method == "hdbscan":
        logging.info("HDBSCAN clustering in t-SNE space")
        min_pts = 10 if n_valid < 3000 else (20 if n_valid < 20000 else 100)
        tsne_pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[cells, :]
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_pts)
        labels = clusterer.fit_predict(tsne_pos)
    elif self.method == "dbscan":
        logging.info("DBSCAN clustering in t-SNE space")
        if self.min_pts is None:
            self.min_pts = 10 if n_valid < 3000 else (20 if n_valid < 20000 else 100)
        tsne_pos = np.vstack((ds.col_attrs["_X"], ds.col_attrs["_Y"])).transpose()[cells, :]

        # Determine a good epsilon
        nn = NearestNeighbors(n_neighbors=self.min_pts, algorithm="ball_tree", n_jobs=4)
        nn.fit(tsne_pos)
        knn = nn.kneighbors_graph(mode='distance')
        k_radius = knn.max(axis=1).toarray()
        epsilon = np.percentile(k_radius, self.eps_pct)

        clusterer = DBSCAN(eps=epsilon, min_samples=self.min_pts)
        labels = clusterer.fit_predict(tsne_pos)

        if not self.outliers:
            # Assign each outlier to the same cluster as the nearest
            # non-outlier. Skipped when there is nothing to reassign or
            # nothing to assign to, because the neighbour search rejects
            # empty inputs.
            noise = labels == -1
            clustered = labels >= 0
            if noise.any() and clustered.any():
                nn = NearestNeighbors(n_neighbors=50, algorithm="ball_tree")
                nn.fit(tsne_pos[clustered])
                nearest = nn.kneighbors(tsne_pos[noise], n_neighbors=1, return_distance=False)
                labels[noise] = labels[clustered][nearest.flat[:]]
    elif self.method == "multilev":
        logging.info("comunity-multilevel clustering on unweighted KNN graph")
        (a, b, w) = ds.get_edges("KNN", axis=1)
        G = igraph.Graph(n_total, list(zip(a, b)), directed=False)
        VxCl = G.community_multilevel(return_levels=False)
        labels = np.array(VxCl.membership)
        # The graph spans all cells; keep only the valid ones and renumber so
        # that IDs stay contiguous (a no-op when every cell is valid).
        labels = np.unique(labels[cells], return_inverse=True)[1]
    elif self.method == "wmultilev":
        logging.info("comunity-multilevel clustering on the multiscale KNN graph")
        (a, b, w) = ds.get_edges("KNN", axis=1)
        G = igraph.Graph(n_total, list(zip(a, b)), directed=False, edge_attrs={'weight': w})
        VxCl = G.community_multilevel(return_levels=False, weights="weight")
        labels = np.array(VxCl.membership)
        # The graph spans all cells; keep only the valid ones and renumber so
        # that IDs stay contiguous (a no-op when every cell is valid).
        labels = np.unique(labels[cells], return_inverse=True)[1]
    elif self.method == "mknn_louvain":
        logging.info("comunity-multilevel clustering on the multiscale MKNN graph")
        (a, b, w) = ds.get_edges("MKNN", axis=1)
        # Fixed seed so that igraph's community detection is reproducible.
        random.seed(13)
        igraph._igraph.set_random_number_generator(random)
        G = igraph.Graph(n_total, list(zip(a, b)), directed=False, edge_attrs={'weight': w})
        VxCl = G.community_multilevel(return_levels=False, weights="weight")
        labels = np.array(VxCl.membership)
        logging.info(f"labels.shape = {labels.shape}")
        # The graph spans all cells; only valid cells are labelled below.
        labels = labels[cells]
        counts = np.bincount(labels)
        if not self.outliers:
            # Keep every community that still has members.
            bigs = np.where(counts > 0)[0]
        else:
            bigs = np.where(counts >= self.min_pts)[0]
        # Renumber the kept communities 0..k-1; everything else becomes -1.
        mapping = {k: v for v, k in enumerate(bigs.tolist())}
        labels = np.array([mapping.get(x, -1) for x in labels.tolist()])
    else:
        logging.info("Louvain clustering on the multiscale KNN graph")
        (a, b, w) = ds.get_edges("KNN", axis=1)
        knn = sparse.coo_matrix((w, (a, b)), shape=(ds.shape[1], ds.shape[1])).tocsr()[cells, :][:, cells]
        lj = cg.LouvainJaccard(resolution=1, jaccard=False)
        labels = lj.fit_predict(knn.tocoo())

    # At this point, cells should be labeled 0, 1, 2, ...
    # But there may also be cells labelled -1 for outliers, which we want to
    # keep track of
    labels_all = np.zeros(ds.shape[1], dtype='int')
    outliers = np.zeros(ds.shape[1], dtype='int')
    labels_all[cells] = labels
    outliers[labels_all == -1] = 1
    # Shift so the smallest label (the outliers, if any) becomes cluster 0.
    labels_all[cells] = labels - np.min(labels)
    ds.ca.Clusters = labels_all
    ds.ca.Outliers = outliers
    logging.info("Found " + str(max(labels_all) + 1) + " clusters")
    if len(set(ds.ca.Clusters)) != ds.ca.Clusters.max() + 1:
        raise ValueError("There are holes in the cluster ID sequence!")
    return labels_all