# Supervised UMAP embedding colored by a binarized patient risk score.
# (cuML's UMAP is assumed here, given the cuDF-style `.to_pandas()` calls.)
from cuml.manifold import UMAP


def supervised_umap(ax, manifolds, y, cutoff=50):
    x = manifolds[2].reset_index(drop=True)
    label = y['risk_score']
    c = label < cutoff
    # Supervise the embedding with the binary low/high-risk target.
    m = UMAP(target_metric="categorical").fit_transform(x, c)
    m = m.reset_index(drop=True).to_pandas()
    c = c.reset_index(drop=True).to_pandas()
    _ = ax.scatter(m.loc[c].values[:, 0], m.loc[c].values[:, 1],
                   alpha=1, s=3, c='blue',
                   label='Low Patient Risk Score (< {})'.format(cutoff))
    _ = ax.scatter(m.loc[~c].values[:, 0], m.loc[~c].values[:, 1],
                   alpha=1, s=3, c='red',
                   label='High Patient Risk Score (>= {})'.format(cutoff))
    ax.set_title('cutoff = {}%'.format(cutoff))
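# A minimal usage sketch (assumed shapes, not from the source): `manifolds`
# is a list of cuDF feature DataFrames and `y` a cuDF DataFrame with a
# numeric 'risk_score' column on a 0-100 scale.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, cutoff in zip(axes, (25, 50, 75)):
    supervised_umap(ax, manifolds, y, cutoff=cutoff)
axes[0].legend(markerscale=4)
fig.savefig('supervised_umap_cutoffs.png', dpi=150)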
def umap(
        adata, min_dist=0.5, spread=1.0, n_components=2, maxiter=None,
        alpha=1.0, gamma=1.0, negative_sample_rate=5, init_pos='spectral',
        random_state=0, a=None, b=None, copy=False, method='umap'):
    """Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold
    learning technique suitable for visualizing high-dimensional data. Besides
    tending to be faster than tSNE, it optimizes the embedding such that it
    best reflects the topology of the data, which we represent throughout
    Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the
    distribution of nearest-neighbor distances in the embedding such that
    these best match the distribution of distances in the high-dimensional
    space. We use the implementation of `umap-learn
    <https://github.com/lmcinnes/umap>`__ [McInnes18]_. For a few comparisons
    of UMAP with tSNE, see this `preprint <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    min_dist : `float`, optional (default: 0.5)
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out. The default in the `umap-learn`
        package is 0.1.
    spread : `float`, optional (default: 1.0)
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components : `int`, optional (default: 2)
        The number of dimensions of the embedding.
    maxiter : `int`, optional (default: `None`)
        The number of iterations (epochs) of the optimization. Called
        `n_epochs` in the original UMAP.
    alpha : `float`, optional (default: 1.0)
        The initial learning rate for the embedding optimization.
    gamma : `float`, optional (default: 1.0)
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate : `int`, optional (default: 5)
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos : `string` or `np.array`, optional (default: 'spectral')
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state : `int`, `RandomState` or `None`, optional (default: 0)
        If `int`, `random_state` is the seed used by the random number
        generator; if `RandomState`, `random_state` is the random number
        generator; if `None`, the random number generator is the `RandomState`
        instance used by `np.random`.
    a : `float`, optional (default: `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    b : `float`, optional (default: `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    copy : `bool`, optional (default: `False`)
        Return a copy instead of writing to `adata`.
    method : {`'umap'`, `'rapids'`}, optional (default: `'umap'`)
        Use the original 'umap' implementation, or 'rapids' (experimental,
        GPU only).

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata
    if 'neighbors' not in adata.uns:
        raise ValueError(
            "Did not find 'neighbors/connectivities'. Run `sc.pp.neighbors` first."
        )
    start = logg.info('computing UMAP')
    if ('params' not in adata.uns['neighbors']
            or adata.uns['neighbors']['params']['method'] != 'umap'):
        logg.warning(
            'neighbors/connectivities have not been computed using umap')

    from umap.umap_ import find_ab_params, simplicial_set_embedding
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32,
                                  accept_sparse=False)

    random_state = check_random_state(random_state)
    neigh_params = adata.uns['neighbors']['params']
    X = _choose_representation(
        adata, neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None), silent=True)
    if method == 'umap':
        # the data matrix X is really only used for determining the number of
        # connected components for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            adata.uns['neighbors']['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
        # 0 is not a valid value for rapids, unlike original umap
        n_epochs = 500 if maxiter is None else maxiter
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
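# Typical call sequence against scanpy's public API; `pbmc68k_reduced` is a
# small dataset bundled with scanpy, used here purely for illustration.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)  # build the graph that umap() embeds
sc.tl.umap(adata, min_dist=0.5, spread=1.0)
print(adata.obsm['X_umap'].shape)  # (n_obs, 2)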
class ChemVisualization:

    def __init__(self, df, n_clusters, chembl_ids):
        self.app = dash.Dash(__name__,
                             external_stylesheets=external_stylesheets)
        self.df = df
        self.n_clusters = n_clusters
        self.chembl_ids = chembl_ids

        # Fetch relevant properties from the database.
        self.prop_df = self.create_dataframe_molecule_properties(chembl_ids)

        self.df['chembl_id'] = chembl_ids
        self.df['id'] = self.df.index
        self.orig_df = df.copy()

        # Initialize UMAP.
        self.umap = UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)

        # Construct the UI.
        self.app.layout = self.construct_layout()

        # Register callbacks for selection inside the main figure.
        self.app.callback(
            [Output('selected_clusters', 'value'),
             Output('selected_point_cnt', 'children')],
            [Input('main-figure', 'clickData'),
             Input('main-figure', 'selectedData'),
             Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('northstar_cluster', 'children')],
            [State("selected_clusters", "value")])(self.handle_data_selection)

        # Register callbacks for the buttons that recluster selected data.
        self.app.callback(
            [Output('main-figure', 'figure'),
             Output('northstar_cluster', 'children')],
            [Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('bt_north_star', 'n_clicks'),
             Input('north_star', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input('sl_nclusters', 'value')],
            [State("selected_clusters", "value"),
             State("main-figure", "selectedData")])(self.handle_re_cluster)

        # Register callbacks for selection inside the main figure to update
        # molecule details.
        self.app.callback(
            [Output('tb_selected_molecules', 'children'),
             Output('sl_mol_props', 'options'),
             Output("current_page", "children"),
             Output("total_page", "children"),
             Output('section_molecule_details', 'style')],
            [Input('main-figure', 'selectedData'),
             Input('sl_mol_props', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input("bt_page_prev", "n_clicks"),
             Input("bt_page_next", "n_clicks")],
            State("current_page", "children"))(self.handle_molecule_selection)

        self.app.callback(
            Output("hidden1", "children"),
            [Input("bt_reset", "n_clicks")])(self.handle_reset)

        self.app.callback(
            Output('north_star', 'value'),
            [Input({'role': 'bt_star_candidate', 'index': ALL}, 'n_clicks')],
            State('north_star', 'value'))(self.handle_mark_north_star)

    def MorganFromSmiles(self, smiles, radius=2, nBits=512):
        m = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(m, radius=radius,
                                                   nBits=nBits)
        ar = cupy.array(fp)
        return ar

    def re_cluster(self, gdf, new_fingerprints=None, new_chembl_ids=None):
        if gdf.shape[0] == 0:
            return None

        # Before reclustering, remove all columns that may interfere.
        ids = gdf['id']
        chembl_ids = gdf['chembl_id']
        gdf.drop(columns=['x', 'y', 'cluster', 'id', 'chembl_id'],
                 inplace=True)

        if new_fingerprints is not None and new_chembl_ids is not None:
            # Add new fingerprints and ChEMBL ids before reclustering.
            fp_df = cudf.DataFrame(new_fingerprints, columns=gdf.columns)
            gdf = gdf.append(fp_df, ignore_index=True)
            chembl_ids = chembl_ids.append(
                cudf.Series(new_chembl_ids), ignore_index=True)

        kmeans_float = KMeans(n_clusters=self.n_clusters)
        kmeans_float.fit(gdf)
        Xt = self.umap.fit_transform(gdf)

        # Add back the columns required for plotting and for correlating data
        # between reclusterings.
        gdf.add_column('x', Xt[0].to_array())
        gdf.add_column('y', Xt[1].to_array())
        gdf.add_column('id', gdf.index)
        gdf.add_column('chembl_id', chembl_ids)
        gdf.add_column('cluster', kmeans_float.labels_.to_array())
        return gdf

    def recluster_nofilter(self, df, gradient_prop, north_stars=None):
        tdf = self.re_cluster(df)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def recluster_selected_clusters(self, df, values, gradient_prop,
                                    north_stars=None):
        df_clusters = df['cluster'].isin(values)
        filters = df_clusters.values
        tdf = df[filters.get()]
        tdf = self.re_cluster(tdf)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def recluster_selected_points(self, df, values, gradient_prop,
                                  north_stars=None):
        df_clusters = df['id'].isin(values)
        filters = df_clusters.values
        tdf = df[filters.get()]
        tdf = self.re_cluster(tdf)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def create_graph(self, df, color_col='cluster', north_stars=None,
                     gradient_prop=None):
        fig = go.Figure(layout={'colorscale': {}})
        ldf = df.merge(self.prop_df, on='chembl_id')

        cmin = cmax = None
        if gradient_prop is not None:
            cmin = ldf[gradient_prop].min()
            cmax = ldf[gradient_prop].max()

        north_points = []
        if north_stars:
            for chemblid in north_stars.split(","):
                chemblid = chemblid.strip()
                if chemblid in self.chembl_ids:
                    north_points.append(self.chembl_ids.index(chemblid))

        northstar_cluster = []
        for cluster_id in ldf[color_col].unique().values_host:
            query = 'cluster == ' + str(cluster_id)
            cdf = ldf.query(query)

            df_size = cdf['id'].isin(north_points)
            if df_size.unique().shape[0] > 1:
                northstar_cluster.append(str(cluster_id))

            # Compute marker size and shape for north-star vs. normal points.
            df_shape = df_size.copy()
            df_size = (df_size * 18) + 6
            df_shape = df_shape * 2

            if gradient_prop is not None:
                fig.add_trace(go.Scattergl({
                    'x': cdf['x'].to_array(),
                    'y': cdf['y'].to_array(),
                    'text': cdf['chembl_id'].to_array(),
                    'customdata': cdf['id'].to_array(),
                    'name': 'Cluster ' + str(cluster_id),
                    'mode': 'markers',
                    'showlegend': False,
                    'marker': {
                        'size': df_size.to_array(),
                        'symbol': df_shape.to_array(),
                        'color': cdf[gradient_prop].to_array(),
                        'colorscale': 'Viridis',
                        'showscale': True,
                        'cmin': cmin,
                        'cmax': cmax,
                    },
                }))
            else:
                fig.add_trace(go.Scattergl({
                    'x': cdf['x'].to_array(),
                    'y': cdf['y'].to_array(),
                    'text': cdf['chembl_id'].to_array(),
                    'customdata': cdf['id'].to_array(),
                    'name': 'Cluster ' + str(cluster_id),
                    'mode': 'markers',
                    'marker': {
                        'size': df_size.to_array(),
                        'symbol': df_shape.to_array(),
                    },
                }))

        fig.update_layout(
            showlegend=True, clickmode='event', height=main_fig_height,
            title='Clusters', dragmode='select',
            annotations=[
                dict(x=0.5, y=-0.07, showarrow=False, text='x',
                     xref="paper", yref="paper"),
                dict(x=-0.05, y=0.5, showarrow=False, text="y",
                     textangle=-90, xref="paper", yref="paper"),
            ])

        del ldf
        return fig, northstar_cluster

    def start(self, host=None, port=5000):
        return self.app.run_server(
            debug=False, use_reloader=False, host=host, port=port)

    def href_ify(self, chemblid):
        return html.A(
            chemblid,
            href='https://www.ebi.ac.uk/chembl/compound_report_card/' + chemblid,
            target='_blank')

    # TODO: remove self.selected_chembl_id
    def construct_molecule_detail(self, selected_points, display_properties,
                                  page, pageSize=10):
        # Create the table header.
        table_headers = [
            html.Th("Molecular Structure", style={'width': '30%'}),
            html.Th("Chembl"),
            html.Th("smiles"),
        ]
        for prop in display_properties:
            table_headers.append(html.Th(prop))
        table_headers.append(html.Th(""))
        prop_recs = [html.Tr(table_headers)]

        selected_chembl_ids = []
        for point in selected_points['points'][(page - 1) * pageSize:
                                               page * pageSize]:
            selected_chembl_ids.append(point['text'])

        props, selected_molecules = \
            self.fetch_molecule_properties(selected_chembl_ids)
        all_props = []
        for k in props:
            all_props.append({"label": k, "value": k})

        for selected_molecule in selected_molecules:
            td = []
            selected_chembl_id = selected_molecule[0]
            smiles = selected_molecule[props.index('canonical_smiles')]
            mol = selected_molecule[props.index('molfile')]

            m = Chem.MolFromMolBlock(mol)
            drawer = Draw.rdMolDraw2D.MolDraw2DCairo(400, 200)
            drawer.SetFontSize(1.0)
            drawer.DrawMolecule(m)
            drawer.FinishDrawing()

            img_binary = "data:image/png;base64," + \
                base64.b64encode(drawer.GetDrawingText()).decode("utf-8")

            td.append(html.Td(html.Img(src=img_binary)))
            td.append(html.Td(self.href_ify(selected_chembl_id)))
            td.append(html.Td(smiles))
            for key in display_properties:
                td.append(html.Td(selected_molecule[props.index(key)]))
            td.append(html.Td(
                dbc.Button('Add to MoI',
                           id={'role': 'bt_star_candidate',
                               'index': selected_chembl_id},
                           n_clicks=0)))
            prop_recs.append(html.Tr(td))

        return html.Table(prop_recs, style={'width': '100%'}), all_props

    def construct_layout(self):
        fig, northstar_cluster = self.create_graph(self.df)
        return html.Div([
            html.Div(className='row', children=[
                html.Div([dcc.Graph(id='main-figure', figure=fig)],
                         className='nine columns',
                         style={'verticalAlign': 'text-top'}),
                html.Div([
                    html.Div(children=[
                        dcc.Markdown("""
                            **Molecule(s) of Interest (MoI)**

                            Please enter ChEMBL id."""),
                    ]),
                    html.Div(className='row', children=[
                        dcc.Input(id='north_star', type='text', debounce=True),
                        dbc.Button('Highlight', id='bt_north_star', n_clicks=0,
                                   style={'marginLeft': 6}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(id='section_nclusters', children=[
                        html.Label([
                            "Set number of clusters",
                            dcc.Dropdown(id='sl_nclusters', multi=False,
                                         options=[{"label": p, "value": p}
                                                  for p in range(2, 10)],
                                         value=self.n_clusters),
                        ], style={'marginTop': 6})],
                    ),
                    html.Div(children=[
                        dcc.Markdown("""
                            **Cluster Selection**

                            Click a point to select a cluster.""")],
                        style={'marginTop': 18}),
                    html.Div(className='row', children=[
                        dcc.Input(id='selected_clusters', type='text'),
                        dbc.Button('Recluster', id='bt_recluster_clusters',
                                   n_clicks=0, style={'marginLeft': 6}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(children=[
                        dcc.Markdown("""
                            **Selection Points**

                            Choose the lasso or rectangle tool in the graph's
                            menu bar and then select points in the graph."""),
                    ], style={'marginTop': 18}),
                    dbc.Button('Recluster Selection', id='bt_recluster_points',
                               n_clicks=0),
                    html.Div(children=[html.Div(id='selected_point_cnt')]),
                    html.Div(className='row', children=[
                        html.Div(children=[
                            dbc.Button("Close", id="bt_close"),
                            dbc.Modal([
                                dbc.ModalHeader("Close"),
                                dbc.ModalBody(dcc.Markdown("""
                                    Dashboard closed. Please return to the
                                    notebook.
                                """)),
                                dbc.ModalFooter(
                                    dbc.Button("Close", id="bt_close_dash",
                                               className="ml-auto")),
                            ], id="md_export"),
                        ]),
                        html.Div(children=[
                            html.A(dbc.Button('Reload', id='bt_reset'),
                                   href='/')],
                            style={'marginLeft': 18}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(id='section_prop_gradient', children=[
                        html.Label([
                            "Select Molecular Property for color gradient",
                            dcc.Dropdown(id='sl_prop_gradient', multi=False,
                                         options=[{"label": p, "value": p}
                                                  for p in IMP_PROPS]),
                        ], style={'marginTop': 18})],
                    ),
                ], className='three columns',
                    style={'marginLeft': 18, 'marginTop': 90,
                           'verticalAlign': 'text-top'}),
            ]),

            html.Div(id='section_molecule_details', className='row', children=[
                html.Div(className='row', children=[
                    html.Div(id='section_display_properties', children=[
                        html.Label([
                            "Select Molecular Properties",
                            dcc.Dropdown(id='sl_mol_props', multi=True,
                                         options=[{'label': 'alogp',
                                                   'value': 'alogp'}],
                                         value=['alogp']),
                        ], style={'marginLeft': 60})],
                        className='nine columns'),
                    html.Div(children=[
                        dbc.Button("<", id="bt_page_prev",
                                   style={"height": "25px"}),
                        html.Span(children=1, id='current_page',
                                  style={"paddingLeft": "6px"}),
                        html.Span(children=' of 1', id='total_page',
                                  style={"paddingRight": "6px"}),
                        dbc.Button(">", id="bt_page_next",
                                   style={"height": "25px"}),
                    ], className='three columns',
                        style={'paddingRight': 60,
                               'verticalAlign': 'text-bottom',
                               'text-align': 'right'}),
                ]),
                html.Div(className='row', children=[
                    html.Div(id='tb_selected_molecules', children=[],
                             style={'marginLeft': 60,
                                    'verticalAlign': 'text-top'}),
                ]),
            ], style={'display': 'none'}),

            html.Div(id='hidden1', style={'display': 'none'}),
            html.Div(id='northstar_cluster', style={'display': 'none'}),
        ])

    def handle_reset(self, bt_reset_clicks):
        self.df = self.orig_df.copy()

    def handle_molecule_selection(self, mf_selected_data, selected_columns,
                                  sl_prop_gradient, prev_click, next_click,
                                  current_page):
        if not dash.callback_context.triggered or not mf_selected_data:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        module_details = None

        # Pagination support.
        if comp_id == 'bt_page_prev' and event_type == 'n_clicks':
            if current_page == 1:
                raise dash.exceptions.PreventUpdate
            current_page -= 1
        elif comp_id == 'bt_page_next' and event_type == 'n_clicks':
            if len(mf_selected_data['points']) < PAGE_SIZE * (current_page + 1):
                raise dash.exceptions.PreventUpdate
            current_page += 1

        if selected_columns and sl_prop_gradient:
            if sl_prop_gradient not in selected_columns:
                selected_columns.append(sl_prop_gradient)

        module_details, all_props = self.construct_molecule_detail(
            mf_selected_data, selected_columns, current_page,
            pageSize=PAGE_SIZE)

        last_page = ' of ' + str(len(mf_selected_data['points']) // PAGE_SIZE)
        return module_details, all_props, current_page, last_page, \
            {'display': 'block'}

    def handle_data_selection(self, mf_click_data, mf_selected_data,
                              bt_cluster_clicks, bt_point_clicks,
                              northstar_cluster, curr_clusters):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        selected_clusters = ''
        selected_point_cnt = ''

        if comp_id == 'main-figure' and event_type == 'clickData':
            # Event: a cluster was clicked in the main scatter plot.
            if not curr_clusters:
                clusters = []
            else:
                clusters = list(map(int, curr_clusters.split(",")))

            points = mf_click_data['points']
            for point in points:
                cluster = point['curveNumber']
                if cluster in clusters:
                    clusters.remove(cluster)
                else:
                    clusters.append(cluster)
            selected_clusters = ','.join(map(str, clusters))
        elif comp_id == 'main-figure' and event_type == 'selectedData':
            # Event: lasso/rectangle selection in the main scatter plot.
            if not mf_selected_data:
                raise dash.exceptions.PreventUpdate
            points = mf_selected_data['points']
            selected_point_cnt = str(len(points)) + ' points selected'
            clusters = {point['curveNumber'] for point in points}
            # selected_clusters = ','.join(map(str, clusters))
            selected_clusters = northstar_cluster
        elif comp_id == 'northstar_cluster' and event_type == 'children':
            selected_clusters = northstar_cluster
        elif (comp_id == 'bt_recluster_clusters' and event_type == 'n_clicks') \
                or (comp_id == 'bt_recluster_points'
                    and event_type == 'n_clicks'):
            selected_clusters = northstar_cluster
        else:
            raise dash.exceptions.PreventUpdate

        return selected_clusters, selected_point_cnt

    def handle_mark_north_star(self, bt_north_star_click, north_star):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        if event_type != 'n_clicks' or \
                dash.callback_context.triggered[0]['value'] == 0:
            raise dash.exceptions.PreventUpdate

        if not north_star:
            selected_north_star = []
        else:
            selected_north_star = north_star.split(",")

        comp_detail = json.loads(comp_id)
        selected_chembl_id = comp_detail['index']
        if selected_chembl_id not in selected_north_star and \
                selected_chembl_id in self.chembl_ids:
            selected_north_star.append(selected_chembl_id)
        return ','.join(selected_north_star)

    def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks,
                          bt_north_star_clicks, north_star, sl_prop_gradient,
                          sl_nclusters, curr_clusters, mf_selected_data):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        self.n_clusters = sl_nclusters

        if comp_id == 'bt_recluster_clusters' and event_type == 'n_clicks':
            if not curr_clusters:
                figure, northstar_cluster = self.recluster_nofilter(
                    self.df, sl_prop_gradient, north_stars=north_star)
            else:
                clusters = list(map(int, curr_clusters.split(",")))
                figure, northstar_cluster = self.recluster_selected_clusters(
                    self.df, clusters, sl_prop_gradient,
                    north_stars=north_star)
        elif comp_id == 'bt_recluster_points' and event_type == 'n_clicks':
            if not mf_selected_data:
                figure, northstar_cluster = self.recluster_nofilter(
                    self.df, sl_prop_gradient, north_stars=north_star)
            else:
                points = []
                for point in mf_selected_data['points']:
                    points.append(point['customdata'])
                figure, northstar_cluster = self.recluster_selected_points(
                    self.df, points, sl_prop_gradient, north_stars=north_star)
        elif (comp_id == 'bt_north_star' and event_type == 'n_clicks') or \
                (comp_id == 'sl_prop_gradient' and event_type == 'value'):
            figure, northstar_cluster = self.create_graph(
                self.df, gradient_prop=sl_prop_gradient,
                north_stars=north_star)
        elif comp_id == 'north_star' and event_type == 'value':
            north_star = self.update_new_chembl(north_star)
            if north_star:
                figure, northstar_cluster = self.create_graph(
                    self.df, gradient_prop=sl_prop_gradient,
                    north_stars=north_star)
            else:
                raise dash.exceptions.PreventUpdate
        else:
            raise dash.exceptions.PreventUpdate

        return figure, ','.join(northstar_cluster)

    def update_new_chembl(self, north_stars, radius=2, nBits=512):
        north_stars = list(map(str.strip, north_stars.split(',')))
        north_stars = list(map(str.upper, north_stars))
        missing_chembl = set(north_stars).difference(self.chembl_ids)

        # e.g. CHEMBL10307, CHEMBL103071, CHEMBL103072
        if missing_chembl:
            missing_chembl = list(missing_chembl)
            ldf = self.create_dataframe_molecule_properties(missing_chembl)

            if ldf.shape[0] > 0:
                self.prop_df = self.prop_df.append(ldf)
                self.chembl_ids.extend(missing_chembl)

                smiles = []
                for i in range(0, ldf.shape[0]):
                    smiles.append(ldf.iloc[i]['canonical_smiles'].to_array()[0])
                results = list(map(self.MorganFromSmiles, smiles))
                fingerprints = cupy.stack(results).astype(np.float32)
                tdf = self.re_cluster(self.df, fingerprints, missing_chembl)
                if tdf is not None:
                    self.df = tdf
                else:
                    return None

        return ','.join(north_stars)

    def fetch_molecule_properties(self, chemblIDs):
        with closing(sqlite3.connect(CHEMBL_DB)) as con, con, \
                closing(con.cursor()) as cur:
            ids = "','".join(chemblIDs)
            select_stmt = f'''
                SELECT md.chembl_id, cp.*, cs.*
                FROM compound_properties cp,
                     compound_structures cs,
                     molecule_dictionary md
                WHERE cp.molregno = md.molregno
                  AND md.molregno = cs.molregno
                  AND md.chembl_id IN ('{ids}');
            '''
            cur.execute(select_stmt)
            cols = list(map(lambda x: x[0], cur.description))
            return cols, cur.fetchall()

    def create_dataframe_molecule_properties(self, chemblIDs):
        with closing(sqlite3.connect(CHEMBL_DB)) as con, con, \
                closing(con.cursor()) as cur:
            ids = "','".join(chemblIDs)
            select_stmt = f'''
                SELECT md.chembl_id, cp.*, cs.*
                FROM compound_properties cp,
                     molecule_dictionary md,
                     compound_structures cs
                WHERE cp.molregno = md.molregno
                  AND md.molregno = cs.molregno
                  AND md.chembl_id IN ('{ids}');
            '''
            df = cudf.from_pandas(pd.read_sql(select_stmt, con))
            return df.sort_values('chembl_id')
# Featurize, cluster, embed, and launch the dashboard.
# (`MorganFromSmiles`, `np2cudf`, `chemvisualize`, `smiles_list`, and
# `chemblID_list` are defined elsewhere in this project.)
import logging

import dask.bag as db
import numpy as np
from cuml.cluster import KMeans
from cuml.manifold import UMAP

logger = logging.getLogger(__name__)

logger.info('Initializing Morgan fingerprints...')
results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()
np_array_fingerprints = np.stack(results).astype(np.float32)

# np.array of shape (n_mols, nBits) -> GPU DataFrame
gdf = np2cudf(np_array_fingerprints)

# Prepare one set of clusters.
n_clusters = 7
kmeans_float = KMeans(n_clusters=n_clusters)
kmeans_float.fit(gdf)

# UMAP
umap = UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(gdf)
gdf.add_column('x', Xt[0].to_array())
gdf.add_column('y', Xt[1].to_array())
gdf.add_column('cluster', kmeans_float.labels_)

# Start dash.
v = chemvisualize.ChemVisualization(gdf.copy(), n_clusters, chemblID_list)
logger.info('navigate to https://localhost:5000')
v.start('0.0.0.0')
import os
import pickle
from pathlib import Path

import numpy as np
import plotly.graph_objects as go
import torch
from plotly.subplots import make_subplots
from torch.utils.data import DataLoader
from umap import UMAP

# `get_algorithms_name` and `hsv_to_rgb` are project helpers defined elsewhere.


def draw_samples(model, dataset, original_labels, algorithm, log_dir, seed,
                 ind_path, save_dir):
    if isinstance(log_dir, str):
        log_dir = Path(log_dir)
    if isinstance(save_dir, str):
        save_dir = Path(save_dir)
    os.makedirs(save_dir, exist_ok=True)

    algorithm_name = get_algorithms_name(log_dir, [algorithm])[0]
    model_dir = log_dir / algorithm_name / str(seed) / "model0"
    idx_path = model_dir / "selected_idx.pkl"
    selected_idxs = pickle.loads(Path(idx_path).read_bytes())
    pt_path = model_dir / "best_model.pt"
    model.load_state_dict(torch.load(pt_path))

    noise_ind = set(np.load(ind_path))
    selected_idx = set(selected_idxs[-1])
    labels = np.array(original_labels)

    fixed_train_dataloader = DataLoader(dataset, batch_size=256,
                                        shuffle=False, num_workers=8)
    model.eval()
    X = []
    with torch.no_grad():
        for images, _, _ in fixed_train_dataloader:
            if torch.cuda.is_available():
                images = images.to("cuda:0")
            output = model(images)
            X.append(output.cpu().numpy())
    X = np.concatenate(X, axis=0)

    reducer = UMAP(n_components=2, random_state=0)
    embedding = reducer.fit_transform(X)
    emb1 = embedding[:, 0]
    emb2 = embedding[:, 1]

    targets = ["clean", "noise", "select", "nonsel"]
    multi_fig = make_subplots(rows=2, cols=2, vertical_spacing=0.05,
                              horizontal_spacing=0.02)
    min_x, max_x = [], []
    min_y, max_y = [], []
    for n, target in enumerate(targets):
        fig = go.Figure()
        class_min_x, class_max_x = np.inf, -np.inf
        class_min_y, class_max_y = np.inf, -np.inf
        for i in range(len(dataset.classes)):
            class_ind = set(np.where(labels == i)[0])
            if target == "noise":
                target_ind = class_ind & noise_ind
            elif target == "clean":
                target_ind = class_ind - noise_ind
            elif target == "select":
                target_ind = class_ind & selected_idx
            elif target == "nonsel":
                target_ind = class_ind - selected_idx

            xs = emb1[list(target_ind)]
            ys = emb2[list(target_ind)]
            if len(target_ind) > 0:
                class_min_x = min(min(xs), class_min_x)
                class_max_x = max(max(xs), class_max_x)
                class_min_y = min(min(ys), class_min_y)
                class_max_y = max(max(ys), class_max_y)

            if len(dataset.classes) <= 10:
                scatter = go.Scattergl(x=xs, y=ys, mode="markers")
            else:
                color = hsv_to_rgb(360 / len(dataset.classes) * i, 1, 1)
                scatter = go.Scattergl(
                    x=xs, y=ys, mode="markers",
                    marker_color=f"rgb({color[0]},{color[1]},{color[2]})",
                )
            fig.add_trace(scatter)
            multi_fig.add_trace(scatter, row=n // 2 + 1, col=n % 2 + 1)

        min_x.append(class_min_x)
        max_x.append(class_max_x)
        min_y.append(class_min_y)
        max_y.append(class_max_y)

        fig.update_layout(
            autosize=False, width=1200, height=800,
            margin=go.layout.Margin(l=0, r=0, b=0, t=0),
            showlegend=False,
        )
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        # fig.write_html(f"{save_dir}/umap-{target}.html")
        fig.write_image(f"{save_dir}/umap-{target}.pdf")

    multi_fig.update_layout(
        autosize=False, width=3200, height=2400,
        margin=go.layout.Margin(l=0, r=0, b=100, t=0),
        font=dict(family="Arial", size=84),
        showlegend=False,
    )
    multi_fig.update_xaxes(title_text="Clean samples", row=1, col=1)
    multi_fig.update_xaxes(title_text="Incorrectly labeled samples",
                           row=1, col=2)
    multi_fig.update_xaxes(title_text="Selected samples", row=2, col=1)
    multi_fig.update_xaxes(title_text="Non-selected samples", row=2, col=2)
    # Zoom all panels to the intersection of the per-panel data ranges.
    multi_fig.update_xaxes(showticklabels=False,
                           range=[max(min_x), min(max_x)])
    multi_fig.update_yaxes(showticklabels=False,
                           range=[max(min_y), min(max_y)])
    # multi_fig.write_html(f"{save_dir}/umap.html")
    multi_fig.write_image(f"{save_dir}/umap.pdf")
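# Hypothetical invocation (the backbone, dataset, and paths below are
# assumptions for illustration, not from the source). `train_dataset` is
# expected to yield (image, target, index) triples, matching the unpacking
# in the dataloader loop above.
from torchvision.models import resnet18

model = resnet18(num_classes=10)
draw_samples(model, train_dataset, original_labels,
             algorithm='coteaching', log_dir='logs', seed=0,
             ind_path='logs/noise_ind.npy', save_dir='figures/umap')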
def umap(
    adata: AnnData,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_components: int = 2,
    maxiter: Optional[int] = None,
    alpha: float = 1.0,
    gamma: float = 1.0,
    negative_sample_rate: int = 5,
    init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
    random_state: AnyRandom = 0,
    a: Optional[float] = None,
    b: Optional[float] = None,
    copy: bool = False,
    method: Literal['umap', 'rapids'] = 'umap',
    neighbors_key: Optional[str] = None,
) -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold
    learning technique suitable for visualizing high-dimensional data. Besides
    tending to be faster than tSNE, it optimizes the embedding such that it
    best reflects the topology of the data, which we represent throughout
    Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the
    distribution of nearest-neighbor distances in the embedding such that
    these best match the distribution of distances in the high-dimensional
    space. We use the implementation of `umap-learn
    <https://github.com/lmcinnes/umap>`__ [McInnes18]_. For a few comparisons
    of UMAP with tSNE, see this `preprint <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out. The default in the `umap-learn`
        package is 0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called
        `n_epochs` in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number
        generator; if `RandomState` or `Generator`, `random_state` is the
        random number generator; if `None`, the random number generator is the
        `RandomState` instance used by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    copy
        Return a copy instead of writing to `adata`.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental,
        GPU only).
    neighbors_key
        If not specified, umap looks in `.uns['neighbors']` for neighbors
        settings and `.obsp['connectivities']` for connectivities (default
        storage places for `pp.neighbors`). If specified, umap looks in
        `.uns[neighbors_key]` for neighbors settings and
        `.obsp[.uns[neighbors_key]['connectivities_key']]` for connectivities.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'
    if neighbors_key not in adata.uns:
        raise ValueError(
            f'Did not find .uns["{neighbors_key}"]. Run `sc.pp.neighbors` first.'
        )

    start = logg.info('computing UMAP')

    neighbors = NeighborsView(adata, neighbors_key)
    if 'params' not in neighbors or neighbors['params']['method'] != 'umap':
        logg.warning(
            f'.obsp["{neighbors["connectivities_key"]}"] have not been '
            'computed using umap'
        )

    # Compat for umap 0.4 -> 0.5
    with warnings.catch_warnings():
        # umap 0.5.0
        warnings.filterwarnings("ignore", message=r"Tensorflow not installed")
        import umap

    if version.parse(umap.__version__) >= version.parse("0.5.0"):

        def simplicial_set_embedding(*args, **kwargs):
            from umap.umap_ import simplicial_set_embedding

            X_umap, _ = simplicial_set_embedding(
                *args,
                densmap=False,
                densmap_kwds={},
                output_dens=False,
                **kwargs,
            )
            return X_umap

    else:
        from umap.umap_ import simplicial_set_embedding

    from umap.umap_ import find_ab_params

    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    adata.uns['umap'] = {'params': {'a': a, 'b': b}}

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(
            adata, random_state=random_state, neighbors_key=neighbors_key
        )
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32,
                                  accept_sparse=False)

    if random_state != 0:
        adata.uns['umap']['params']['random_state'] = random_state
    random_state = check_random_state(random_state)

    neigh_params = neighbors['params']
    X = _choose_representation(
        adata,
        neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None),
        silent=True,
    )
    if method == 'umap':
        # the data matrix X is really only used for determining the number of
        # connected components for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            neighbors['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP

        n_neighbors = neighbors['params']['n_neighbors']
        n_epochs = (
            500 if maxiter is None else maxiter
        )  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
            random_state=random_state,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
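# The `neighbors_key` path, sketched against scanpy's public API (the key
# name 'n30' is illustrative): store a second neighbors graph under a custom
# key, then embed from that graph instead of the default one.
import scanpy as sc

sc.pp.neighbors(adata, n_neighbors=30, key_added='n30')
sc.tl.umap(adata, neighbors_key='n30')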
import argparse
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from tqdm import tqdm
from umap import UMAP

# `DoesPathExistAndIsFile`, `DoesPathExistAndIsDirectory`,
# `EnsureDirectoryExists`, and `find_files` are project helpers
# defined elsewhere.


def main(args=None):
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(description="UMAP encoding script")
    parser.add_argument(
        '--run-name', dest='run_name', required=True,
        help="Training run directory (for the plot to be placed in the "
             "right logs directory)")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-f', '--file-path', dest='encodings_file_path',
                       help="encodings file path", default=argparse.SUPPRESS)
    group.add_argument(
        '--files-all', dest='files_all', action='store_true',
        help="cluster using encodings of all available training stages",
        default=argparse.SUPPRESS)
    args = parser.parse_args(args)

    assert (('encodings_file_path' in args.__dict__.keys())
            or ('files_all' in args.__dict__.keys()))
    if 'encodings_file_path' in args.__dict__.keys():
        assert DoesPathExistAndIsFile(args.encodings_file_path), \
            f'path {args.encodings_file_path} does not exist or is not a file'

    curr_run_name = args.run_name
    logs_dir = os.path.join(os.path.abspath('./'), 'logs', curr_run_name)
    encodings_dir = os.path.join(logs_dir, 'data_encodings')
    umap_plots_dir = os.path.join(logs_dir, 'umap_plots')
    assert DoesPathExistAndIsDirectory(logs_dir), \
        f'path {logs_dir} does not exist or is not a directory'
    assert DoesPathExistAndIsDirectory(encodings_dir), \
        f'path {encodings_dir} does not exist or is not a directory'
    EnsureDirectoryExists(umap_plots_dir)

    if 'encodings_file_path' in args.__dict__.keys():
        encodings_files_paths = [args.encodings_file_path]
    elif ('files_all' in args.__dict__.keys()) and args.files_all:
        encodings_files_paths = find_files(encodings_dir, '*encoded.npz')
        encodings_files_paths.sort()

    for encodings_file_path in tqdm(encodings_files_paths):
        enc = np.load(encodings_file_path)
        zn = enc['zn']                  # latent encodings, (n_samples, dim)
        zc_logits = enc['zc_logits']    # cluster logits
        labels = enc['labels']          # true labels

        umap = UMAP(n_components=2, verbose=False, n_epochs=4096,
                    learning_rate=0.1)
        umap_enc = umap.fit_transform(zn)

        classes = np.unique(labels)
        colors = cm.tab20(np.linspace(0.0, 1.0, len(classes)))
        colors = dict(zip(classes, colors))

        f = plt.figure(figsize=(6, 6), dpi=300)
        for label in classes:
            plt.scatter(umap_enc[labels == label, 0],
                        umap_enc[labels == label, 1],
                        s=0.5, color=colors[label], label=label)
        lgnd = plt.legend(fontsize=5)
        for hndl in lgnd.legendHandles:
            hndl.set_sizes([20])
        plt.axis('equal')
        plt.tight_layout()

        figname = os.path.join(
            umap_plots_dir,
            'umap-%s.png' %
            os.path.splitext(os.path.basename(encodings_file_path))[0])
        f.savefig(figname, dpi=300, bbox_inches=0, pad_inches=0)
        plt.close()
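# Hypothetical invocations (the module filename is an assumption; the run
# name must match an existing directory under ./logs):
#
#   python plot_umap.py --run-name my_run --files-all
#   python plot_umap.py --run-name my_run -f logs/my_run/data_encodings/epoch10_encoded.npz
#
# or programmatically, since main() accepts an argument list:
if __name__ == '__main__':
    main(['--run-name', 'my_run', '--files-all'])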