# Supervised UMAP embedding colored by a binarized patient risk score.
# (cuML's UMAP is assumed here, given the cuDF-style `.to_pandas()` calls.)
from cuml.manifold import UMAP


def supervised_umap(ax, manifolds, y, cutoff=50):
    x = manifolds[2].reset_index(drop=True)
    label = y['risk_score']
    c = label < cutoff
    # Supervise the embedding with the binary low/high-risk target.
    m = UMAP(target_metric="categorical").fit_transform(x, c)
    m = m.reset_index(drop=True).to_pandas()
    c = c.reset_index(drop=True).to_pandas()
    _ = ax.scatter(m.loc[c].values[:, 0], m.loc[c].values[:, 1],
                   alpha=1, s=3, c='blue',
                   label='Low Patient Risk Score (< {})'.format(cutoff))
    _ = ax.scatter(m.loc[~c].values[:, 0], m.loc[~c].values[:, 1],
                   alpha=1, s=3, c='red',
                   label='High Patient Risk Score (>= {})'.format(cutoff))
    ax.set_title('cutoff = {}%'.format(cutoff))
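# A minimal usage sketch (assumed shapes, not from the source): `manifolds`
# is a list of cuDF feature DataFrames and `y` a cuDF DataFrame with a
# numeric 'risk_score' column on a 0-100 scale.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, cutoff in zip(axes, (25, 50, 75)):
    supervised_umap(ax, manifolds, y, cutoff=cutoff)
axes[0].legend(markerscale=4)
fig.savefig('supervised_umap_cutoffs.png', dpi=150)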
def umap(
        adata, min_dist=0.5, spread=1.0, n_components=2, maxiter=None,
        alpha=1.0, gamma=1.0, negative_sample_rate=5, init_pos='spectral',
        random_state=0, a=None, b=None, copy=False, method='umap'):
    """Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold
    learning technique suitable for visualizing high-dimensional data. Besides
    tending to be faster than tSNE, it optimizes the embedding such that it
    best reflects the topology of the data, which we represent throughout
    Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the
    distribution of nearest-neighbor distances in the embedding such that
    these best match the distribution of distances in the high-dimensional
    space. We use the implementation of `umap-learn
    <https://github.com/lmcinnes/umap>`__ [McInnes18]_. For a few comparisons
    of UMAP with tSNE, see this `preprint <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    min_dist : `float`, optional (default: 0.5)
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out. The default in the `umap-learn`
        package is 0.1.
    spread : `float`, optional (default: 1.0)
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components : `int`, optional (default: 2)
        The number of dimensions of the embedding.
    maxiter : `int`, optional (default: `None`)
        The number of iterations (epochs) of the optimization. Called
        `n_epochs` in the original UMAP.
    alpha : `float`, optional (default: 1.0)
        The initial learning rate for the embedding optimization.
    gamma : `float`, optional (default: 1.0)
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate : `int`, optional (default: 5)
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos : `string` or `np.array`, optional (default: 'spectral')
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state : `int`, `RandomState` or `None`, optional (default: 0)
        If `int`, `random_state` is the seed used by the random number
        generator; if `RandomState`, `random_state` is the random number
        generator; if `None`, the random number generator is the `RandomState`
        instance used by `np.random`.
    a : `float`, optional (default: `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    b : `float`, optional (default: `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    copy : `bool`, optional (default: `False`)
        Return a copy instead of writing to `adata`.
    method : {`'umap'`, `'rapids'`}, optional (default: `'umap'`)
        Use the original 'umap' implementation, or 'rapids' (experimental,
        GPU only).

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata
    if 'neighbors' not in adata.uns:
        raise ValueError(
            "Did not find 'neighbors/connectivities'. Run `sc.pp.neighbors` first."
        )
    start = logg.info('computing UMAP')
    if ('params' not in adata.uns['neighbors']
            or adata.uns['neighbors']['params']['method'] != 'umap'):
        logg.warning(
            'neighbors/connectivities have not been computed using umap')

    from umap.umap_ import find_ab_params, simplicial_set_embedding
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32,
                                  accept_sparse=False)

    random_state = check_random_state(random_state)
    neigh_params = adata.uns['neighbors']['params']
    X = _choose_representation(
        adata, neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None), silent=True)
    if method == 'umap':
        # the data matrix X is really only used for determining the number of
        # connected components for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            adata.uns['neighbors']['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
        # 0 is not a valid value for rapids, unlike original umap
        n_epochs = 500 if maxiter is None else maxiter
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
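# Typical call sequence against scanpy's public API; `pbmc68k_reduced` is a
# small dataset bundled with scanpy, used here purely for illustration.
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
sc.pp.neighbors(adata, n_neighbors=15)  # build the graph that umap() embeds
sc.tl.umap(adata, min_dist=0.5, spread=1.0)
print(adata.obsm['X_umap'].shape)  # (n_obs, 2)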
class ChemVisualization:

    def __init__(self, df, n_clusters, chembl_ids):
        self.app = dash.Dash(__name__,
                             external_stylesheets=external_stylesheets)
        self.df = df
        self.n_clusters = n_clusters
        self.chembl_ids = chembl_ids

        # Fetch relevant properties from the database.
        self.prop_df = self.create_dataframe_molecule_properties(chembl_ids)

        self.df['chembl_id'] = chembl_ids
        self.df['id'] = self.df.index
        self.orig_df = df.copy()

        # Initialize UMAP.
        self.umap = UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)

        # Construct the UI.
        self.app.layout = self.construct_layout()

        # Register callbacks for selection inside the main figure.
        self.app.callback(
            [Output('selected_clusters', 'value'),
             Output('selected_point_cnt', 'children')],
            [Input('main-figure', 'clickData'),
             Input('main-figure', 'selectedData'),
             Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('northstar_cluster', 'children')],
            [State("selected_clusters", "value")])(self.handle_data_selection)

        # Register callbacks for the buttons that recluster selected data.
        self.app.callback(
            [Output('main-figure', 'figure'),
             Output('northstar_cluster', 'children')],
            [Input('bt_recluster_clusters', 'n_clicks'),
             Input('bt_recluster_points', 'n_clicks'),
             Input('bt_north_star', 'n_clicks'),
             Input('north_star', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input('sl_nclusters', 'value')],
            [State("selected_clusters", "value"),
             State("main-figure", "selectedData")])(self.handle_re_cluster)

        # Register callbacks for selection inside the main figure to update
        # molecule details.
        self.app.callback(
            [Output('tb_selected_molecules', 'children'),
             Output('sl_mol_props', 'options'),
             Output("current_page", "children"),
             Output("total_page", "children"),
             Output('section_molecule_details', 'style')],
            [Input('main-figure', 'selectedData'),
             Input('sl_mol_props', 'value'),
             Input('sl_prop_gradient', 'value'),
             Input("bt_page_prev", "n_clicks"),
             Input("bt_page_next", "n_clicks")],
            State("current_page", "children"))(self.handle_molecule_selection)

        self.app.callback(
            Output("hidden1", "children"),
            [Input("bt_reset", "n_clicks")])(self.handle_reset)

        self.app.callback(
            Output('north_star', 'value'),
            [Input({'role': 'bt_star_candidate', 'index': ALL}, 'n_clicks')],
            State('north_star', 'value'))(self.handle_mark_north_star)

    def MorganFromSmiles(self, smiles, radius=2, nBits=512):
        m = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(m, radius=radius,
                                                   nBits=nBits)
        ar = cupy.array(fp)
        return ar

    def re_cluster(self, gdf, new_fingerprints=None, new_chembl_ids=None):
        if gdf.shape[0] == 0:
            return None

        # Before reclustering, remove all columns that may interfere.
        ids = gdf['id']
        chembl_ids = gdf['chembl_id']
        gdf.drop(columns=['x', 'y', 'cluster', 'id', 'chembl_id'],
                 inplace=True)

        if new_fingerprints is not None and new_chembl_ids is not None:
            # Add new fingerprints and ChEMBL ids before reclustering.
            fp_df = cudf.DataFrame(new_fingerprints, columns=gdf.columns)
            gdf = gdf.append(fp_df, ignore_index=True)
            chembl_ids = chembl_ids.append(
                cudf.Series(new_chembl_ids), ignore_index=True)

        kmeans_float = KMeans(n_clusters=self.n_clusters)
        kmeans_float.fit(gdf)
        Xt = self.umap.fit_transform(gdf)

        # Add back the columns required for plotting and for correlating data
        # between reclusterings.
        gdf.add_column('x', Xt[0].to_array())
        gdf.add_column('y', Xt[1].to_array())
        gdf.add_column('id', gdf.index)
        gdf.add_column('chembl_id', chembl_ids)
        gdf.add_column('cluster', kmeans_float.labels_.to_array())
        return gdf

    def recluster_nofilter(self, df, gradient_prop, north_stars=None):
        tdf = self.re_cluster(df)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def recluster_selected_clusters(self, df, values, gradient_prop,
                                    north_stars=None):
        df_clusters = df['cluster'].isin(values)
        filters = df_clusters.values
        tdf = df[filters.get()]
        tdf = self.re_cluster(tdf)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def recluster_selected_points(self, df, values, gradient_prop,
                                  north_stars=None):
        df_clusters = df['id'].isin(values)
        filters = df_clusters.values
        tdf = df[filters.get()]
        tdf = self.re_cluster(tdf)
        if tdf is not None:
            self.df = tdf
        return self.create_graph(self.df, color_col='cluster',
                                 gradient_prop=gradient_prop,
                                 north_stars=north_stars)

    def create_graph(self, df, color_col='cluster', north_stars=None,
                     gradient_prop=None):
        fig = go.Figure(layout={'colorscale': {}})
        ldf = df.merge(self.prop_df, on='chembl_id')

        cmin = cmax = None
        if gradient_prop is not None:
            cmin = ldf[gradient_prop].min()
            cmax = ldf[gradient_prop].max()

        north_points = []
        if north_stars:
            for chemblid in north_stars.split(","):
                chemblid = chemblid.strip()
                if chemblid in self.chembl_ids:
                    north_points.append(self.chembl_ids.index(chemblid))

        northstar_cluster = []
        for cluster_id in ldf[color_col].unique().values_host:
            query = 'cluster == ' + str(cluster_id)
            cdf = ldf.query(query)

            df_size = cdf['id'].isin(north_points)
            if df_size.unique().shape[0] > 1:
                northstar_cluster.append(str(cluster_id))

            # Compute marker size and shape for north-star vs. normal points.
            df_shape = df_size.copy()
            df_size = (df_size * 18) + 6
            df_shape = df_shape * 2

            if gradient_prop is not None:
                fig.add_trace(go.Scattergl({
                    'x': cdf['x'].to_array(),
                    'y': cdf['y'].to_array(),
                    'text': cdf['chembl_id'].to_array(),
                    'customdata': cdf['id'].to_array(),
                    'name': 'Cluster ' + str(cluster_id),
                    'mode': 'markers',
                    'showlegend': False,
                    'marker': {
                        'size': df_size.to_array(),
                        'symbol': df_shape.to_array(),
                        'color': cdf[gradient_prop].to_array(),
                        'colorscale': 'Viridis',
                        'showscale': True,
                        'cmin': cmin,
                        'cmax': cmax,
                    },
                }))
            else:
                fig.add_trace(go.Scattergl({
                    'x': cdf['x'].to_array(),
                    'y': cdf['y'].to_array(),
                    'text': cdf['chembl_id'].to_array(),
                    'customdata': cdf['id'].to_array(),
                    'name': 'Cluster ' + str(cluster_id),
                    'mode': 'markers',
                    'marker': {
                        'size': df_size.to_array(),
                        'symbol': df_shape.to_array(),
                    },
                }))

        fig.update_layout(
            showlegend=True, clickmode='event', height=main_fig_height,
            title='Clusters', dragmode='select',
            annotations=[
                dict(x=0.5, y=-0.07, showarrow=False, text='x',
                     xref="paper", yref="paper"),
                dict(x=-0.05, y=0.5, showarrow=False, text="y",
                     textangle=-90, xref="paper", yref="paper"),
            ])

        del ldf
        return fig, northstar_cluster

    def start(self, host=None, port=5000):
        return self.app.run_server(
            debug=False, use_reloader=False, host=host, port=port)

    def href_ify(self, chemblid):
        return html.A(
            chemblid,
            href='https://www.ebi.ac.uk/chembl/compound_report_card/' + chemblid,
            target='_blank')

    # TODO: remove self.selected_chembl_id
    def construct_molecule_detail(self, selected_points, display_properties,
                                  page, pageSize=10):
        # Create the table header.
        table_headers = [
            html.Th("Molecular Structure", style={'width': '30%'}),
            html.Th("Chembl"),
            html.Th("smiles"),
        ]
        for prop in display_properties:
            table_headers.append(html.Th(prop))
        table_headers.append(html.Th(""))
        prop_recs = [html.Tr(table_headers)]

        selected_chembl_ids = []
        for point in selected_points['points'][(page - 1) * pageSize:
                                               page * pageSize]:
            selected_chembl_ids.append(point['text'])

        props, selected_molecules = \
            self.fetch_molecule_properties(selected_chembl_ids)
        all_props = []
        for k in props:
            all_props.append({"label": k, "value": k})

        for selected_molecule in selected_molecules:
            td = []
            selected_chembl_id = selected_molecule[0]
            smiles = selected_molecule[props.index('canonical_smiles')]
            mol = selected_molecule[props.index('molfile')]

            m = Chem.MolFromMolBlock(mol)
            drawer = Draw.rdMolDraw2D.MolDraw2DCairo(400, 200)
            drawer.SetFontSize(1.0)
            drawer.DrawMolecule(m)
            drawer.FinishDrawing()

            img_binary = "data:image/png;base64," + \
                base64.b64encode(drawer.GetDrawingText()).decode("utf-8")

            td.append(html.Td(html.Img(src=img_binary)))
            td.append(html.Td(self.href_ify(selected_chembl_id)))
            td.append(html.Td(smiles))
            for key in display_properties:
                td.append(html.Td(selected_molecule[props.index(key)]))
            td.append(html.Td(
                dbc.Button('Add to MoI',
                           id={'role': 'bt_star_candidate',
                               'index': selected_chembl_id},
                           n_clicks=0)))
            prop_recs.append(html.Tr(td))

        return html.Table(prop_recs, style={'width': '100%'}), all_props

    def construct_layout(self):
        fig, northstar_cluster = self.create_graph(self.df)
        return html.Div([
            html.Div(className='row', children=[
                html.Div([dcc.Graph(id='main-figure', figure=fig)],
                         className='nine columns',
                         style={'verticalAlign': 'text-top'}),
                html.Div([
                    html.Div(children=[
                        dcc.Markdown("""
                            **Molecule(s) of Interest (MoI)**

                            Please enter ChEMBL id."""),
                    ]),
                    html.Div(className='row', children=[
                        dcc.Input(id='north_star', type='text', debounce=True),
                        dbc.Button('Highlight', id='bt_north_star', n_clicks=0,
                                   style={'marginLeft': 6}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(id='section_nclusters', children=[
                        html.Label([
                            "Set number of clusters",
                            dcc.Dropdown(id='sl_nclusters', multi=False,
                                         options=[{"label": p, "value": p}
                                                  for p in range(2, 10)],
                                         value=self.n_clusters),
                        ], style={'marginTop': 6})],
                    ),
                    html.Div(children=[
                        dcc.Markdown("""
                            **Cluster Selection**

                            Click a point to select a cluster.""")],
                        style={'marginTop': 18}),
                    html.Div(className='row', children=[
                        dcc.Input(id='selected_clusters', type='text'),
                        dbc.Button('Recluster', id='bt_recluster_clusters',
                                   n_clicks=0, style={'marginLeft': 6}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(children=[
                        dcc.Markdown("""
                            **Selection Points**

                            Choose the lasso or rectangle tool in the graph's
                            menu bar and then select points in the graph."""),
                    ], style={'marginTop': 18}),
                    dbc.Button('Recluster Selection', id='bt_recluster_points',
                               n_clicks=0),
                    html.Div(children=[html.Div(id='selected_point_cnt')]),
                    html.Div(className='row', children=[
                        html.Div(children=[
                            dbc.Button("Close", id="bt_close"),
                            dbc.Modal([
                                dbc.ModalHeader("Close"),
                                dbc.ModalBody(dcc.Markdown("""
                                    Dashboard closed. Please return to the
                                    notebook.
                                """)),
                                dbc.ModalFooter(
                                    dbc.Button("Close", id="bt_close_dash",
                                               className="ml-auto")),
                            ], id="md_export"),
                        ]),
                        html.Div(children=[
                            html.A(dbc.Button('Reload', id='bt_reset'),
                                   href='/')],
                            style={'marginLeft': 18}),
                    ], style={'marginLeft': 0, 'marginTop': 18}),
                    html.Div(id='section_prop_gradient', children=[
                        html.Label([
                            "Select Molecular Property for color gradient",
                            dcc.Dropdown(id='sl_prop_gradient', multi=False,
                                         options=[{"label": p, "value": p}
                                                  for p in IMP_PROPS]),
                        ], style={'marginTop': 18})],
                    ),
                ], className='three columns',
                    style={'marginLeft': 18, 'marginTop': 90,
                           'verticalAlign': 'text-top'}),
            ]),

            html.Div(id='section_molecule_details', className='row', children=[
                html.Div(className='row', children=[
                    html.Div(id='section_display_properties', children=[
                        html.Label([
                            "Select Molecular Properties",
                            dcc.Dropdown(id='sl_mol_props', multi=True,
                                         options=[{'label': 'alogp',
                                                   'value': 'alogp'}],
                                         value=['alogp']),
                        ], style={'marginLeft': 60})],
                        className='nine columns'),
                    html.Div(children=[
                        dbc.Button("<", id="bt_page_prev",
                                   style={"height": "25px"}),
                        html.Span(children=1, id='current_page',
                                  style={"paddingLeft": "6px"}),
                        html.Span(children=' of 1', id='total_page',
                                  style={"paddingRight": "6px"}),
                        dbc.Button(">", id="bt_page_next",
                                   style={"height": "25px"}),
                    ], className='three columns',
                        style={'paddingRight': 60,
                               'verticalAlign': 'text-bottom',
                               'text-align': 'right'}),
                ]),
                html.Div(className='row', children=[
                    html.Div(id='tb_selected_molecules', children=[],
                             style={'marginLeft': 60,
                                    'verticalAlign': 'text-top'}),
                ]),
            ], style={'display': 'none'}),

            html.Div(id='hidden1', style={'display': 'none'}),
            html.Div(id='northstar_cluster', style={'display': 'none'}),
        ])

    def handle_reset(self, bt_reset_clicks):
        self.df = self.orig_df.copy()

    def handle_molecule_selection(self, mf_selected_data, selected_columns,
                                  sl_prop_gradient, prev_click, next_click,
                                  current_page):
        if not dash.callback_context.triggered or not mf_selected_data:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        module_details = None

        # Pagination support.
        if comp_id == 'bt_page_prev' and event_type == 'n_clicks':
            if current_page == 1:
                raise dash.exceptions.PreventUpdate
            current_page -= 1
        elif comp_id == 'bt_page_next' and event_type == 'n_clicks':
            if len(mf_selected_data['points']) < PAGE_SIZE * (current_page + 1):
                raise dash.exceptions.PreventUpdate
            current_page += 1

        if selected_columns and sl_prop_gradient:
            if sl_prop_gradient not in selected_columns:
                selected_columns.append(sl_prop_gradient)

        module_details, all_props = self.construct_molecule_detail(
            mf_selected_data, selected_columns, current_page,
            pageSize=PAGE_SIZE)

        last_page = ' of ' + str(len(mf_selected_data['points']) // PAGE_SIZE)
        return module_details, all_props, current_page, last_page, \
            {'display': 'block'}

    def handle_data_selection(self, mf_click_data, mf_selected_data,
                              bt_cluster_clicks, bt_point_clicks,
                              northstar_cluster, curr_clusters):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        selected_clusters = ''
        selected_point_cnt = ''

        if comp_id == 'main-figure' and event_type == 'clickData':
            # Event: a cluster was clicked in the main scatter plot.
            if not curr_clusters:
                clusters = []
            else:
                clusters = list(map(int, curr_clusters.split(",")))

            points = mf_click_data['points']
            for point in points:
                cluster = point['curveNumber']
                if cluster in clusters:
                    clusters.remove(cluster)
                else:
                    clusters.append(cluster)
            selected_clusters = ','.join(map(str, clusters))
        elif comp_id == 'main-figure' and event_type == 'selectedData':
            # Event: lasso/rectangle selection in the main scatter plot.
            if not mf_selected_data:
                raise dash.exceptions.PreventUpdate
            points = mf_selected_data['points']
            selected_point_cnt = str(len(points)) + ' points selected'
            clusters = {point['curveNumber'] for point in points}
            # selected_clusters = ','.join(map(str, clusters))
            selected_clusters = northstar_cluster
        elif comp_id == 'northstar_cluster' and event_type == 'children':
            selected_clusters = northstar_cluster
        elif (comp_id == 'bt_recluster_clusters' and event_type == 'n_clicks') \
                or (comp_id == 'bt_recluster_points'
                    and event_type == 'n_clicks'):
            selected_clusters = northstar_cluster
        else:
            raise dash.exceptions.PreventUpdate

        return selected_clusters, selected_point_cnt

    def handle_mark_north_star(self, bt_north_star_click, north_star):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        if event_type != 'n_clicks' or \
                dash.callback_context.triggered[0]['value'] == 0:
            raise dash.exceptions.PreventUpdate

        if not north_star:
            selected_north_star = []
        else:
            selected_north_star = north_star.split(",")

        comp_detail = json.loads(comp_id)
        selected_chembl_id = comp_detail['index']
        if selected_chembl_id not in selected_north_star and \
                selected_chembl_id in self.chembl_ids:
            selected_north_star.append(selected_chembl_id)
        return ','.join(selected_north_star)

    def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks,
                          bt_north_star_clicks, north_star, sl_prop_gradient,
                          sl_nclusters, curr_clusters, mf_selected_data):
        if not dash.callback_context.triggered:
            raise dash.exceptions.PreventUpdate

        comp_id, event_type = \
            dash.callback_context.triggered[0]['prop_id'].split('.')
        self.n_clusters = sl_nclusters

        if comp_id == 'bt_recluster_clusters' and event_type == 'n_clicks':
            if not curr_clusters:
                figure, northstar_cluster = self.recluster_nofilter(
                    self.df, sl_prop_gradient, north_stars=north_star)
            else:
                clusters = list(map(int, curr_clusters.split(",")))
                figure, northstar_cluster = self.recluster_selected_clusters(
                    self.df, clusters, sl_prop_gradient,
                    north_stars=north_star)
        elif comp_id == 'bt_recluster_points' and event_type == 'n_clicks':
            if not mf_selected_data:
                figure, northstar_cluster = self.recluster_nofilter(
                    self.df, sl_prop_gradient, north_stars=north_star)
            else:
                points = []
                for point in mf_selected_data['points']:
                    points.append(point['customdata'])
                figure, northstar_cluster = self.recluster_selected_points(
                    self.df, points, sl_prop_gradient, north_stars=north_star)
        elif (comp_id == 'bt_north_star' and event_type == 'n_clicks') or \
                (comp_id == 'sl_prop_gradient' and event_type == 'value'):
            figure, northstar_cluster = self.create_graph(
                self.df, gradient_prop=sl_prop_gradient,
                north_stars=north_star)
        elif comp_id == 'north_star' and event_type == 'value':
            north_star = self.update_new_chembl(north_star)
            if north_star:
                figure, northstar_cluster = self.create_graph(
                    self.df, gradient_prop=sl_prop_gradient,
                    north_stars=north_star)
            else:
                raise dash.exceptions.PreventUpdate
        else:
            raise dash.exceptions.PreventUpdate

        return figure, ','.join(northstar_cluster)

    def update_new_chembl(self, north_stars, radius=2, nBits=512):
        north_stars = list(map(str.strip, north_stars.split(',')))
        north_stars = list(map(str.upper, north_stars))
        missing_chembl = set(north_stars).difference(self.chembl_ids)

        # e.g. CHEMBL10307, CHEMBL103071, CHEMBL103072
        if missing_chembl:
            missing_chembl = list(missing_chembl)
            ldf = self.create_dataframe_molecule_properties(missing_chembl)

            if ldf.shape[0] > 0:
                self.prop_df = self.prop_df.append(ldf)
                self.chembl_ids.extend(missing_chembl)

                smiles = []
                for i in range(0, ldf.shape[0]):
                    smiles.append(ldf.iloc[i]['canonical_smiles'].to_array()[0])
                results = list(map(self.MorganFromSmiles, smiles))
                fingerprints = cupy.stack(results).astype(np.float32)
                tdf = self.re_cluster(self.df, fingerprints, missing_chembl)
                if tdf is not None:
                    self.df = tdf
                else:
                    return None

        return ','.join(north_stars)

    def fetch_molecule_properties(self, chemblIDs):
        with closing(sqlite3.connect(CHEMBL_DB)) as con, con, \
                closing(con.cursor()) as cur:
            ids = "','".join(chemblIDs)
            select_stmt = f'''
                SELECT md.chembl_id, cp.*, cs.*
                FROM compound_properties cp,
                     compound_structures cs,
                     molecule_dictionary md
                WHERE cp.molregno = md.molregno
                  AND md.molregno = cs.molregno
                  AND md.chembl_id IN ('{ids}');
            '''
            cur.execute(select_stmt)
            cols = list(map(lambda x: x[0], cur.description))
            return cols, cur.fetchall()

    def create_dataframe_molecule_properties(self, chemblIDs):
        with closing(sqlite3.connect(CHEMBL_DB)) as con, con, \
                closing(con.cursor()) as cur:
            ids = "','".join(chemblIDs)
            select_stmt = f'''
                SELECT md.chembl_id, cp.*, cs.*
                FROM compound_properties cp,
                     molecule_dictionary md,
                     compound_structures cs
                WHERE cp.molregno = md.molregno
                  AND md.molregno = cs.molregno
                  AND md.chembl_id IN ('{ids}');
            '''
            df = cudf.from_pandas(pd.read_sql(select_stmt, con))
            return df.sort_values('chembl_id')
# Featurize, cluster, embed, and launch the dashboard.
# (`MorganFromSmiles`, `np2cudf`, `chemvisualize`, `smiles_list`, and
# `chemblID_list` are defined elsewhere in this project.)
import logging

import dask.bag as db
import numpy as np
from cuml.cluster import KMeans
from cuml.manifold import UMAP

logger = logging.getLogger(__name__)

logger.info('Initializing Morgan fingerprints...')
results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()
np_array_fingerprints = np.stack(results).astype(np.float32)

# np.array of shape (n_mols, nBits) -> GPU DataFrame
gdf = np2cudf(np_array_fingerprints)

# Prepare one set of clusters.
n_clusters = 7
kmeans_float = KMeans(n_clusters=n_clusters)
kmeans_float.fit(gdf)

# UMAP
umap = UMAP(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(gdf)
gdf.add_column('x', Xt[0].to_array())
gdf.add_column('y', Xt[1].to_array())
gdf.add_column('cluster', kmeans_float.labels_)

# Start dash.
v = chemvisualize.ChemVisualization(gdf.copy(), n_clusters, chemblID_list)
logger.info('navigate to https://localhost:5000')
v.start('0.0.0.0')
import os
import pickle
from pathlib import Path

import numpy as np
import plotly.graph_objects as go
import torch
from plotly.subplots import make_subplots
from torch.utils.data import DataLoader
from umap import UMAP

# `get_algorithms_name` and `hsv_to_rgb` are project helpers defined elsewhere.


def draw_samples(model, dataset, original_labels, algorithm, log_dir, seed,
                 ind_path, save_dir):
    if isinstance(log_dir, str):
        log_dir = Path(log_dir)
    if isinstance(save_dir, str):
        save_dir = Path(save_dir)
    os.makedirs(save_dir, exist_ok=True)

    algorithm_name = get_algorithms_name(log_dir, [algorithm])[0]
    model_dir = log_dir / algorithm_name / str(seed) / "model0"
    idx_path = model_dir / "selected_idx.pkl"
    selected_idxs = pickle.loads(Path(idx_path).read_bytes())
    pt_path = model_dir / "best_model.pt"
    model.load_state_dict(torch.load(pt_path))

    noise_ind = set(np.load(ind_path))
    selected_idx = set(selected_idxs[-1])
    labels = np.array(original_labels)

    fixed_train_dataloader = DataLoader(dataset, batch_size=256,
                                        shuffle=False, num_workers=8)
    model.eval()
    X = []
    with torch.no_grad():
        for images, _, _ in fixed_train_dataloader:
            if torch.cuda.is_available():
                images = images.to("cuda:0")
            output = model(images)
            X.append(output.cpu().numpy())
    X = np.concatenate(X, axis=0)

    reducer = UMAP(n_components=2, random_state=0)
    embedding = reducer.fit_transform(X)
    emb1 = embedding[:, 0]
    emb2 = embedding[:, 1]

    targets = ["clean", "noise", "select", "nonsel"]
    multi_fig = make_subplots(rows=2, cols=2, vertical_spacing=0.05,
                              horizontal_spacing=0.02)
    min_x, max_x = [], []
    min_y, max_y = [], []
    for n, target in enumerate(targets):
        fig = go.Figure()
        class_min_x, class_max_x = np.inf, -np.inf
        class_min_y, class_max_y = np.inf, -np.inf
        for i in range(len(dataset.classes)):
            class_ind = set(np.where(labels == i)[0])
            if target == "noise":
                target_ind = class_ind & noise_ind
            elif target == "clean":
                target_ind = class_ind - noise_ind
            elif target == "select":
                target_ind = class_ind & selected_idx
            elif target == "nonsel":
                target_ind = class_ind - selected_idx

            xs = emb1[list(target_ind)]
            ys = emb2[list(target_ind)]
            if len(target_ind) > 0:
                class_min_x = min(min(xs), class_min_x)
                class_max_x = max(max(xs), class_max_x)
                class_min_y = min(min(ys), class_min_y)
                class_max_y = max(max(ys), class_max_y)

            if len(dataset.classes) <= 10:
                scatter = go.Scattergl(x=xs, y=ys, mode="markers")
            else:
                color = hsv_to_rgb(360 / len(dataset.classes) * i, 1, 1)
                scatter = go.Scattergl(
                    x=xs, y=ys, mode="markers",
                    marker_color=f"rgb({color[0]},{color[1]},{color[2]})",
                )
            fig.add_trace(scatter)
            multi_fig.add_trace(scatter, row=n // 2 + 1, col=n % 2 + 1)

        min_x.append(class_min_x)
        max_x.append(class_max_x)
        min_y.append(class_min_y)
        max_y.append(class_max_y)

        fig.update_layout(
            autosize=False, width=1200, height=800,
            margin=go.layout.Margin(l=0, r=0, b=0, t=0),
            showlegend=False,
        )
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        # fig.write_html(f"{save_dir}/umap-{target}.html")
        fig.write_image(f"{save_dir}/umap-{target}.pdf")

    multi_fig.update_layout(
        autosize=False, width=3200, height=2400,
        margin=go.layout.Margin(l=0, r=0, b=100, t=0),
        font=dict(family="Arial", size=84),
        showlegend=False,
    )
    multi_fig.update_xaxes(title_text="Clean samples", row=1, col=1)
    multi_fig.update_xaxes(title_text="Incorrectly labeled samples",
                           row=1, col=2)
    multi_fig.update_xaxes(title_text="Selected samples", row=2, col=1)
    multi_fig.update_xaxes(title_text="Non-selected samples", row=2, col=2)
    # Zoom all panels to the intersection of the per-panel data ranges.
    multi_fig.update_xaxes(showticklabels=False,
                           range=[max(min_x), min(max_x)])
    multi_fig.update_yaxes(showticklabels=False,
                           range=[max(min_y), min(max_y)])
    # multi_fig.write_html(f"{save_dir}/umap.html")
    multi_fig.write_image(f"{save_dir}/umap.pdf")
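# Hypothetical invocation (the backbone, dataset, and paths below are
# assumptions for illustration, not from the source). `train_dataset` is
# expected to yield (image, target, index) triples, matching the unpacking
# in the dataloader loop above.
from torchvision.models import resnet18

model = resnet18(num_classes=10)
draw_samples(model, train_dataset, original_labels,
             algorithm='coteaching', log_dir='logs', seed=0,
             ind_path='logs/noise_ind.npy', save_dir='figures/umap')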
def umap(
    adata: AnnData,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_components: int = 2,
    maxiter: Optional[int] = None,
    alpha: float = 1.0,
    gamma: float = 1.0,
    negative_sample_rate: int = 5,
    init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
    random_state: AnyRandom = 0,
    a: Optional[float] = None,
    b: Optional[float] = None,
    copy: bool = False,
    method: Literal['umap', 'rapids'] = 'umap',
    neighbors_key: Optional[str] = None,
) -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold
    learning technique suitable for visualizing high-dimensional data. Besides
    tending to be faster than tSNE, it optimizes the embedding such that it
    best reflects the topology of the data, which we represent throughout
    Scanpy using a neighborhood graph. tSNE, by contrast, optimizes the
    distribution of nearest-neighbor distances in the embedding such that
    these best match the distribution of distances in the high-dimensional
    space. We use the implementation of `umap-learn
    <https://github.com/lmcinnes/umap>`__ [McInnes18]_. For a few comparisons
    of UMAP with tSNE, see this `preprint <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result in a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out. The default in the `umap-learn`
        package is 0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called
        `n_epochs` in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number
        generator; if `RandomState` or `Generator`, `random_state` is the
        random number generator; if `None`, the random number generator is the
        `RandomState` instance used by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and `spread`.
    copy
        Return a copy instead of writing to `adata`.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental,
        GPU only).
    neighbors_key
        If not specified, umap looks in `.uns['neighbors']` for neighbors
        settings and `.obsp['connectivities']` for connectivities (default
        storage places for `pp.neighbors`). If specified, umap looks in
        `.uns[neighbors_key]` for neighbors settings and
        `.obsp[.uns[neighbors_key]['connectivities_key']]` for connectivities.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.
    """
    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'
    if neighbors_key not in adata.uns:
        raise ValueError(
            f'Did not find .uns["{neighbors_key}"]. Run `sc.pp.neighbors` first.'
        )

    start = logg.info('computing UMAP')

    neighbors = NeighborsView(adata, neighbors_key)
    if 'params' not in neighbors or neighbors['params']['method'] != 'umap':
        logg.warning(
            f'.obsp["{neighbors["connectivities_key"]}"] have not been '
            'computed using umap'
        )

    # Compat for umap 0.4 -> 0.5
    with warnings.catch_warnings():
        # umap 0.5.0
        warnings.filterwarnings("ignore", message=r"Tensorflow not installed")
        import umap

    if version.parse(umap.__version__) >= version.parse("0.5.0"):

        def simplicial_set_embedding(*args, **kwargs):
            from umap.umap_ import simplicial_set_embedding

            X_umap, _ = simplicial_set_embedding(
                *args,
                densmap=False,
                densmap_kwds={},
                output_dens=False,
                **kwargs,
            )
            return X_umap

    else:
        from umap.umap_ import simplicial_set_embedding

    from umap.umap_ import find_ab_params

    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    adata.uns['umap'] = {'params': {'a': a, 'b': b}}

    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(
            adata, random_state=random_state, neighbors_key=neighbors_key
        )
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32,
                                  accept_sparse=False)

    if random_state != 0:
        adata.uns['umap']['params']['random_state'] = random_state
    random_state = check_random_state(random_state)

    neigh_params = neighbors['params']
    X = _choose_representation(
        adata,
        neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None),
        silent=True,
    )
    if method == 'umap':
        # the data matrix X is really only used for determining the number of
        # connected components for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            neighbors['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    elif method == 'rapids':
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP

        n_neighbors = neighbors['params']['n_neighbors']
        n_epochs = (
            500 if maxiter is None else maxiter
        )  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
            random_state=random_state,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=("added\n    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None
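# The `neighbors_key` path, sketched against scanpy's public API (the key
# name 'n30' is illustrative): store a second neighbors graph under a custom
# key, then embed from that graph instead of the default one.
import scanpy as sc

sc.pp.neighbors(adata, n_neighbors=30, key_added='n30')
sc.tl.umap(adata, neighbors_key='n30')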
import argparse
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from tqdm import tqdm
from umap import UMAP

# `DoesPathExistAndIsFile`, `DoesPathExistAndIsDirectory`,
# `EnsureDirectoryExists`, and `find_files` are project helpers
# defined elsewhere.


def main(args=None):
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(description="UMAP encoding script")
    parser.add_argument(
        '--run-name', dest='run_name', required=True,
        help="Training run directory (for the plot to be placed in the "
             "right logs directory)")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-f', '--file-path', dest='encodings_file_path',
                       help="encodings file path", default=argparse.SUPPRESS)
    group.add_argument(
        '--files-all', dest='files_all', action='store_true',
        help="cluster using encodings of all available training stages",
        default=argparse.SUPPRESS)
    args = parser.parse_args(args)

    assert (('encodings_file_path' in args.__dict__.keys())
            or ('files_all' in args.__dict__.keys()))
    if 'encodings_file_path' in args.__dict__.keys():
        assert DoesPathExistAndIsFile(args.encodings_file_path), \
            f'path {args.encodings_file_path} does not exist or is not a file'

    curr_run_name = args.run_name
    logs_dir = os.path.join(os.path.abspath('./'), 'logs', curr_run_name)
    encodings_dir = os.path.join(logs_dir, 'data_encodings')
    umap_plots_dir = os.path.join(logs_dir, 'umap_plots')
    assert DoesPathExistAndIsDirectory(logs_dir), \
        f'path {logs_dir} does not exist or is not a directory'
    assert DoesPathExistAndIsDirectory(encodings_dir), \
        f'path {encodings_dir} does not exist or is not a directory'
    EnsureDirectoryExists(umap_plots_dir)

    if 'encodings_file_path' in args.__dict__.keys():
        encodings_files_paths = [args.encodings_file_path]
    elif ('files_all' in args.__dict__.keys()) and args.files_all:
        encodings_files_paths = find_files(encodings_dir, '*encoded.npz')
        encodings_files_paths.sort()

    for encodings_file_path in tqdm(encodings_files_paths):
        enc = np.load(encodings_file_path)
        zn = enc['zn']                  # latent encodings, (n_samples, dim)
        zc_logits = enc['zc_logits']    # cluster logits
        labels = enc['labels']          # true labels

        umap = UMAP(n_components=2, verbose=False, n_epochs=4096,
                    learning_rate=0.1)
        umap_enc = umap.fit_transform(zn)

        classes = np.unique(labels)
        colors = cm.tab20(np.linspace(0.0, 1.0, len(classes)))
        colors = dict(zip(classes, colors))

        f = plt.figure(figsize=(6, 6), dpi=300)
        for label in classes:
            plt.scatter(umap_enc[labels == label, 0],
                        umap_enc[labels == label, 1],
                        s=0.5, color=colors[label], label=label)
        lgnd = plt.legend(fontsize=5)
        for hndl in lgnd.legendHandles:
            hndl.set_sizes([20])
        plt.axis('equal')
        plt.tight_layout()

        figname = os.path.join(
            umap_plots_dir,
            'umap-%s.png' %
            os.path.splitext(os.path.basename(encodings_file_path))[0])
        f.savefig(figname, dpi=300, bbox_inches=0, pad_inches=0)
        plt.close()
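# Hypothetical invocations (the module filename is an assumption; the run
# name must match an existing directory under ./logs):
#
#   python plot_umap.py --run-name my_run --files-all
#   python plot_umap.py --run-name my_run -f logs/my_run/data_encodings/epoch10_encoded.npz
#
# or programmatically, since main() accepts an argument list:
if __name__ == '__main__':
    main(['--run-name', 'my_run', '--files-all'])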