Python UMAP.UMAP примеры использования

Язык программирования: Python

Пространство имен/Пакет: cuml

Класс/Тип: UMAP

Метод/Функция: UMAP

Примеров на hotexamples.com: 7

Python UMAP.UMAP - 7 примеров найдено. Это лучшие примеры Python кода для cuml.UMAP.UMAP, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

UMAP(7)

fit_transform(5)

reset_index(1)

Основные методы

UMAP (7)

fit_transform (5)

reset_index (1)

Пример #1

Показать файл

def supervised_umap(ax, manifolds, y, cutoff=50):
    """Scatter-plot a supervised UMAP embedding split by a risk-score cutoff.

    ``manifolds[2]`` is embedded with UMAP, using the boolean flag
    "risk score below ``cutoff``" as a categorical target. Rows below the
    cutoff are drawn in blue and the remaining rows in red on ``ax``.

    Parameters
    ----------
    ax
        Object providing ``scatter`` and ``set_title``.
    manifolds
        Indexable collection; only ``manifolds[2]`` is used.
    y
        Container with a ``'risk_score'`` entry that supports ``<``.
    cutoff
        Threshold applied to the risk score (default 50).
    """
    features = manifolds[2].reset_index(drop=True)
    is_low_risk = y['risk_score'] < cutoff

    reducer = UMAP(target_metric="categorical")
    embedding = reducer.fit_transform(features, is_low_risk)
    # assumes the embedding and mask are GPU-backed frames exposing
    # to_pandas() -- TODO confirm against the caller
    embedding = embedding.reset_index(drop=True).to_pandas()
    is_low_risk = is_low_risk.reset_index(drop=True).to_pandas()

    # Select each subset once; columns 0 and 1 are the plotted coordinates.
    low_xy = embedding.loc[is_low_risk].values
    high_xy = embedding.loc[~is_low_risk].values

    low_label = 'Low Patient Risk Score (< {})'.format(cutoff)
    high_label = 'High Patient Risk Score (> {})'.format(cutoff)
    ax.scatter(low_xy[:, 0], low_xy[:, 1],
               alpha=1, s=3, c='blue', label=low_label)
    ax.scatter(high_xy[:, 0], high_xy[:, 1],
               alpha=1, s=3, c='red', label=high_label)
    ax.set_title('cutoff = {}%'.format(cutoff))

Пример #2

Показать файл

def umap(adata,
         min_dist=0.5,
         spread=1.0,
         n_components=2,
         maxiter=None,
         alpha=1.0,
         gamma=1.0,
         negative_sample_rate=5,
         init_pos='spectral',
         random_state=0,
         a=None,
         b=None,
         copy=False,
         method='umap'):
    """Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending to
    be faster than tSNE, it optimizes the embedding such that it best reflects
    the topology of the data, which we represent throughout Scanpy using a
    neighborhood graph. tSNE, by contrast, optimizes the distribution of
    nearest-neighbor distances in the embedding such that these best match the
    distribution of distances in the high-dimensional space.  We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    min_dist : `float`, optional (default: 0.5)
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points on
        the manifold are drawn closer together, while larger values will result
        in a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded
        points will be spread out. The default in the `umap-learn` package is
        0.1.
    spread : `float` (optional, default 1.0)
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components : `int`, optional (default: 2)
        The number of dimensions of the embedding.
    maxiter : `int`, optional (default: `None`)
        The number of iterations (epochs) of the optimization. Called `n_epochs`
        in the original UMAP.
    alpha : `float`, optional (default: 1.0)
        The initial learning rate for the embedding optimization.
    gamma : `float` (optional, default 1.0)
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate : `int` (optional, default 5)
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos : `string` or `np.array`, optional (default: 'spectral')
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP.
        Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state : `int`, `RandomState` or `None`, optional (default: 0)
        If `int`, `random_state` is the seed used by the random number generator;
        If `RandomState`, `random_state` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    a : `float` (optional, default `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    b : `float` (optional, default `None`)
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    copy : `bool` (default: `False`)
        Return a copy instead of writing to adata.
    method : {`'umap'`, `'rapids'`}  (default: `'umap'`)
        Use the original 'umap' implementation, or 'rapids' (experimental, GPU only)

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.

    Raises
    ------
    ValueError
        If `method` is neither `'umap'` nor `'rapids'`, if `adata.uns` has no
        `'neighbors'` entry, or if `method` is `'rapids'` and the neighbors
        were computed with a non-euclidean metric.
    """
    # Reject an unknown backend before doing any work; otherwise neither
    # branch below would run and `X_umap` would be unbound at the end.
    if method not in ('umap', 'rapids'):
        raise ValueError(
            f"`method` must be 'umap' or 'rapids', but got {method!r}."
        )

    adata = adata.copy() if copy else adata
    if 'neighbors' not in adata.uns:
        raise ValueError(
            'Did not find \'neighbors/connectivities\'. Run `sc.pp.neighbors` first.'
        )
    start = logg.info('computing UMAP')
    if ('params' not in adata.uns['neighbors']
            or adata.uns['neighbors']['params']['method'] != 'umap'):
        logg.warning(
            'neighbors/connectivities have not been computed using umap')
    from umap.umap_ import find_ab_params, simplicial_set_embedding
    # Derive the curve parameters only when the caller did not supply both.
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)

    # Resolve the initial positions: an `adata.obsm` key wins, then 'paga',
    # otherwise the value is forwarded untouched.
    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(adata, random_state=random_state)
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords,
                                  dtype=np.float32,
                                  accept_sparse=False)

    random_state = check_random_state(random_state)
    neigh_params = adata.uns['neighbors']['params']
    X = _choose_representation(adata,
                               neigh_params.get('use_rep', None),
                               neigh_params.get('n_pcs', None),
                               silent=True)
    if method == 'umap':
        # the data matrix X is really only used for determining the number of connected components
        # for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            adata.uns['neighbors']['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    else:  # method == 'rapids' (validated above)
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP
        n_neighbors = adata.uns['neighbors']['params']['n_neighbors']
        n_epochs = 500 if maxiter is None else maxiter  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        # NOTE(review): the raw `init_pos` is forwarded here, not the resolved
        # `init_coords` -- confirm that is intended for obsm keys / 'paga'.
        umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
        )
        X_umap = umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              "    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None

Пример #3

Показать файл

    def __init__(self, df, n_clusters, chembl_ids):
        """Create the Dash app, store the data, build the UI and wire callbacks.

        Parameters
        ----------
        df
            Data frame to visualise. It is modified in place: ``chembl_id``
            and ``id`` columns are added before a copy is kept as
            ``self.orig_df``.
        n_clusters
            Cluster count, stored on the instance.
        chembl_ids
            Identifiers written to the ``chembl_id`` column and used to fetch
            the molecule-property frame.
        """
        self.app = dash.Dash(
            __name__, external_stylesheets=external_stylesheets)
        self.df = df
        self.n_clusters = n_clusters
        self.chembl_ids = chembl_ids

        # Fetch the relevant properties from the database.
        self.prop_df = self.create_dataframe_molecule_properties(chembl_ids)

        self.df['chembl_id'] = chembl_ids
        self.df['id'] = self.df.index
        # Snapshot taken after the two helper columns were added.
        self.orig_df = df.copy()

        # UMAP model with fixed hyper-parameters.
        umap_options = dict(n_neighbors=100, a=1.0, b=1.0, learning_rate=1.0)
        self.umap = UMAP(**umap_options)

        # Build the page layout.
        self.app.layout = self.constuct_layout()

        # Selection inside the main figure.
        selection_outputs = [
            Output('selected_clusters', 'value'),
            Output('selected_point_cnt', 'children'),
        ]
        selection_inputs = [
            Input('main-figure', 'clickData'),
            Input('main-figure', 'selectedData'),
            Input('bt_recluster_clusters', 'n_clicks'),
            Input('bt_recluster_points', 'n_clicks'),
            Input('northstar_cluster', 'children'),
        ]
        selection_states = [State("selected_clusters", "value")]
        self.app.callback(
            selection_outputs, selection_inputs, selection_states,
        )(self.handle_data_selection)

        # Buttons and controls that trigger re-clustering of selected data.
        recluster_outputs = [
            Output('main-figure', 'figure'),
            Output('northstar_cluster', 'children'),
        ]
        recluster_inputs = [
            Input('bt_recluster_clusters', 'n_clicks'),
            Input('bt_recluster_points', 'n_clicks'),
            Input('bt_north_star', 'n_clicks'),
            Input('north_star', 'value'),
            Input('sl_prop_gradient', 'value'),
            Input('sl_nclusters', 'value'),
        ]
        recluster_states = [
            State("selected_clusters", "value"),
            State("main-figure", "selectedData"),
        ]
        self.app.callback(
            recluster_outputs, recluster_inputs, recluster_states,
        )(self.handle_re_cluster)

        # Selection inside the main figure also refreshes the molecule details.
        details_outputs = [
            Output('tb_selected_molecules', 'children'),
            Output('sl_mol_props', 'options'),
            Output("current_page", "children"),
            Output("total_page", "children"),
            Output('section_molecule_details', 'style'),
        ]
        details_inputs = [
            Input('main-figure', 'selectedData'),
            Input('sl_mol_props', 'value'),
            Input('sl_prop_gradient', 'value'),
            Input("bt_page_prev", "n_clicks"),
            Input("bt_page_next", "n_clicks"),
        ]
        # This State is passed bare (not wrapped in a list), as before.
        details_state = State("current_page", "children")
        self.app.callback(
            details_outputs, details_inputs, details_state,
        )(self.handle_molecule_selection)

        # Reset button.
        reset_output = Output("hidden1", "children")
        reset_inputs = [Input("bt_reset", "n_clicks")]
        self.app.callback(reset_output, reset_inputs)(self.handle_reset)

        # Pattern-matched "star candidate" buttons update the north-star value.
        star_output = Output('north_star', 'value')
        star_inputs = [
            Input({'role': 'bt_star_candidate', 'index': ALL}, 'n_clicks'),
        ]
        star_state = State('north_star', 'value')
        self.app.callback(
            star_output, star_inputs, star_state,
        )(self.handle_mark_north_star)

Пример #4

Показать файл

    # NOTE(review): this is the tail of a function whose header is not shown;
    # `smiles_list`, `chemblID_list`, `db`, and `logger` come from outside this
    # fragment.
    logger.info('Initializing Morgan fingerprints...')
    # Map every SMILES entry through MorganFromSmiles and collect the results
    # (presumably a dask bag, given from_sequence/map/compute -- confirm).
    results = db.from_sequence(smiles_list).map(MorganFromSmiles).compute()

    # Stack the per-molecule results into one float32 array.
    np_array_fingerprints = np.stack(results).astype(np.float32)

    # take np.array shape (n_mols, nBits) for GPU DataFrame
    gdf = np2cudf(np_array_fingerprints)

    # prepare one set of clusters
    n_clusters = 7
    kmeans_float = KMeans(n_clusters=n_clusters)
    kmeans_float.fit(gdf)
    
    # Embed the fingerprints with UMAP; columns 0 and 1 of the result are
    # stored as the 'x' and 'y' columns of the frame.
    umap = UMAP(n_neighbors=100,
                a=1.0,
                b=1.0,
                learning_rate=1.0)
    Xt = umap.fit_transform(gdf)
    gdf.add_column('x', Xt[0].to_array())
    gdf.add_column('y', Xt[1].to_array())

    # Attach the k-means label of each row.
    gdf.add_column('cluster', kmeans_float.labels_)

    # start dash (the visualization receives a copy of the frame)
    v = chemvisualize.ChemVisualization(
        gdf.copy(), n_clusters, chemblID_list)

    # NOTE(review): the message advertises https on port 5000, while start()
    # is only given the host '0.0.0.0' -- confirm scheme and port.
    logger.info('navigate to https://localhost:5000')
    v.start('0.0.0.0')

Пример #5

Показать файл

Файл: visualize_minor.py Проект: songheony/TAkS

def draw_sampels(model, dataset, original_labels, algorithm, log_dir, seed,
                 ind_path, save_dir):
    """Plot 2-D UMAP embeddings of model outputs, split by sample category.

    The best checkpoint of ``algorithm`` / ``seed`` is loaded into ``model``,
    every sample of ``dataset`` is run through it, and the outputs are
    embedded in two dimensions. One PDF is written per category ("clean",
    "noise", "select", "nonsel") plus a combined 2x2 figure ``umap.pdf``.

    Parameters
    ----------
    model
        Model exposing ``load_state_dict``, ``eval`` and ``__call__``.
    dataset
        Dataset with a ``classes`` attribute; each batch must unpack into
        three items, of which only the first (images) is used.
    original_labels
        Per-sample labels, compared against class indices ``0..n_classes-1``.
    algorithm
        Algorithm identifier passed to ``get_algorithms_name``.
    log_dir : str or Path
        Root directory holding ``<algorithm>/<seed>/model0``.
    seed
        Run seed (part of the model directory path).
    ind_path
        Path loadable by ``np.load`` containing indices of noisy samples.
    save_dir : str or Path
        Output directory; created if missing.
    """
    if isinstance(log_dir, str):
        log_dir = Path(log_dir)
    if isinstance(save_dir, str):
        save_dir = Path(save_dir)

    os.makedirs(save_dir, exist_ok=True)

    algorithm_name = get_algorithms_name(log_dir, [algorithm])[0]
    model_dir = log_dir / algorithm_name / str(seed) / "model0"

    # SECURITY NOTE: pickle executes arbitrary code on load; only use this
    # with index files produced by a trusted training run.
    idx_path = model_dir / "selected_idx.pkl"
    selected_idxs = pickle.loads(idx_path.read_bytes())

    pt_path = model_dir / "best_model.pt"
    model.load_state_dict(torch.load(pt_path))

    noise_ind = set(np.load(ind_path))
    # Only the last recorded selection is visualised.
    selected_idx = set(selected_idxs[-1])

    labels = np.array(original_labels)

    # shuffle=False keeps outputs aligned with `labels` and the index sets.
    fixed_train_dataloader = DataLoader(dataset,
                                        batch_size=256,
                                        shuffle=False,
                                        num_workers=8)
    model.eval()

    outputs = []
    with torch.no_grad():
        for images, _, _ in fixed_train_dataloader:
            if torch.cuda.is_available():
                images = images.to("cuda:0")
            outputs.append(model(images).cpu().numpy())

    features = np.concatenate(outputs, axis=0)

    reducer = UMAP(n_components=2, random_state=0)
    embedding = reducer.fit_transform(features)

    emb1 = embedding[:, 0]
    emb2 = embedding[:, 1]

    n_classes = len(dataset.classes)
    targets = ["clean", "noise", "select", "nonsel"]
    multi_fig = make_subplots(rows=2,
                              cols=2,
                              vertical_spacing=0.05,
                              horizontal_spacing=0.02)

    # Per-target bounding boxes, used to pick one shared axis range.
    min_x, max_x = [], []
    min_y, max_y = [], []

    for n, target in enumerate(targets):
        fig = go.Figure()
        # Maxima start at -inf (not 0) so that all-negative coordinates are
        # reported correctly.
        class_min_x, class_max_x = np.inf, -np.inf
        class_min_y, class_max_y = np.inf, -np.inf
        for i in range(n_classes):
            class_ind = set(np.where(labels == i)[0])

            if target == "noise":
                target_ind = class_ind & noise_ind
            elif target == "clean":
                target_ind = class_ind - noise_ind
            elif target == "select":
                target_ind = class_ind & selected_idx
            elif target == "nonsel":
                target_ind = class_ind - selected_idx

            # Materialise the index list once so x and y stay paired.
            point_idx = list(target_ind)
            xs = emb1[point_idx]
            ys = emb2[point_idx]

            if point_idx:
                class_min_x = min(xs.min(), class_min_x)
                class_max_x = max(xs.max(), class_max_x)
                class_min_y = min(ys.min(), class_min_y)
                class_max_y = max(ys.max(), class_max_y)

            if n_classes <= 10:
                scatter = go.Scattergl(x=xs, y=ys, mode="markers")
            else:
                color = hsv_to_rgb(360 / n_classes * i, 1, 1)
                scatter = go.Scattergl(
                    x=xs,
                    y=ys,
                    mode="markers",
                    marker_color=f"rgb({color[0]},{color[1]},{color[2]})",
                )

            fig.add_trace(scatter)

            multi_fig.add_trace(
                scatter,
                row=n // 2 + 1,
                col=n % 2 + 1,
            )
        # Skip targets without any point: their bounds are still +/-inf and
        # would poison the shared axis range below.
        if class_min_x <= class_max_x:
            min_x.append(class_min_x)
            max_x.append(class_max_x)
            min_y.append(class_min_y)
            max_y.append(class_max_y)
        fig.update_layout(
            autosize=False,
            width=1200,
            height=800,
            margin=go.layout.Margin(
                l=0,  # left margin
                r=0,  # right margin
                b=0,  # bottom margin
                t=0,  # top margin
            ),
            showlegend=False,
        )
        fig.update_xaxes(showticklabels=False)
        fig.update_yaxes(showticklabels=False)
        fig.write_image(f"{save_dir}/umap-{target}.pdf")

    multi_fig.update_layout(
        autosize=False,
        width=3200,
        height=2400,
        margin=go.layout.Margin(
            l=0,  # left margin
            r=0,  # right margin
            b=100,  # bottom margin
            t=0,  # top margin
        ),
        font=dict(family="Arial", size=84),
        showlegend=False,
    )
    multi_fig.update_xaxes(title_text="Clean samples", row=1, col=1)
    multi_fig.update_xaxes(title_text="Incorrectly labeled samples",
                           row=1,
                           col=2)
    multi_fig.update_xaxes(title_text="Selected samples", row=2, col=1)
    multi_fig.update_xaxes(title_text="Non-selected samples", row=2, col=2)
    # scaling
    # NOTE(review): the shared range is the intersection of the per-target
    # boxes (largest minimum, smallest maximum) -- confirm this cropping is
    # intended rather than the union.
    x_axis_opts = dict(showticklabels=False)
    y_axis_opts = dict(showticklabels=False)
    if min_x:
        x_axis_opts["range"] = [max(min_x), min(max_x)]
        y_axis_opts["range"] = [max(min_y), min(max_y)]
    multi_fig.update_xaxes(**x_axis_opts)
    multi_fig.update_yaxes(**y_axis_opts)
    multi_fig.write_image(f"{save_dir}/umap.pdf")

Пример #6

Показать файл

Файл: _umap.py Проект: zwdiscover/scanpy

def umap(
    adata: AnnData,
    min_dist: float = 0.5,
    spread: float = 1.0,
    n_components: int = 2,
    maxiter: Optional[int] = None,
    alpha: float = 1.0,
    gamma: float = 1.0,
    negative_sample_rate: int = 5,
    init_pos: Union[_InitPos, np.ndarray, None] = 'spectral',
    random_state: AnyRandom = 0,
    a: Optional[float] = None,
    b: Optional[float] = None,
    copy: bool = False,
    method: Literal['umap', 'rapids'] = 'umap',
    neighbors_key: Optional[str] = None,
) -> Optional[AnnData]:
    """\
    Embed the neighborhood graph using UMAP [McInnes18]_.

    UMAP (Uniform Manifold Approximation and Projection) is a manifold learning
    technique suitable for visualizing high-dimensional data. Besides tending to
    be faster than tSNE, it optimizes the embedding such that it best reflects
    the topology of the data, which we represent throughout Scanpy using a
    neighborhood graph. tSNE, by contrast, optimizes the distribution of
    nearest-neighbor distances in the embedding such that these best match the
    distribution of distances in the high-dimensional space.  We use the
    implementation of `umap-learn <https://github.com/lmcinnes/umap>`__
    [McInnes18]_. For a few comparisons of UMAP with tSNE, see this `preprint
    <https://doi.org/10.1101/298430>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    min_dist
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points on
        the manifold are drawn closer together, while larger values will result
        in a more even dispersal of points. The value should be set relative to
        the ``spread`` value, which determines the scale at which embedded
        points will be spread out. The default in the `umap-learn` package is
        0.1.
    spread
        The effective scale of embedded points. In combination with `min_dist`
        this determines how clustered/clumped the embedded points are.
    n_components
        The number of dimensions of the embedding.
    maxiter
        The number of iterations (epochs) of the optimization. Called `n_epochs`
        in the original UMAP.
    alpha
        The initial learning rate for the embedding optimization.
    gamma
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.
    negative_sample_rate
        The number of negative edge/1-simplex samples to use per positive
        edge/1-simplex sample in optimizing the low dimensional embedding.
    init_pos
        How to initialize the low dimensional embedding. Called `init` in the
        original UMAP. Options are:

        * Any key for `adata.obsm`.
        * 'paga': positions from :func:`~scanpy.pl.paga`.
        * 'spectral': use a spectral embedding of the graph.
        * 'random': assign initial embedding positions at random.
        * A numpy array of initial embedding positions.
    random_state
        If `int`, `random_state` is the seed used by the random number generator;
        If `RandomState` or `Generator`, `random_state` is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    a
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    b
        More specific parameters controlling the embedding. If `None` these
        values are set automatically as determined by `min_dist` and
        `spread`.
    copy
        Return a copy instead of writing to adata.
    method
        Use the original 'umap' implementation, or 'rapids' (experimental, GPU only)
    neighbors_key
        If not specified, umap looks .uns['neighbors'] for neighbors settings
        and .obsp['connectivities'] for connectivities
        (default storage places for pp.neighbors).
        If specified, umap looks .uns[neighbors_key] for neighbors settings and
        .obsp[.uns[neighbors_key]['connectivities_key']] for connectivities.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_umap** : `adata.obsm` field
        UMAP coordinates of data.

    Raises
    ------
    ValueError
        If `method` is neither `'umap'` nor `'rapids'`, if the neighbors entry
        is missing from `adata.uns`, or if `method` is `'rapids'` and the
        neighbors were computed with a non-euclidean metric.
    """
    # Reject an unknown backend before doing any work; otherwise neither
    # branch below would run and `X_umap` would be unbound at the end.
    if method not in ('umap', 'rapids'):
        raise ValueError(
            f"`method` must be 'umap' or 'rapids', but got {method!r}."
        )

    adata = adata.copy() if copy else adata

    if neighbors_key is None:
        neighbors_key = 'neighbors'

    if neighbors_key not in adata.uns:
        raise ValueError(
            f'Did not find .uns["{neighbors_key}"]. Run `sc.pp.neighbors` first.'
        )
    start = logg.info('computing UMAP')

    neighbors = NeighborsView(adata, neighbors_key)

    if 'params' not in neighbors or neighbors['params']['method'] != 'umap':
        logg.warning(
            f'.obsp["{neighbors["connectivities_key"]}"] have not been computed using umap'
        )

    # Compat for umap 0.4 -> 0.5
    with warnings.catch_warnings():
        # umap 0.5.0
        warnings.filterwarnings("ignore", message=r"Tensorflow not installed")
        import umap

    if version.parse(umap.__version__) >= version.parse("0.5.0"):

        def simplicial_set_embedding(*args, **kwargs):
            # umap >= 0.5 takes extra densmap arguments and returns a pair;
            # only the embedding (first element) is used here.
            from umap.umap_ import simplicial_set_embedding

            X_umap, _ = simplicial_set_embedding(
                *args,
                densmap=False,
                densmap_kwds={},
                output_dens=False,
                **kwargs,
            )
            return X_umap

    else:
        from umap.umap_ import simplicial_set_embedding
    from umap.umap_ import find_ab_params

    # Derive the curve parameters only when the caller did not supply both.
    if a is None or b is None:
        a, b = find_ab_params(spread, min_dist)
    adata.uns['umap'] = {'params': {'a': a, 'b': b}}

    # Resolve the initial positions: an `adata.obsm` key wins, then 'paga',
    # otherwise the value is forwarded untouched.
    if isinstance(init_pos, str) and init_pos in adata.obsm.keys():
        init_coords = adata.obsm[init_pos]
    elif isinstance(init_pos, str) and init_pos == 'paga':
        init_coords = get_init_pos_from_paga(
            adata, random_state=random_state, neighbors_key=neighbors_key
        )
    else:
        init_coords = init_pos  # Let umap handle it
    if hasattr(init_coords, "dtype"):
        init_coords = check_array(init_coords, dtype=np.float32, accept_sparse=False)

    # Only a non-default seed is recorded alongside the parameters.
    if random_state != 0:
        adata.uns['umap']['params']['random_state'] = random_state
    random_state = check_random_state(random_state)

    neigh_params = neighbors['params']
    X = _choose_representation(
        adata,
        neigh_params.get('use_rep', None),
        neigh_params.get('n_pcs', None),
        silent=True,
    )
    if method == 'umap':
        # the data matrix X is really only used for determining the number of connected components
        # for the init condition in the UMAP embedding
        n_epochs = 0 if maxiter is None else maxiter
        X_umap = simplicial_set_embedding(
            X,
            neighbors['connectivities'].tocoo(),
            n_components,
            alpha,
            a,
            b,
            gamma,
            negative_sample_rate,
            n_epochs,
            init_coords,
            random_state,
            neigh_params.get('metric', 'euclidean'),
            neigh_params.get('metric_kwds', {}),
            verbose=settings.verbosity > 3,
        )
    else:  # method == 'rapids' (validated above)
        metric = neigh_params.get('metric', 'euclidean')
        if metric != 'euclidean':
            raise ValueError(
                f'`sc.pp.neighbors` was called with `metric` {metric!r}, '
                "but umap `method` 'rapids' only supports the 'euclidean' metric."
            )
        from cuml import UMAP

        n_neighbors = neighbors['params']['n_neighbors']
        n_epochs = (
            500 if maxiter is None else maxiter
        )  # 0 is not a valid value for rapids, unlike original umap
        X_contiguous = np.ascontiguousarray(X, dtype=np.float32)
        # Named so that it does not rebind the `umap` module imported above.
        # NOTE(review): the raw `init_pos` is forwarded here, not the resolved
        # `init_coords` -- confirm that is intended for obsm keys / 'paga'.
        rapids_umap = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            n_epochs=n_epochs,
            learning_rate=alpha,
            init=init_pos,
            min_dist=min_dist,
            spread=spread,
            negative_sample_rate=negative_sample_rate,
            a=a,
            b=b,
            verbose=settings.verbosity > 3,
            random_state=random_state,
        )
        X_umap = rapids_umap.fit_transform(X_contiguous)
    adata.obsm['X_umap'] = X_umap  # annotate samples with UMAP coordinates
    logg.info(
        '    finished',
        time=start,
        deep=('added\n' "    'X_umap', UMAP coordinates (adata.obsm)"),
    )
    return adata if copy else None

Пример #7

Показать файл

def main(args=None):
    """Embed saved encodings with UMAP and write one scatter plot per file.

    Command line:

    * ``--run-name`` (required): run directory name under ``./logs``.
    * exactly one of ``-f/--file-path`` (a single encodings file) or
      ``--files-all`` (every ``*encoded.npz`` in the run's
      ``data_encodings`` directory).

    For each encodings file the ``zn`` array is embedded in 2-D, points are
    coloured by ``labels``, and the figure is saved as
    ``umap-<file stem>.png`` in the run's ``umap_plots`` directory.

    Parameters
    ----------
    args : list of str, optional
        Argument vector; defaults to ``sys.argv[1:]``.

    Raises
    ------
    SystemExit
        Raised by ``argparse`` on invalid arguments.
    AssertionError
        If the given file or the expected run directories do not exist.
    """
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(description="UMAP encoding script")
    parser.add_argument(
        '--run-name',
        dest='run_name',
        help=
        "Training run directory (for the plot to be placed in the right logs directory)",
        required=True)

    # default=SUPPRESS: an option that was not given is absent from the
    # namespace, so membership tests below tell which one was used.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-f',
                       '--file-path',
                       dest='encodings_file_path',
                       help="encodings file path",
                       default=argparse.SUPPRESS)
    group.add_argument(
        '--files-all',
        dest='files_all',
        action='store_true',
        help="cluster using encodings of all available training stages",
        default=argparse.SUPPRESS)

    args = parser.parse_args(args)
    given = vars(args)
    single_file = 'encodings_file_path' in given

    assert single_file or ('files_all' in given)
    if single_file:
        assert DoesPathExistAndIsFile(
            args.encodings_file_path
        ), f'path {args.encodings_file_path} does not exist or is not a file'

    curr_run_name = args.run_name
    logs_dir = os.path.join(os.path.abspath('./'), 'logs', curr_run_name)
    encodings_dir = os.path.join(logs_dir, 'data_encodings')
    umap_plots_dir = os.path.join(logs_dir, 'umap_plots')
    assert DoesPathExistAndIsDirectory(
        logs_dir), f'path {logs_dir} does not exist or not a directory'
    assert DoesPathExistAndIsDirectory(
        encodings_dir
    ), f'path {encodings_dir} does not exist or not a directory'
    EnsureDirectoryExists(umap_plots_dir)

    if single_file:
        encodings_files_paths = [args.encodings_file_path]
    elif given.get('files_all'):
        encodings_files_paths = sorted(
            find_files(encodings_dir, '*encoded.npz'))

    for encodings_file_path in tqdm(encodings_files_paths):
        enc = np.load(encodings_file_path)
        zn = enc['zn']
        labels = enc['labels']

        reducer = UMAP(n_components=2,
                       verbose=False,
                       n_epochs=4096,
                       learning_rate=0.1)
        umap_enc = reducer.fit_transform(zn)
        classes = np.unique(labels)
        # One tab20 colour per distinct label.
        palette = cm.tab20(np.linspace(0.0, 1.0, len(classes)))
        colors = dict(zip(classes, palette))

        f = plt.figure(figsize=(6, 6), dpi=300)
        for label in classes:
            mask = labels == label
            plt.scatter(umap_enc[mask, 0],
                        umap_enc[mask, 1],
                        s=0.5,
                        color=colors[label],
                        label=label)
        lgnd = plt.legend(fontsize=5)
        # Matplotlib 3.7 renamed `legendHandles` to `legend_handles`, and the
        # old name was later removed; support both.
        handles = getattr(lgnd, 'legend_handles', None)
        if handles is None:
            handles = lgnd.legendHandles
        # Enlarge the legend markers; the plotted points (s=0.5) are tiny.
        for hndl in handles:
            hndl.set_sizes([20])
        plt.axis('equal')
        plt.tight_layout()

        stem = os.path.splitext(os.path.basename(encodings_file_path))[0]
        figname = os.path.join(umap_plots_dir, 'umap-%s.png' % stem)
        # NOTE(review): bbox_inches=0 is kept as-is; Matplotlib documents a
        # string or Bbox here -- confirm whether 'tight' was intended.
        f.savefig(
            figname,
            dpi=300,
            bbox_inches=0,
            pad_inches=0,
        )
        plt.close(f)