Python UMAP.fit_transform примеры, umap.UMAP.fit_transform Python примеры использования

Пример #1

0

Показать файл

 def umap(self, n_components, metric, data=None):
     """Reduce data to ``n_components`` dimensions with UMAP.

     Uses ``data`` when it is given, otherwise falls back to ``self.data``,
     and returns the result of ``fit_transform``.
     """
     reducer = UMAP(n_components=n_components, metric=metric)
     source = self.data if data is None else data
     return reducer.fit_transform(source)

Пример #2

0

Показать файл

 def plot_umap_proc(self, df):
     """Project ``df.Vector`` to 2-D and 3-D with UMAP and plot three colourings.

     The same pair of projections is passed to ``self.plot_umap`` once for
     each of the ``Categ``, ``subject`` and ``chn`` columns; output goes to
     ``self.plot_path``.
     """
     folder = self.plot_path
     shared_params = dict(spread=1, min_dist=0.5, a=0.7, b=1.2)
     reducer_2d = UMAP(n_components=2, **shared_params)
     reducer_3d = UMAP(n_components=3, **shared_params)
     proj_2d = reducer_2d.fit_transform(np.array(df.Vector.tolist()))
     proj_3d = reducer_3d.fit_transform(np.array(df.Vector.tolist()))
     # (column name, output file stem); the column is looked up lazily so the
     # plots are produced in the same order as the columns are accessed.
     plot_specs = (
         ("Categ", "category-umap"),
         ("subject", "subject-umap"),
         ("chn", "channel-umap"),
     )
     for column, file_stem in plot_specs:
         self.plot_umap(folder, proj_2d, proj_3d, getattr(df, column), column,
                        file_stem)

Пример #3

0

Показать файл

class manifold_umap(base_manifold):
    """UMAP manifold trained on a PCA reduction of ``self.data_table``."""

    def __init__(self, parent=None, name='none'):
        base_manifold.__init__(self,
                               parent=parent,
                               name=name,
                               manifold_type='UMAP')

    def train(self, num_pc, n_neighbors=None, min_dist=0.3):
        """
        **Purpose**
            Train the UMAP on the first <num_pc> components of a PCA

            UMAP is generally too computationally heavy to do on a full dataset, so you
            should choose the first few PCs to train the UMAP. Check the pca module
            for a PCA interface you can use to select the best PCs

        **Arguments**
            num_pc (Required)
                Either an integer (use the first <num_pc> PCs) or a list of
                PC numbers to use. List entries are 1-based: PC ``c`` is read
                from column ``c - 1`` of the PCA transform.

            n_neighbors (Required)
                Estimated number of neighbours

            min_dist (Optional, default=0.3)
                minimum distance between points

        **Returns**
            None. The 2D embedding is stored in ``self.npos`` and
            ``self.trained`` is set to True.

        **Raises**
            AssertionError if the object is not configured, if n_neighbors is
            missing, or if num_pc is neither an integer nor a list.
        """
        # Explicit raises instead of ``assert`` so the checks survive ``python -O``;
        # the exception type and messages are unchanged.
        if not self.configured:
            raise AssertionError('umap is not configured, run configure() first')
        if not n_neighbors:
            raise AssertionError('You must specify an estimate for n_neighbors')

        if isinstance(num_pc, int):
            self.__model = PCA(n_components=num_pc, whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            self.__pcas = self.__transform

        elif isinstance(num_pc, list):
            self.__model = PCA(n_components=max(num_pc) + 1,
                               whiten=self.whiten)
            self.__transform = self.__model.fit_transform(self.data_table)
            # get only the specific PCs
            self.__pcas = numpy.array(
                [self.__transform[:, c - 1] for c in num_pc]).T
        else:
            raise AssertionError('num_pcs must be either an integer or a list')

        # min_dist was previously accepted and documented but never forwarded,
        # so the caller's value (and the 0.3 default) had no effect.
        self.__model = UMAP(n_components=2,
                            n_neighbors=n_neighbors,
                            min_dist=min_dist,
                            metric='correlation',
                            random_state=self.random_state,
                            verbose=self.verbose)

        self.npos = self.__model.fit_transform(self.__pcas)

        self.trained = True

Пример #4

0

Показать файл

def calc_umap(X, n_components, n_neighbors, min_dist, spread, random_state):
    """Fit a UMAP model with the given hyper-parameters and return the embedding of ``X``."""
    settings = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "spread": spread,
        "random_state": random_state,
    }
    reducer = UMAP(**settings)
    embedding = reducer.fit_transform(X)
    return embedding

Пример #5

0

Показать файл

def umap(feats, indices):
    """Embed the selected rows of ``feats`` in 3-D using UMAP.

    The metric, neighbour count and minimum distance are read from the
    ``st`` widgets (a select box and two sliders); the rows used are
    ``feats[indices, :]``.
    """
    metric_options = [
        'euclidean', 'manhattan', 'chebyshev', 'minkowski', 'canberra',
        'braycurtis', 'mahalanobis', 'wminkowski', 'seuclidean', 'cosine',
        'correlation'
    ]
    metric = st.selectbox('Metric', metric_options)
    n_neighbors = st.slider(
        'N Neighbors', min_value=2, max_value=200, value=15, step=1)
    min_dist = st.slider(
        'Minimum Distance', min_value=0.0, max_value=1.0, value=0.1, step=0.01)

    reducer = UMAP(n_components=3,
                   n_neighbors=n_neighbors,
                   min_dist=min_dist,
                   metric=metric)
    selected_rows = feats[indices, :]
    return reducer.fit_transform(selected_rows)

Пример #6

0

Показать файл

Файл: test_umap_ops.py Проект: ginihumer/latent-projective-interventions

def test_umap_transform_embedding_stability(iris, iris_selection):
    """Check that ``transform`` leaves a fitted model's ``embedding_`` untouched.

    Issue #217 reports that embedding new data with an already trained UMAP
    transformer modifies the fitted embedding matrix when the new data has
    as many rows as the training data.
    """
    train = iris.data[iris_selection]
    model = UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(train)
    embedding_before = model.embedding_.copy()

    # Key condition: the unseen data has exactly the training data's shape.
    unseen = np.random.random(train.shape)
    model.transform(unseen)

    assert_array_equal(
        embedding_before,
        model.embedding_,
        "Transforming new data changed the original embeddings",
    )

    # Reproduction taken from issue #217.
    a = np.random.random((1000, 10))
    b = np.random.random((1000, 5))

    second_model = UMAP()
    fitted_copy = second_model.fit_transform(a[:, :5]).copy()
    assert_array_equal(fitted_copy, second_model.embedding_)

    second_model.transform(b)
    assert_array_equal(fitted_copy, second_model.embedding_)

Пример #7

0

Показать файл

def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title=""):
    """Scatter-plot a UMAP projection of ``embeds``, one series per speaker.

    Args:
        embeds: data passed to ``UMAP().fit_transform``.
        speakers: labels used to group the projected rows (compared
            element-wise against each unique label).
        ax: axes to draw on; a new 6x6 figure is created when None.
        colors: sequence indexed by the position of each unique speaker.
            Falls back to the module-level ``_my_colors`` when None or empty.
        markers: optional sequence of markers indexed like ``colors``;
            "o" is used for every series when None.
        legend: when true, series are labelled and a legend is drawn.
        title: axes title.

    Returns:
        The projection returned by ``fit_transform``.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    # Compute the 2D projections. You could also project to another number of dimensions (e.g.
    # for a 3D plot) or use a different dimensionality reduction like PCA or TSNE.
    reducer = UMAP()
    projs = reducer.fit_transform(embeds)

    # Draw the projections
    speakers = np.array(speakers)
    # ``colors or _my_colors`` raises ValueError for a multi-element numpy
    # array, so test for "missing or empty" explicitly instead.
    if colors is None or len(colors) == 0:
        colors = _my_colors
    for i, speaker in enumerate(np.unique(speakers)):
        speaker_projs = projs[speakers == speaker]
        marker = "o" if markers is None else markers[i]
        label = speaker if legend else None
        ax.scatter(*speaker_projs.T, c=[colors[i]], marker=marker, label=label)

    if legend:
        ax.legend(title="Speakers", ncol=2)
    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    return projs

Пример #8

0

Показать файл

Файл: dimens_reduction.py Проект: astrazeneca-cgr-publications/mantis-ml-release

    def calc_umap(self,
                  df,
                  n_neighbors=5,
                  min_dist=0.3,
                  metric='correlation',
                  data_type='original_data'):
        """Embed ``df`` with UMAP, write the result as TSV and return it.

        The ``Gene_Name`` column and the column named by ``self.cfg.Y`` are
        excluded from the fit and appended back to the embedding afterwards.
        The file is written under ``self.cfg.unsuperv_out``.

        Returns:
            ``(X_umap, total_time)``: the embedding DataFrame (columns
            ``d<N>`` plus the two excluded columns) and the seconds spent
            in ``fit_transform``.
        """
        print(">> Running UMAP from " + data_type + "...")
        excluded_cols = ['Gene_Name', self.cfg.Y]
        features = df.drop(excluded_cols, axis=1)

        reducer = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric)
        started = time()
        embedding = reducer.fit_transform(features)
        total_time = time() - started

        X_umap = pd.DataFrame(embedding)
        renamed = []
        for col in X_umap.columns.values:
            renamed.append('d' + str(col))
        X_umap.columns = renamed

        # NOTE(review): concat aligns on index — presumably df has a default
        # 0..n-1 index; confirm against callers.
        X_umap = pd.concat([X_umap, df[excluded_cols]], axis=1)

        out_file = self.cfg.unsuperv_out / ("UMAP" + data_type + ".tsv")
        X_umap.to_csv(str(out_file), sep='\t', index=None)

        return X_umap, total_time

Пример #9

0

Показать файл

def embeddingUmap(n_components, n_neighbors, random_state, tfidf_matrix_fit, tfidf_matrix_transform):
    """Fit UMAP on one matrix and embed a second matrix with the fitted model.

    Args:
        n_components: number of embedding dimensions (also the number of
            ``emb_<i>`` columns in the returned DataFrame).
        n_neighbors: forwarded to ``UMAP``.
        random_state: forwarded to ``UMAP``.
        tfidf_matrix_fit: data the model is fitted on.
        tfidf_matrix_transform: data projected with the fitted model.

    Returns:
        ``(umap_df, umap_embedding)``: the embedding as a DataFrame with
        columns ``emb_1 .. emb_<n_components>`` and as the raw array.
    """
    umap = UMAP(n_components=n_components, n_neighbors=n_neighbors, random_state=random_state).fit(tfidf_matrix_fit)
    print("reducing vector's dimensionality...")
    # Use transform(), not fit_transform(): refitting here would throw away
    # the model that was just learned from tfidf_matrix_fit.
    umap_embedding = umap.transform(tfidf_matrix_transform)
    umap_df = pd.DataFrame(umap_embedding, columns=[f'emb_{i + 1}' for i in range(n_components)])

    return umap_df, umap_embedding

Пример #10

0

Показать файл

def dim_red_kmeans(data, cluster, technique):
    """Reduce the feature columns to 2-D, cluster with KMeans and plot.

    Features are every column from ``gdp_per_capita`` onwards when
    ``cluster == 'renda'``, otherwise from ``cardiovasc_death_rate`` onwards.
    ``technique`` selects 'umap' or 'pca'; any other value uses t-SNE.

    Side effect: writes the cluster ids into ``data['Cluster']``.

    Returns:
        The ``px.scatter`` figure of the 2-D projection coloured by cluster.
    """
    first_feature = ('gdp_per_capita'
                     if cluster == 'renda' else 'cardiovasc_death_rate')
    features = data.loc[:, first_feature:]

    if technique == 'umap':
        reducer = UMAP(n_components=2, init='random', random_state=0)
        proj_2d = reducer.fit_transform(features)
    elif technique == 'pca':
        reducer = PCA(n_components=2, random_state=0)
        fitted = reducer.fit(features)
        proj_2d = fitted.transform(features)
    else:
        reducer = TSNE(n_components=2, random_state=0)
        proj_2d = reducer.fit_transform(features)

    clusterer = KMeans(n_clusters=7,
                       init="k-means++",
                       max_iter=500,
                       n_init=10,
                       random_state=123)
    data['Cluster'] = clusterer.fit_predict(proj_2d)

    figure = px.scatter(proj_2d,
                        x=0,
                        y=1,
                        color=data.Cluster,
                        labels={'color': 'Cluster'},
                        hover_name=data.location)
    return figure

Пример #11

0

Показать файл

Файл: feature_analyzer.py Проект: sailfish009/bondnet

class UMAPAnalyzer(BaseAnalyzer):
    """
    UMAP analysis for features.
    """
    def compute(
        self,
        n_neighbors=100,
        n_components=2,
        min_dist=0.5,
        metric="euclidean",
        verbose=True,
        n_epochs=1000,
        **kwargs,
    ):
        """Fit UMAP on ``self.features`` and return the embedding.

        All named arguments and any extra ``kwargs`` are forwarded to the
        ``UMAP`` constructor. The fitted model is kept in ``self.model`` and
        the embedding in ``self.embedding``.
        """
        self.model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_dist,
            metric=metric,
            # Previously hard-coded to True, which silently ignored the
            # caller's ``verbose`` argument.
            verbose=verbose,
            n_epochs=n_epochs,
            **kwargs,
        )
        embedding = self.model.fit_transform(self.features)
        self.embedding = embedding

        return self.embedding

Пример #12

0

Показать файл

Файл: plotting.py Проект: Amuoeba/ARP_modeling

def plot_projections(embeds,
                     speakers,
                     ax=None,
                     colors=None,
                     markers=None,
                     legend=True,
                     title="",
                     **kwargs):
    """Scatter-plot a UMAP projection of ``embeds``, one series per speaker, and show it.

    Args:
        embeds: data passed to ``UMAP(**kwargs).fit_transform``.
        speakers: labels used to group the projected rows.
        ax: axes to draw on; a new 6x6 figure is created when None.
        colors: sequence indexed by the position of each unique speaker.
            Falls back to the module-level ``_embedding_colors_`` when None
            or empty.
        markers: optional sequence of markers indexed like ``colors``;
            "o" is used for every series when None.
        legend: when true, series are labelled and a legend is drawn to the
            right of the axes.
        title: axes title.
        **kwargs: forwarded to the ``UMAP`` constructor.

    Returns:
        The projection returned by ``fit_transform``.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 6))

    reducer = UMAP(**kwargs)

    projs = reducer.fit_transform(embeds)

    speakers = np.array(speakers)
    # ``colors or default`` raises ValueError for a multi-element numpy array,
    # so test for "missing or empty" explicitly instead.
    if colors is None or len(colors) == 0:
        colors = _embedding_colors_
    for i, speaker in enumerate(np.unique(speakers)):
        speaker_projs = projs[speakers == speaker]
        marker = "o" if markers is None else markers[i]
        label = speaker if legend else None
        ax.scatter(*speaker_projs.T, c=[colors[i]], marker=marker, label=label)

    ax.set_title(title)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_aspect("equal")

    # Attach the legend to the axes that were drawn on (plt.legend targets
    # the *current* axes, which may differ from a caller-supplied ``ax``),
    # and skip it when the series were deliberately left unlabelled.
    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
    return projs

Пример #13

0

Показать файл

def update_figure(selected_dataset):
    """Build a 3-D UMAP scatter figure for the selected MNIST sample.

    Downloads the input and label CSVs for "MNIST-Digits" or
    "MNIST-Fashion", encodes the labels as integers, fits a supervised
    3-component UMAP and returns a ``px.scatter_3d`` figure.

    NOTE(review): the fallback branch returns a 2-tuple
    ``(None, message)`` while the success path returns a single figure —
    confirm which shape the caller expects.
    """
    if selected_dataset == "MNIST-Digits":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/mnist-1000-labels.csv"
    elif selected_dataset == "MNIST-Fashion":
        input_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-input.csv"
        labels_url = "https://saturn-public-data.s3.us-east-2.amazonaws.com/MNIST-1000/fashion-1000-labels.csv"
    else:
        return None, "Please select a dataset."

    X = pd.read_csv(input_url)
    raw_labels = pd.read_csv(labels_url)
    # return_inverse gives each row the index of its label among the uniques.
    y = np.unique(raw_labels, return_inverse=True)[1]

    reducer = UMAP(n_components=3, init="random", random_state=0)
    proj_3d = reducer.fit_transform(X, y=y)

    fig = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=y)
    fig.update_layout(transition_duration=500, height=1000)
    fig.update(layout_coloraxis_showscale=False)
    fig.update_traces(marker_size=2)
    return fig

Пример #14

0

Показать файл

Файл: new_insane.py Проект: redjerdai/thethWyrm

class UMAP:
    """Wrapper around ``UMAP_`` exposing a ``fit``/``predict`` interface.

    ``predict`` embeds only the rows that contain no NaN/inf values and
    returns NaN rows for the rest.
    """

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        # Remaining arguments are forwarded untouched to the wrapped model.
        self.model = UMAP_(*args, **kwargs)

    def fit(self, X, y):
        """No-op: the wrapped model is (re)fitted inside ``predict``."""
        pass

    def predict(self, X):
        """Embed the finite rows of ``X`` with ``fit_transform``.

        Args:
            X: 2-D numpy array. Rows containing NaN or +/-inf (after a
                float32 cast) are excluded from the fit.

        Returns:
            float64 array with one row per row of ``X`` and one column per
            embedding dimension; excluded rows are filled with NaN.

        Raises:
            Exception: if ``self.rfe_cv`` is truthy.
        """
        # The float32 cast mirrors what the mask has always been computed on.
        as_float = numpy.array(X, dtype=numpy.float32)
        finite_rows = numpy.isfinite(as_float).all(axis=1)
        X_ = X[finite_rows, :]
        # The old check compared X with a same-shaped copy of itself, so the
        # message could never be printed; compare against the filtered rows.
        n_dropped = X.shape[0] - X_.shape[0]
        if n_dropped:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(n_dropped))
        if self.rfe_cv:
            # Message previously said "PCA", a leftover from another wrapper.
            raise Exception("UMAP could not be processed with RFE_CV")
        predicted = self.model.fit_transform(X_)
        Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                       fill_value=numpy.nan,
                       dtype=numpy.float64)
        Z[finite_rows, :] = predicted
        return Z

Пример #15

0

Показать файл

def project_umap(spk_dict: Dict[str, Tensor], seed):
    """Plot a 2-D UMAP projection of per-speaker embeddings to ``umap_plot.svg``.

    Args:
        spk_dict: maps speaker name to a tensor of embeddings; tensors are
            concatenated along dim 0 in sorted speaker order.
        seed: ``random_state`` passed to UMAP.

    Raises:
        ModuleNotFoundError: if umap, sklearn or matplotlib cannot be imported.
    """
    sorted_speakers = sorted(spk_dict)
    flat_embs = torch.cat([spk_dict[k] for k in sorted_speakers],
                          dim=0).numpy()
    try:
        from umap import UMAP
        from sklearn.preprocessing import StandardScaler
        import matplotlib.pyplot as plt
    except ModuleNotFoundError as err:
        # Chain the original error so the missing module name is not lost.
        raise ModuleNotFoundError(
            'Please install umap, sklearn, and matplotlib from pypi to plot umap results.'
        ) from err
    data = StandardScaler().fit_transform(flat_embs)
    reducer = UMAP(metric='cosine',
                   verbose=True,
                   n_neighbors=20,
                   random_state=seed)
    reduced_data = reducer.fit_transform(data)
    print(reduced_data.shape)
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 9))
    # Split by each speaker's own row count. chunk(len(spk_dict)) produced
    # equal-sized pieces and mis-attributed points whenever speakers had
    # different numbers of embeddings.
    rows_per_speaker = [spk_dict[k].shape[0] for k in sorted_speakers]
    reduced_chunks = torch.from_numpy(reduced_data).split(rows_per_speaker,
                                                          dim=0)
    for chunk in reduced_chunks:
        points = chunk.numpy()
        ax.scatter(points[:, 0], points[:, 1])
    ax.legend(sorted_speakers)
    ax.set_xlabel('umap 1st component')
    ax.set_ylabel('umap 2nd component')
    ax.set_title("2D umap projection with n_neighbors=20")
    ax.grid(True)
    plt.tight_layout()
    plt.savefig('umap_plot.svg')
    print("Saved umap plot to umap_plot.svg")

Пример #16

0

Показать файл

def run_umap(dist, logger=None, labels=None, **kwargs):
    """
    Run UMAP on distances produced by tree2dmat

    Args:
        dist (np.array):        A distance matrix, square or condensed form.
                                A 1-D (condensed) input is expanded with
                                ``_squareform`` first
        logger (Logger):        Logger to use. default is no logging
        labels:                 passed to ``fit_transform`` as ``y``
        **kwargs:               forwarded to the ``UMAP`` constructor.
                                ``n_neighbors`` defaults to 100 and
                                ``n_components`` to 2; ``verbose`` is always
                                forced to True

    Return:
        emb (np.array):         the UMAP embedding

    NOTE(review): ``dist`` is handed to UMAP as-is; presumably callers pass
    ``metric='precomputed'`` through kwargs so it is treated as distances —
    confirm.
    """
    if len(dist.shape) == 1:
        if logger is not None:
            logger.info('computing squareform')
        dist = _squareform(dist)

    kwargs.setdefault('n_neighbors', 100)
    kwargs.setdefault('n_components', 2)

    if logger is not None:
        logger.info(
            'computing {n_components} components with UMAP'.format(**kwargs))
        # min_dist is optional; formatting straight from kwargs raised
        # KeyError whenever a logger was supplied without it.
        logger.info(
            'using {n_neighbors} neighbors and {min_dist} min_dist'.format(
                n_neighbors=kwargs['n_neighbors'],
                min_dist=kwargs.get('min_dist', 'default')))

    kwargs['verbose'] = True
    umap = UMAP(**kwargs)
    emb = umap.fit_transform(dist, y=labels)
    return emb

Пример #17

0

Показать файл

def vanDongenSpectral(args):
    """Score one UMAP + KMeans configuration with ``vanDongen``.

    ``args`` unpacks to ``(neighbors, min_d, components, metric, dataset,
    scaler, k)``. The dataset is scaled, embedded with UMAP and clustered
    with KMeans; the genre-vs-cluster contingency table is then passed to
    ``vanDongen`` and its result returned.
    """
    neighbors, min_d, components, metric, dataset, scaler, k = args

    description = ''.join([
        dataset, ', ', metric, ', ', scaler,
        ', n_components=', str(components),
        ', n_neighbors=', str(neighbors),
        ', min_dist=', str(min_d),
        ', k=', str(k),
    ])
    print(description)

    # Standardise with the selected scaler.
    scaled = scalers[scaler].fit_transform(datasets[dataset])

    # Apply UMAP.
    reducer = UMAP(n_components=components,
                   n_neighbors=neighbors,
                   min_dist=min_d,
                   metric=metric)
    embedding = reducer.fit_transform(scaled)

    # Apply KMeans to the embedding.
    clustering = KMeans(n_clusters=k, random_state=0).fit(embedding)

    # Build the confusion (contingency) matrix of genre against cluster id.
    assignments = pd.DataFrame({
        'Generos': metadata.genre,
        'data': clustering.labels_
    })
    contingency = pd.crosstab(assignments['Generos'], assignments['data'])

    return vanDongen(contingency)

Пример #18

0

Показать файл

def umapper(embed, metric="euclidean", n_neighbors=30, min_dist=1, **kws):
    """Scatter-plot a UMAP embedding of ``embed`` coloured by the global ``labels``.

    When left/right indexing is enabled (always, currently) the first half
    of the rows is connected to the second half via ``add_connections``.
    Extra ``kws`` are accepted but not used.

    Returns:
        ``(fig, ax)`` of the created 10x10 figure.
    """
    reducer = UMAP(metric=metric, n_neighbors=n_neighbors, min_dist=min_dist)
    coords = reducer.fit_transform(embed)
    plot_df = pd.DataFrame(data=coords)
    plot_df["labels"] = labels  # module-level labels, not a parameter

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    sns.scatterplot(
        data=plot_df,
        ax=ax,
        x=0,
        y=1,
        hue="labels",
        palette=CLASS_COLOR_DICT,
        legend=False,
        s=20,
        linewidth=0.5,
        alpha=0.7,
    )
    ax.axis("off")

    left_right_indexing = True
    if not left_right_indexing:
        return fig, ax

    half = len(embed) // 2
    first_half = np.arange(half)
    second_half = np.arange(half) + half
    add_connections(
        plot_df.iloc[first_half, 0],
        plot_df.iloc[second_half, 0],
        plot_df.iloc[first_half, 1],
        plot_df.iloc[second_half, 1],
        ax=ax,
    )
    return fig, ax

Пример #19

0

Показать файл

Файл: plot.py Проект: dohlee/python-dohlee

def umap(data, labels=None, ax=None, **kwargs):
    '''Draw a UMAP embedding plot of the data.

    :param matrix data: Input data. Numpy array recommended.
    :param list labels: (Optional) Corresponding labels to each datum. If specified, data points in the plot will be colored according to the label.
    :param axis ax: (Optional) Matplotlib axis to draw the plot on. Defaults to the current axis.
    :param kwargs: Any other keyword arguments will be passed onto matplotlib.pyplot.scatter.
    :returns: The axis the plot was drawn on.
    '''
    if ax is None:
        # ``ax`` is documented as optional, but the body used to call
        # ``ax.scatter`` on None. Imported here so the fallback does not
        # depend on a module-level name.
        import matplotlib.pyplot as plt
        ax = plt.gca()

    # Apply UMAP and get embeddings.
    reducer = UMAP()
    embeddings = reducer.fit_transform(data)

    if labels is None:
        ax.scatter(x=embeddings[:, 0], y=embeddings[:, 1], **kwargs)
        return ax

    # If labels are attached, color them in different colors
    labels = np.array(labels)
    # First-appearance order keeps the series (and their colours) stable
    # between runs; iterating a set does not.
    distinct_labels = list(dict.fromkeys(labels.tolist()))
    for label in distinct_labels:
        toDraw = (labels == label)  # only draw these points this time

        ax.scatter(x=embeddings[toDraw, 0],
                   y=embeddings[toDraw, 1],
                   label=label,
                   **kwargs)
    # Build the legend once, after every series exists.
    if distinct_labels:
        ax.legend(loc='best')
    return ax

Пример #20

0

Показать файл

def main(dataset):
    """Save PCA, tSNE and UMAP scatter plots for each marker-selection algorithm.

    For every algorithm the marker indices are loaded from
    ``output/<dataset>_<alg>_markers_full.npz``, the data is restricted to
    those markers, and each 2-D projection is written as PDF and PNG under
    ``figures/dimred/``.
    """
    adata = getdata(dataset)

    def saveplot(coords, dimred):
        # Reads ``alg``, ``n_markers`` and ``adataproj`` from the enclosing
        # loop at call time.
        point_colors = adataproj.obs["y"].values % 9
        plt.figure()
        plt.scatter(coords[:, 0], coords[:, 1], s=2, c=point_colors,
                    cmap="Set1")
        plt.tick_params(axis="both", which="both", bottom=False,
                        labelbottom=False, left=False, labelleft=False)
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.pdf",
            format="pdf",
        )
        plt.savefig(
            f"figures/dimred/{dataset}_{alg}_{n_markers}markers_{dimred}.png",
            format="png",
        )
        plt.close()

    algorithms = (
        "cife",
        "bincife",
        "jmi",
        "binmim",
        "logreg",
        "t-test_overestim_var",
        "wilcoxon",
    )
    for alg in algorithms:
        archive = np.load(f"output/{dataset}_{alg}_markers_full.npz")
        markers = archive["markers"]
        # 2-D marker tables contribute their first column; flat lists are
        # truncated to the first ten entries.
        if len(markers.shape) > 1:
            markers = markers[:, 0].flatten()
        else:
            markers = markers[:10]
        n_markers = len(markers)
        adataproj = adata[:, markers].copy()
        plotprep(adataproj)

        print("Computing PCA coords")
        saveplot(pr.plot.pca(adataproj.X, 2, return_info=False), "pca")

        print("Computing tSNE coords")
        tsne_model = TSNE()
        saveplot(tsne_model.fit_transform(adataproj.X.toarray()), "tsne")

        print("Computing UMAP coords")
        umap_model = UMAP()
        saveplot(umap_model.fit_transform(adataproj.X), "umap")

Пример #21

0

Показать файл

Файл: plot.py Проект: umangv/picturedrocks

def umapfigure(adata, **scatterkwargs):
    """Plot ``adata`` on its UMAP coordinates, computing them when absent.

    If ``adata.obsm`` has no ``X_umap`` entry, a 30-component PCA is run
    when ``X_pca`` is missing or has fewer than 30 columns, and UMAP is
    fitted on the first 30 PCA columns. The result is cached in
    ``adata.obsm["X_umap"]``. ``scatterkwargs`` go to ``genericplot``.
    """
    if "X_umap" in adata.obsm_keys():
        return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)

    pca_missing = "X_pca" not in adata.obsm_keys()
    if pca_missing or adata.obsm["X_pca"].shape[1] < 30:
        # Centering is skipped for sparse matrices.
        pca(adata, 30, zero_center=not scipy.sparse.issparse(adata.X))

    reducer = UMAP()
    adata.obsm["X_umap"] = reducer.fit_transform(adata.obsm["X_pca"][:, :30])
    return genericplot(adata, adata.obsm["X_umap"], **scatterkwargs)

Пример #22

0

Показать файл

Файл: _ss_cluster.py Проект: ArjunSarathi/cellar

 def get(self, x, labels, clu, eval):
     """Cluster a label-guided UMAP embedding, keeping the known labels.

     ``x`` is embedded with ``labels`` passed as ``y``; ``clu.get`` then
     produces labels from the embedding, and every position whose input
     label is not -1 is overwritten with that input label.
     """
     reducer = UMAP(**self.kwargs)
     self.logger.info("Finding embeddings.")
     embedding = reducer.fit_transform(x, y=labels)
     merged_labels = clu.get(embedding, eval)
     # Positions with a label other than -1 keep their original value.
     known_positions = np.where(labels != -1)
     merged_labels[known_positions] = labels[known_positions]
     return merged_labels

Пример #23

0

Показать файл

class UMAP_Preprocessed:
    """Clusterer wrapper that reduces data with UMAP before clustering.

    ``clusterer`` starts as None and must be assigned (for example by a
    subclass) before ``fit_predict`` is called.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        self.preprocessor = UMAP(n_neighbors=30, min_dist=0, n_components=2)
        self.clusterer = None

    def fit_predict(self, X):
        """Embed ``X`` with the UMAP preprocessor and cluster the embedding.

        Raises:
            AttributeError: if ``self.clusterer`` has not been set.
        """
        # Fail before the UMAP fit; the exception type matches what calling
        # ``None.fit_predict`` raised, but with a usable message.
        if self.clusterer is None:
            raise AttributeError(
                "UMAP_Preprocessed.clusterer is not set; assign a clusterer "
                "before calling fit_predict")
        embedded = self.preprocessor.fit_transform(X)
        return self.clusterer.fit_predict(embedded)

Пример #24

0

Показать файл

Файл: visualize_predictions.py Проект: anuprulez/single_cell_analysis

 def plot_UMAP(self, features):
     """Show a seaborn scatter plot of a 2-component UMAP projection of ``features``."""
     reducer_settings = dict(n_components=2, init='random', random_state=0)
     reducer = UMAP(**reducer_settings)
     print("Computing projections...")
     projection = reducer.fit_transform(features)
     print("Plotting...")
     sns.scatterplot(data=projection)
     plt.grid(True)
     plt.show()

Пример #25

0

Показать файл

def umap_reduce(data, **kwargs):
    """Embed ``data`` with ``cumlUMAP``, falling back to ``UMAP`` on failure.

    ``kwargs`` are forwarded to whichever reducer is constructed. A warning
    is emitted when ``cumlUMAP`` raises RuntimeError or TypeError and the
    fallback is used.

    Returns:
        ``(embedding, reducer)``: the embedding and the reducer that
        produced it.
    """
    try:
        reducer = cumlUMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    except (RuntimeError, TypeError) as e:
        # warnings.warn expects a message string (or a Warning instance),
        # not an arbitrary exception object.
        warnings.warn(
            f"cumlUMAP failed ({type(e).__name__}: {e}); falling back to UMAP")
        reducer = UMAP(**kwargs)
        embedding = reducer.fit_transform(data)
    return embedding, reducer

Пример #26

0

Показать файл

def embed_umap(data):
    """Return the UMAP embedding of ``data``.

    Per the original author's note, ``data`` should be a numpy array on the
    CPU. The transform seed is taken from ``torch.initial_seed()``.
    """
    settings = {
        'metric': 'euclidean',
        'n_neighbors': 40,
        'transform_seed': torch.initial_seed(),
    }
    reducer = UMAP(**settings)
    return reducer.fit_transform(data)

Пример #27

0

Показать файл

Файл: DocumentFeatureVisualization.py Проект: aidowu1/Ades-NLP-Recepies

 def umapDataReductionTo2D(self):
     """
     Reduce the vectorized corpus to 2-D with UMAP (Uniform Manifold
     Approximation and Projection) and store the result, paired with the
     document ids, as the reduced-dimension feature data.
     :return: None
     """
     reducer = UMAP(n_components=2, random_state=1)
     embedding_2d = reducer.fit_transform(self.__vectorized_corpus)
     self.__reduced_dim_feature_data = FeatureMatrixData(embedding_2d,
                                                         self.__document_ids)

Пример #28

0

Показать файл

def reduceWithUMAP(vectors, size):
    """Reduce ``vectors`` to ``size`` components with a euclidean UMAP and return the result."""
    log(f'Reducing data to {size} features using UMAP (slow-ish)')
    reducer = UMAP(n_components=size,
                   metric='euclidean',
                   min_dist=0.1,
                   n_neighbors=15)
    return reducer.fit_transform(vectors)

Пример #29

0

Показать файл

Файл: pixplot.py Проект: herlai/pix-plot

def get_umap_projection(**kwargs):
    '''Return the path of the UMAP layout for ``kwargs['vecs']``.

    An existing layout file is reused when ``kwargs['use_cache']`` is
    truthy; otherwise UMAP is fitted with the ``n_neighbors``, ``min_dist``
    and ``metric`` entries of ``kwargs`` and the result is handed to
    ``write_layout``, whose return value is returned.
    '''
    print(' * creating UMAP layout')
    out_path = get_path('layouts', 'umap', **kwargs)
    # The cache flag is only consulted when the file already exists.
    reuse_cached = os.path.exists(out_path) and kwargs['use_cache']
    if reuse_cached:
        return out_path
    reducer = UMAP(n_neighbors=kwargs['n_neighbors'],
                   min_dist=kwargs['min_dist'],
                   metric=kwargs['metric'])
    positions = reducer.fit_transform(kwargs['vecs'])
    return write_layout(out_path, positions, **kwargs)

Пример #30

0

Показать файл

    def on_epoch_begin(self, model):
        """Score the model on every ``test_data`` split and log the results.

        For each ``(df, seed)`` pair the texts are vectorized with
        ``model.infer_vector``, reduced with UMAP and then Ivis to a single
        score per document, turned into outlier predictions (-1 / 1) and
        scored against ``df["outlier_label"]``. Each result row is appended
        to ``self.result_df``, which is rewritten to ``self.log_path`` after
        every split. ``self.epoch`` is incremented at the end.
        """
        print(
            f"\n----------------\n\nEnd of epoch {self.epoch}. Getting scores..."
        )
        scores = defaultdict(list)
        scores["epoch"] = self.epoch
        for df, seed in test_data:
            print("Vectorize...")

            docvecs = df["text"].progress_apply(lambda x: simple_preprocess(x))
            docvecs = docvecs.progress_apply(lambda x: model.infer_vector(x))

            print("Reduce dimensions...")
            dim_reducer = UMAP(metric="cosine",
                               set_op_mix_ratio=1.0,
                               n_components=256,
                               random_state=42)

            dim_reduced_vecs = dim_reducer.fit_transform(list(docvecs))

            print("Run ivis...")
            dim_reducer = Ivis(embedding_dims=1,
                               k=15,
                               model="maaten",
                               n_epochs_without_progress=10,
                               verbose=0)
            decision_scores = dim_reducer.fit_transform(dim_reduced_vecs)
            decision_scores = decision_scores.astype(float)

            print("Get and save scores...")
            preds = reject_outliers(decision_scores,
                                    iq_range=1.0 - contamination)
            preds = [-1 if x else 1 for x in preds]

            scores = get_scores(scores, df["outlier_label"], preds)
            scores["seed"] = seed
            scores_row = pd.DataFrame(scores, index=[0])
            print(
                f"Scores for epoch {self.epoch} | seed - {seed}:\n{scores_row}"
            )

            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported way to add the row.
            self.result_df = pd.concat([self.result_df, scores_row],
                                       ignore_index=True)
            self.result_df.to_csv(self.log_path, sep="\t")
        self.epoch += 1

Пример #31

0

Показать файл

Файл: transform.py Проект: alexeyche/alexeyche-junk

class TUmap(Transform):
    """
    Transform that embeds a feature pool with UMAP.

    n_neighbors:
        This determines the number of neighboring points used in local approximations
        of manifold structure. Larger values will result in more global structure being
        preserved at the loss of detailed local structure.
        In general this parameter should often be in the range 5 to 50,
        with a choice of 10 to 15 being a sensible default.
    min_dist:
        This controls how tightly the embedding is allowed to compress points together.
        Larger values ensure embedded points are more evenly distributed, while smaller
        values allow the algorithm to optimise more accurately with regard to local structure.
        Sensible values are in the range 0.001 to 0.5, with 0.1 being a reasonable default.
    metric:
        This determines the choice of metric used to measure distance in the input space.
        A wide variety of metrics are already coded, and a user defined function can be passed
        as long as it has been JITd by numba.
    n_components, spread, random_state:
        Forwarded unchanged to the ``UMAP`` constructor.
    """

    def __init__(
        self,
        n_neighbors=15,
        min_dist=0.1,
        metric="euclidean",
        n_components=2,
        spread=1.0,
        random_state=None
    ):
        self._inst = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            metric=metric,
            n_components=n_components,
            spread=spread,
            # Previously accepted but dropped, so a caller-supplied seed had
            # no effect; None keeps the old behaviour.
            random_state=random_state,
        )

    def transform(self, fp):
        """Fit UMAP on the pool's array and yield one ``Feature`` per embedding column."""
        x = FeaturePool(fp).array()
        logger.info("TUmap: starting UMAP transform ...")
        x_emb = self._inst.fit_transform(x)
        logger.info("TUamp: Done")

        for f_id in range(x_emb.shape[1]):
            yield Feature(
                "UMAP feature #{}".format(f_id),
                x_emb[:, f_id]
            )

    @staticmethod
    def plot_embedding(efp: FeaturePool, split_by=None):
        """Scatter-plot a 2-column embedding pool, optionally coloured by ``split_by.data``.

        Raises AssertionError when the pool's array does not have exactly
        two columns.
        """
        x = efp.array()
        assert x.shape[1] == 2, "Embedding is expected to be with the size 2 to plot, got {}".format(x.shape[1])
        fig = plt.figure(figsize=(7, 7))
        ax = fig.add_subplot(111)
        if split_by is not None:
            ax.scatter(x[:, 0], x[:, 1], c=split_by.data, alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool splitted by feature `{}`".format(split_by.name)
            )
        else:
            ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
            ax.set_title(
                "UMAP for a feature pool"
            )
        fig.show()

Пример #32

0

Показать файл

Файл: umap_transformer.py Проект: cmagnusb/machine_learning_examples

from datetime import datetime
from util import getKaggleMNIST
from sklearn.linear_model import LogisticRegression
from umap import UMAP

# Compare logistic-regression accuracy on the raw features with accuracy on
# a 10-component UMAP embedding, timing the UMAP and regression steps.

# get the data
Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

# Baseline: fit directly on the untransformed features and report the train
# score followed by the test score.
print("Score without transformation:")
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))


# Fit UMAP on the training set only; the test set is projected with the
# already fitted model via transform().
umapper = UMAP(n_neighbors=5, n_components=10)
t0 = datetime.now()
Ztrain = umapper.fit_transform(Xtrain)
print("umap fit_transform took:", datetime.now() - t0)
t0 = datetime.now()
Ztest = umapper.transform(Xtest)
print("umap transform took:", datetime.now() - t0)

# Same classifier, now trained and scored on the embeddings.
print("Score with transformation")
model = LogisticRegression()
t0 = datetime.now()
model.fit(Ztrain, Ytrain)
print("logistic regression fit took:", datetime.now() - t0)
print(model.score(Ztrain, Ytrain))
print(model.score(Ztest, Ytest))

Python UMAP.fit_transform примеры использования