Example #1
class Model:

    def __init__(self):
        self.knn = KNeighborsTransformer(n_neighbors=5, n_jobs=-1)

    def fit(self, dtm):
        self.knn.fit(dtm, dtm.index.tolist())

    def predict(self):
        pass
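
# A minimal usage sketch for the wrapper above. The document-term matrix
# `dtm` is hypothetical; note that `KNeighborsTransformer` ignores the `y`
# passed in `fit`, so the index list is purely informational.
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsTransformer

dtm = pd.DataFrame(np.random.RandomState(0).rand(20, 8),
                   index=["doc_%d" % i for i in range(20)])
model = Model()
model.fit(dtm)
# query the fitted transformer for each document's nearest neighbours
distances, indices = model.knn.kneighbors(dtm)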
Example #2
def test_transformers():
    """Test that AnnoyTransformer and KNeighborsTransformer give same results"""
    X = np.random.RandomState(42).randn(10, 2)

    knn = KNeighborsTransformer()
    Xt0 = knn.fit_transform(X)

    ann = AnnoyTransformer()
    Xt1 = ann.fit_transform(X)

    nms = NMSlibTransformer()
    Xt2 = nms.fit_transform(X)

    assert_array_almost_equal(Xt0.toarray(), Xt1.toarray(), decimal=5)
    assert_array_almost_equal(Xt0.toarray(), Xt2.toarray(), decimal=5)
Example #3
def test_isomap():
    # Test chaining KNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed'
    algorithm = 'auto'
    n_neighbors = 10

    X, _ = make_blobs(random_state=0)
    X2, _ = make_blobs(random_state=1)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors,
                              algorithm=algorithm,
                              mode='distance'),
        Isomap(n_neighbors=n_neighbors, metric='precomputed'))
    est_compact = Isomap(n_neighbors=n_neighbors,
                         neighbors_algorithm=algorithm)

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_array_almost_equal(Xt_chain, Xt_compact)

    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_array_almost_equal(Xt_chain, Xt_compact)
Example #4
def test_lof_novelty_true():
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X1 = rng.randn(40, 2)
    X2 = rng.randn(40, 2)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance"),
        LocalOutlierFactor(
            metric="precomputed",
            n_neighbors=n_neighbors,
            novelty=True,
            contamination="auto",
        ),
    )
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors,
                                     novelty=True,
                                     contamination="auto")

    pred_chain = est_chain.fit(X1).predict(X2)
    pred_compact = est_compact.fit(X1).predict(X2)
    assert_array_almost_equal(pred_chain, pred_compact)
Example #5
def test_spectral_embedding():
    # Test chaining KNeighborsTransformer and SpectralEmbedding
    n_neighbors = 5

    n_samples = 1000
    centers = np.array([
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ])
    S, true_labels = make_blobs(n_samples=n_samples,
                                centers=centers,
                                cluster_std=1.,
                                random_state=42)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralEmbedding(n_neighbors=n_neighbors,
                          affinity='precomputed',
                          random_state=42))
    est_compact = SpectralEmbedding(n_neighbors=n_neighbors,
                                    affinity='nearest_neighbors',
                                    random_state=42)
    St_compact = est_compact.fit_transform(S)
    St_chain = est_chain.fit_transform(S)
    assert_array_almost_equal(St_chain, St_compact)
Example #6
def test_tsne():
    # Test chaining KNeighborsTransformer and TSNE
    n_iter = 250
    perplexity = 5
    n_neighbors = int(3. * perplexity + 1)

    rng = np.random.RandomState(0)
    X = rng.randn(20, 2)

    for metric in ['minkowski', 'sqeuclidean']:

        # compare the chained version and the compact version
        est_chain = make_pipeline(
            KNeighborsTransformer(n_neighbors=n_neighbors,
                                  mode='distance',
                                  metric=metric),
            TSNE(metric='precomputed',
                 perplexity=perplexity,
                 method="barnes_hut",
                 random_state=42,
                 n_iter=n_iter))
        est_compact = TSNE(metric=metric,
                           perplexity=perplexity,
                           n_iter=n_iter,
                           method="barnes_hut",
                           random_state=42)

        Xt_chain = est_chain.fit_transform(X)
        Xt_compact = est_compact.fit_transform(X)
        assert_array_almost_equal(Xt_chain, Xt_compact)
Example #7
def get_kNN_score_torch(pairwise_distances, matching_matrix, n_neighbours=5):
    """Score how well the collections of persistent landscapes for each label
    are separated from each other, in the sense of the L2 distance in the
    Hilbert space of persistent landscapes.

    pairwise_distances -- torch tensor of shape (n_samples, n_samples).
    matching_matrix -- numpy array of shape (n_samples, n_samples);
        1 if two samples share a label, 0 otherwise.
    n_neighbours -- integer, number of nearest neighbors used to compute the score.

    Returns kNN_score, a real number between 0 and 1.
    """
    
    n_samples = pairwise_distances.size()[0]
    kNN_transformer = KNeighborsTransformer(mode='connectivity', metric='precomputed', n_neighbors=n_neighbours)
    connectivity_matrix = kNN_transformer.fit_transform(pairwise_distances.numpy()).toarray()
    #if(matching_matrix == 0):
    #    matching_matrix = labels.numpy()[:, np.newaxis] == labels.numpy()[np.newaxis, :]
    kNN_score = (np.sum(matching_matrix * connectivity_matrix) - n_samples) / (np.sum(connectivity_matrix) - n_samples)
    return kNN_score
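
# The commented-out lines above hint at how `matching_matrix` can be built
# from a 1-D label vector; a hedged sketch with synthetic data:
import numpy as np
import torch

labels = np.array([0, 0, 1, 1, 2, 2])
matching_matrix = (labels[:, np.newaxis] == labels[np.newaxis, :]).astype(float)

points = torch.randn(6, 3)
pairwise_distances = torch.cdist(points, points)  # zero diagonal
score = get_kNN_score_torch(pairwise_distances, matching_matrix, n_neighbours=2)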
Example #8
def test_kneighbors_regressor():
    # Test chaining KNeighborsTransformer and classifiers/regressors
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(40, 5) - 1
    X2 = 2 * rng.rand(40, 5) - 1
    y = rng.rand(40, 1)

    n_neighbors = 12
    radius = 1.5
    # We precompute more neighbors than necessary, so that a k-neighbors
    # estimator chained after a radius-neighbors transformer (and vice versa)
    # stays equivalent to the compact estimator.
    factor = 2

    k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
    k_trans_factor = KNeighborsTransformer(n_neighbors=int(n_neighbors *
                                                           factor),
                                           mode='distance')

    r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
    r_trans_factor = RadiusNeighborsTransformer(radius=int(radius * factor),
                                                mode='distance')

    k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    r_reg = RadiusNeighborsRegressor(radius=radius)

    test_list = [
        (k_trans, k_reg),
        (k_trans_factor, r_reg),
        (r_trans, r_reg),
        (r_trans_factor, k_reg),
    ]

    for trans, reg in test_list:
        # compare the chained version and the compact version
        reg_compact = clone(reg)
        reg_precomp = clone(reg)
        reg_precomp.set_params(metric='precomputed')

        reg_chain = make_pipeline(clone(trans), reg_precomp)

        y_pred_chain = reg_chain.fit(X, y).predict(X2)
        y_pred_compact = reg_compact.fit(X, y).predict(X2)
        assert_array_almost_equal(y_pred_chain, y_pred_compact)
Example #9
def test_explicit_diagonal():
    # Test that the diagonal is explicitly stored in the sparse graph
    n_neighbors = 5
    n_samples_fit, n_samples_transform, n_features = 20, 18, 10
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_samples_transform, n_features)

    nnt = KNeighborsTransformer(n_neighbors=n_neighbors)
    Xt = nnt.fit_transform(X)
    assert _has_explicit_diagonal(Xt)
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    Xt = nnt.transform(X)
    assert _has_explicit_diagonal(Xt)
    assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)

    # Using transform on new data should not always have zero diagonal
    X2t = nnt.transform(X2)
    assert not _has_explicit_diagonal(X2t)
Example #10
def _calculate_pairwise_distances(X,
                                  Y=None,
                                  metric='precomputed',
                                  n_neighbors=None):
    if metric in ('precomputed', 'ignore'):
        return X

    if n_neighbors is None:
        if metric == 'euclidean':
            X_pairwise = pairwise_distances(X,
                                            Y=Y,
                                            metric=metric,
                                            squared=True)
        elif metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X_pairwise = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2

            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
            X_pairwise = numpy.square(X_pairwise, out=X_pairwise)
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
        else:
            X_pairwise = pairwise_distances(X, Y=Y, metric=metric)
    else:
        if metric == 'correlation' or metric == 'cosine':
            # An in-place version of:
            # X = 1 - (1 - pairwise_distances(X, metric=metric)) ** 2

            X = pairwise_distances(X, Y=Y, metric=metric)
            X = numpy.subtract(1, X, out=X)
            X = numpy.square(X, out=X)
            X = numpy.subtract(1, X, out=X)
            metric = 'precomputed'

        if isinstance(n_neighbors, int):
            X_pairwise = KNeighborsTransformer(n_neighbors=n_neighbors,
                                               metric=metric).fit_transform(X)

        elif isinstance(n_neighbors, KNeighborsTransformer):
            X_pairwise = n_neighbors.fit_transform(X)

    if metric == 'correlation' or metric == 'cosine':
        if isinstance(X_pairwise, csr_matrix):
            X_pairwise.data = numpy.subtract(1,
                                             X_pairwise.data,
                                             out=X_pairwise.data)
        else:
            X_pairwise = numpy.subtract(1, X_pairwise, out=X_pairwise)
    else:
        if isinstance(X_pairwise, csr_matrix):
            X_pairwise.data = numpy.subtract(X_pairwise.max(),
                                             X_pairwise.data,
                                             out=X_pairwise.data)
        else:
            X_pairwise = numpy.subtract(X_pairwise.max(),
                                        X_pairwise,
                                        out=X_pairwise)

    return X_pairwise
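
# A hedged usage sketch for the helper above; it assumes `numpy`,
# `pairwise_distances`, `csr_matrix` and `KNeighborsTransformer` are already
# imported, just as the function itself does.
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=50, random_state=0)
dense = _calculate_pairwise_distances(X, metric='euclidean')  # dense similarities
sparse = _calculate_pairwise_distances(X, metric='euclidean',
                                       n_neighbors=10)  # sparse kNN graph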
Example #11

def test_sklearn_k_neighbours_transformer_connectivity(self):
    model, X_test = fit_classification_model(
        KNeighborsTransformer(n_neighbors=3, mode='connectivity'), 3)
    model_onnx = convert_sklearn(
        model,
        "KNN transformer",
        [("input", FloatTensorType((None, X_test.shape[1])))],
        target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X_test,
                        model,
                        model_onnx,
                        basename="SklearnKNNTransformerConnectivity")
Example #12

def test_sklearn_k_neighbours_transformer_distance(self):
    model, X_test = fit_classification_model(
        KNeighborsTransformer(n_neighbors=4, mode='distance'), 2)
    model_onnx = convert_sklearn(
        model,
        "KNN transformer",
        [("input", FloatTensorType((None, X_test.shape[1])))],
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnKNNTransformerDistance",
    )
Example #13

def test_spectral_clustering():
    # Test chaining KNeighborsTransformer and SpectralClustering
    n_neighbors = 5
    X, _ = make_blobs(random_state=0)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
        SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed',
                           random_state=42))
    est_compact = SpectralClustering(
        n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42)
    labels_compact = est_compact.fit_predict(X)
    labels_chain = est_chain.fit_predict(X)
    assert_array_almost_equal(labels_chain, labels_compact)
Example #14
def test_lof():
    # Test chaining KNeighborsTransformer and LocalOutlierFactor
    n_neighbors = 4

    rng = np.random.RandomState(0)
    X = rng.randn(40, 2)

    # compare the chained version and the compact version
    est_chain = make_pipeline(
        KNeighborsTransformer(n_neighbors=n_neighbors + 1, mode='distance'),
        LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors))
    est_compact = LocalOutlierFactor(n_neighbors=n_neighbors)

    pred_chain = est_chain.fit_predict(X)
    pred_compact = est_compact.fit_predict(X)
    assert_array_almost_equal(pred_chain, pred_compact)
Example #15
def train_knn():
    logging.info("Training KNN")
    latent_codes = []

    to_tensor = torchvision.transforms.ToTensor()

    for image in dataset:
        image = to_tensor(image).float()
        image = image.unsqueeze(dim=0)
        latent = ae_model.encode(image)
        latent = latent.detach().cpu()

        latent_codes.append(latent)

    latent_codes = np.vstack(latent_codes)

    global knn
    knn = KNeighborsTransformer().fit(latent_codes)
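
# Once fitted, the transformer can be queried directly. A hedged sketch:
# `query_image` is a hypothetical image preprocessed like the training set.
query = ae_model.encode(
    torchvision.transforms.ToTensor()(query_image).float().unsqueeze(dim=0))
distances, indices = knn.kneighbors(query.detach().cpu().numpy())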
Example #16
def test_transformer_result():
    # Test the number of neighbors returned
    n_neighbors = 5
    n_samples_fit = 20
    n_queries = 18
    n_features = 10

    rng = np.random.RandomState(42)
    X = rng.randn(n_samples_fit, n_features)
    X2 = rng.randn(n_queries, n_features)
    radius = np.percentile(euclidean_distances(X), 10)

    # with n_neighbors
    for mode in ["distance", "connectivity"]:
        add_one = mode == "distance"
        nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), )
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape == (n_queries * (n_neighbors + add_one), )
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)

    # with radius
    for mode in ["distance", "connectivity"]:
        add_one = mode == "distance"
        nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)
        Xt = nnt.fit_transform(X)
        assert Xt.shape == (n_samples_fit, n_samples_fit)
        assert Xt.data.shape != (n_samples_fit * (n_neighbors + add_one), )
        assert Xt.format == "csr"
        assert _is_sorted_by_data(Xt)

        X2t = nnt.transform(X2)
        assert X2t.shape == (n_queries, n_samples_fit)
        assert X2t.data.shape != (n_queries * (n_neighbors + add_one), )
        assert X2t.format == "csr"
        assert _is_sorted_by_data(X2t)
Example #17
def convert2graph(components):
    knn = KNeighborsTransformer(n_neighbors=10, n_jobs=-1)
    graph = knn.fit_transform(components)
    G = nx.Graph(graph)
    return G
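
# Note: passing a SciPy sparse matrix straight to `nx.Graph` only works on
# older networkx releases; on networkx >= 3.0 the conversion must be explicit.
# A hedged sketch with synthetic embeddings:
import numpy as np
import networkx as nx
from sklearn.neighbors import KNeighborsTransformer

components = np.random.RandomState(0).randn(100, 16)
graph = KNeighborsTransformer(n_neighbors=10, n_jobs=-1).fit_transform(components)
G = nx.from_scipy_sparse_array(graph)  # available since networkx 2.7
print(G.number_of_nodes(), G.number_of_edges())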
Example #18
def run_benchmark():
    datasets = [
        ("MNIST_2000", load_mnist(n_samples=2000)),
        ("MNIST_10000", load_mnist(n_samples=10000)),
    ]

    n_iter = 500
    perplexity = 30
    metric = "euclidean"
    # TSNE requires a certain number of neighbors which depends on the
    # perplexity parameter.
    # Add one since we include each sample as its own neighbor.
    n_neighbors = int(3.0 * perplexity + 1) + 1

    tsne_params = dict(
        perplexity=perplexity,
        method="barnes_hut",
        random_state=42,
        n_iter=n_iter,
        square_distances=True,
    )

    transformers = [
        ("AnnoyTransformer",
         AnnoyTransformer(n_neighbors=n_neighbors, metric=metric)),
        (
            "NMSlibTransformer",
            NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
        ),
        (
            "KNeighborsTransformer",
            KNeighborsTransformer(n_neighbors=n_neighbors,
                                  mode="distance",
                                  metric=metric),
        ),
        (
            "TSNE with AnnoyTransformer",
            make_pipeline(
                AnnoyTransformer(n_neighbors=n_neighbors, metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        (
            "TSNE with NMSlibTransformer",
            make_pipeline(
                NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        (
            "TSNE with KNeighborsTransformer",
            make_pipeline(
                KNeighborsTransformer(n_neighbors=n_neighbors,
                                      mode="distance",
                                      metric=metric),
                TSNE(metric="precomputed", **tsne_params),
            ),
        ),
        ("TSNE with internal NearestNeighbors",
         TSNE(metric=metric, **tsne_params)),
    ]

    # init the plot
    nrows = len(datasets)
    ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
    fig, axes = plt.subplots(nrows=nrows,
                             ncols=ncols,
                             squeeze=False,
                             figsize=(5 * ncols, 4 * nrows))
    axes = axes.ravel()
    i_ax = 0

    for dataset_name, (X, y) in datasets:

        msg = "Benchmarking on %s:" % dataset_name
        print("\n%s\n%s" % (msg, "-" * len(msg)))

        for transformer_name, transformer in transformers:
            start = time.time()
            Xt = transformer.fit_transform(X)
            duration = time.time() - start

            # print the duration report
            longest = np.max([len(name) for name, model in transformers])
            whitespaces = " " * (longest - len(transformer_name))
            print("%s: %s%.3f sec" % (transformer_name, whitespaces, duration))

            # plot TSNE embedding which should be very similar across methods
            if "TSNE" in transformer_name:
                axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
                axes[i_ax].scatter(
                    Xt[:, 0],
                    Xt[:, 1],
                    c=y.astype(np.int32),
                    alpha=0.2,
                    cmap=plt.cm.viridis,
                )
                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
                axes[i_ax].axis("tight")
                i_ax += 1

    fig.tight_layout()
    plt.show()
Example #19

# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode="distance"),
        Isomap(n_neighbors=10, metric="precomputed"),
        memory=tmpdir,
    )
    estimator.fit(X)

    # We can decrease the number of neighbors and the graph will not be
    # recomputed.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)

# %%
# KNN Based Imputation
# ------------------------------------
# We now support imputation for completing missing values using k-Nearest
# Neighbors.
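
# The snippet stops short of showing the imputer itself; a minimal sketch of
# :class:`~sklearn.impute.KNNImputer`, mirroring the release-highlights example:
import numpy as np
from sklearn.impute import KNNImputer

X_missing = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2)
print(imputer.fit_transform(X_missing))
# each missing entry is filled with the mean of that feature over the two
# nearest rows for which the feature is present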
Example #20
def __init__(self):
    self.knn = KNeighborsTransformer(n_neighbors=5, n_jobs=-1)
Example #21
def run_benchmark():
    datasets = [
        ('MNIST_2000', load_mnist(n_samples=2000)),
        ('MNIST_10000', load_mnist(n_samples=10000)),
    ]

    n_iter = 500
    perplexity = 30
    # TSNE requires a certain number of neighbors which depends on the
    # perplexity parameter.
    # Add one since we include each sample as its own neighbor.
    n_neighbors = int(3. * perplexity + 1) + 1

    transformers = [
        ('AnnoyTransformer',
         AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean')),
        ('NMSlibTransformer',
         NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean')),
        ('KNeighborsTransformer',
         KNeighborsTransformer(n_neighbors=n_neighbors,
                               mode='distance',
                               metric='sqeuclidean')),
        ('TSNE with AnnoyTransformer',
         make_pipeline(
             AnnoyTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
             TSNE(metric='precomputed',
                  perplexity=perplexity,
                  method="barnes_hut",
                  random_state=42,
                  n_iter=n_iter),
         )),
        ('TSNE with NMSlibTransformer',
         make_pipeline(
             NMSlibTransformer(n_neighbors=n_neighbors, metric='sqeuclidean'),
             TSNE(metric='precomputed',
                  perplexity=perplexity,
                  method="barnes_hut",
                  random_state=42,
                  n_iter=n_iter),
         )),
        ('TSNE with KNeighborsTransformer',
         make_pipeline(
             KNeighborsTransformer(n_neighbors=n_neighbors,
                                   mode='distance',
                                   metric='sqeuclidean'),
             TSNE(metric='precomputed',
                  perplexity=perplexity,
                  method="barnes_hut",
                  random_state=42,
                  n_iter=n_iter),
         )),
        ('TSNE with internal NearestNeighbors',
         TSNE(metric='sqeuclidean',
              perplexity=perplexity,
              method="barnes_hut",
              random_state=42,
              n_iter=n_iter)),
    ]

    # init the plot
    nrows = len(datasets)
    ncols = np.sum([1 for name, model in transformers if 'TSNE' in name])
    fig, axes = plt.subplots(nrows=nrows,
                             ncols=ncols,
                             squeeze=False,
                             figsize=(5 * ncols, 4 * nrows))
    axes = axes.ravel()
    i_ax = 0

    for dataset_name, (X, y) in datasets:

        msg = 'Benchmarking on %s:' % dataset_name
        print('\n%s\n%s' % (msg, '-' * len(msg)))

        for transformer_name, transformer in transformers:
            start = time.time()
            Xt = transformer.fit_transform(X)
            duration = time.time() - start

            # print the duration report
            longest = np.max([len(name) for name, model in transformers])
            whitespaces = ' ' * (longest - len(transformer_name))
            print('%s: %s%.3f sec' % (transformer_name, whitespaces, duration))

            # plot TSNE embedding which should be very similar across methods
            if 'TSNE' in transformer_name:
                axes[i_ax].set_title(transformer_name + '\non ' + dataset_name)
                axes[i_ax].scatter(Xt[:, 0],
                                   Xt[:, 1],
                                   c=y,
                                   alpha=0.2,
                                   cmap=plt.cm.viridis)
                axes[i_ax].xaxis.set_major_formatter(NullFormatter())
                axes[i_ax].yaxis.set_major_formatter(NullFormatter())
                axes[i_ax].axis('tight')
                i_ax += 1

    fig.tight_layout()
    plt.show()
Example #22
from tempfile import TemporaryDirectory

import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline

print(__doc__)

X, y = load_digits(return_X_y=True)
n_neighbors_list = [1, 2, 3, 4, 5, 6, 7, 8, 9]

# The transformer computes the nearest neighbors graph using the maximum number
# of neighbors necessary in the grid search. The classifier model filters the
# nearest neighbors graph as required by its own n_neighbors parameter.
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode="distance")
classifier_model = KNeighborsClassifier(metric="precomputed")

# Note that we give `memory` a directory to cache the graph computation
# that will be used several times when tuning the hyperparameters of the
# classifier.
with TemporaryDirectory(prefix="sklearn_graph_cache_") as tmpdir:
    full_model = Pipeline(
        steps=[("graph", graph_model), ("classifier", classifier_model)], memory=tmpdir
    )

    param_grid = {"classifier__n_neighbors": n_neighbors_list}
    grid_model = GridSearchCV(full_model, param_grid)
    grid_model.fit(X, y)

# Plot the results of the grid search.
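# A hedged sketch of what the plot could look like, reading the grid-search
# scores from ``cv_results_``:
fig, ax = plt.subplots()
ax.errorbar(
    x=n_neighbors_list,
    y=grid_model.cv_results_["mean_test_score"],
    yerr=grid_model.cv_results_["std_test_score"],
)
ax.set(xlabel="n_neighbors", ylabel="Classification accuracy")
plt.show()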
Example #23
        """A function to take a feature and tokenize then return a tfidf df of that input
        """
        self.tokenizer.fit_on_texts(feature)
        a = self.tokenizer.texts_to_matrix(feature, mode='tfidf')
        config = self.tokenizer.get_config()
        feature_names = json_normalize(loads(
            config['word_index'])).columns.tolist()
        dtm = pd.DataFrame(a)
        return dtm


if __name__ == "__main__":
    tr = Transformer()
    negative = ['negative']
    ignore = []
    user_transformed, y = tr.transform(
        pd.DataFrame({
            'name':
            "blue berry kush",
            'race':
            'sativa',
            'flavors': ['blueberry', 'sweet'],
            'negative': ['dry mouth', 'dry eyes'],
            'positive': ['creativity', 'stress'],
            'medical': ['ptsd', 'stress'],
            'description':
            "blueberry kush my dude blueberry_kush:10, whitewhidow:10 ",
        }), negative, ignore)
    model = KNeighborsTransformer()
    model.fit(user_transformed)  # fit needs the transformed feature matrix
Example #24
# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
    estimator = make_pipeline(
        KNeighborsTransformer(n_neighbors=10, mode='distance'),
        Isomap(n_neighbors=10, metric='precomputed'),
        memory=tmpdir)
    estimator.fit(X)

    # We can decrease the number of neighbors and the graph will not be
    # recomputed.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)

############################################################################
# Stacking Classifier and Regressor
# ---------------------------------
# :class:`~ensemble.StackingClassifier` and
# :class:`~ensemble.StackingRegressor`
# allow you to have a stack of estimators with a final classifier or
Example #25
def weighted_knn(train_adata,
                 valid_adata,
                 label_key,
                 n_neighbors=50,
                 threshold=0.5,
                 pred_unknown=True):
    """Annotates ``valid_adata`` cells with a trained weighted KNN classifier on ``train_adata``.

        Parameters
        ----------
        train_adata: :class:`~anndata.AnnData`
            Annotated dataset to be used to train KNN classifier with ``label_key`` as the target variable.
        valid_adata: :class:`~anndata.AnnData`
            Annotated dataset to be used to validate KNN classifier.
        label_key: str
            Name of the column to be used as target variable (e.g. cell_type) in ``train_adata`` and ``valid_adata``.
        n_neighbors: int
            Number of nearest neighbors in KNN classifier.
        threshold: float
            Uncertainty threshold used to annotate cells as "Unknown"; cells with
            uncertainty above this value are annotated as "Unknown".
        pred_unknown: bool
            ``True`` by default. Whether to annotate cells as "Unknown" at all. If ``False``,
            ``threshold`` is ignored and each cell is annotated with the most common label
            among its ``n_neighbors`` nearest cells.
    """
    print(
        f'Weighted KNN with n_neighbors = {n_neighbors} and threshold = {threshold} ... ',
        end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    stds = np.std(top_k_distances, axis=1)
    stds = (2. / stds)**2
    stds = stds.reshape(-1, 1)

    top_k_distances_tilda = np.exp(-np.true_divide(top_k_distances, stds))

    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i in range(len(weights)):
        unique_labels = np.unique(y_train_labels[top_k_indices[i]])
        best_label, best_prob = None, 0.0
        for candidate_label in unique_labels:
            candidate_prob = weights[i, y_train_labels[top_k_indices[i]] ==
                                     candidate_label].sum()
            if best_prob < candidate_prob:
                best_prob = candidate_prob
                best_label = candidate_label

        if pred_unknown:
            if best_prob >= threshold:
                pred_label = best_label
            else:
                pred_label = 'Unknown'
        else:
            pred_label = best_label

        if pred_label == y_valid_labels[i]:
            uncertainties.append(max(1 - best_prob, 0))
        else:
            true_prob = weights[i, y_train_labels[top_k_indices[i]] ==
                                y_valid_labels[i]].sum()
            if true_prob > 0.5:
                pass
            uncertainties.append(max(1 - true_prob, 0))

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1, )
    uncertainties = np.array(uncertainties).reshape(-1, )

    labels_eval = pred_labels == y_valid_labels
    labels_eval = labels_eval.astype(object)

    n_correct = len(labels_eval[labels_eval == True])
    n_incorrect = len(labels_eval[labels_eval == False]) - len(
        labels_eval[pred_labels == 'Unknown'])
    n_unknown = len(labels_eval[pred_labels == 'Unknown'])

    labels_eval[labels_eval == True] = 'Correct'
    labels_eval[labels_eval == False] = 'InCorrect'
    labels_eval[pred_labels == 'Unknown'] = 'Unknown'

    valid_adata.obs['uncertainty'] = uncertainties
    valid_adata.obs[f'pred_{label_key}'] = pred_labels
    valid_adata.obs['evaluation'] = labels_eval

    print('finished!')
    print(f"Number of correctly classified samples: {n_correct}")
    print(f"Number of misclassified samples: {n_incorrect}")
    print(f"Number of samples classified as unknown: {n_unknown}")
Example #26

# To use this feature in a pipeline, one can use the `memory` parameter, along
# with one of the two new transformers,
# :class:`neighbors.KNeighborsTransformer` and
# :class:`neighbors.RadiusNeighborsTransformer`. The precomputation
# can also be performed by custom estimators to use alternative
# implementations, such as approximate nearest neighbors methods.
# See more details in the :ref:`User Guide <neighbors_transformer>`.

from tempfile import TemporaryDirectory
from sklearn.neighbors import KNeighborsTransformer
from sklearn.manifold import Isomap
from sklearn.pipeline import make_pipeline

X, y = make_classification(random_state=0)

with TemporaryDirectory(prefix="sklearn_cache_") as tmpdir:
    estimator = make_pipeline(KNeighborsTransformer(n_neighbors=10,
                                                    mode='distance'),
                              Isomap(n_neighbors=10, metric='precomputed'),
                              memory=tmpdir)
    estimator.fit(X)

    # We can decrease the number of neighbors and the graph will not be
    # recomputed.
    estimator.set_params(isomap__n_neighbors=5)
    estimator.fit(X)

# %%
# KNN Based Imputation
# ------------------------------------
# We now support imputation for completing missing values using k-Nearest
# Neighbors.
#
Example #27
def weighted_knn(train_adata,
                 valid_adata,
                 label_key,
                 n_neighbors=50,
                 threshold=0.5,
                 pred_unknown=True,
                 return_uncertainty=True):
    """
    Taken from scnet:
    https://github.com/theislab/scarches/blob/e84cfa5cf361bb22fd70865cb1f398af72248684/scnet/utils.py
    """
    print(
        f'Weighted KNN with n_neighbors = {n_neighbors} and threshold = {threshold} ... ',
        end='')
    k_neighbors_transformer = KNeighborsTransformer(n_neighbors=n_neighbors,
                                                    mode='distance',
                                                    algorithm='brute',
                                                    metric='euclidean',
                                                    n_jobs=-1)
    train_adata = remove_sparsity(train_adata)
    valid_adata = remove_sparsity(valid_adata)

    k_neighbors_transformer.fit(train_adata.X)

    y_train_labels = train_adata.obs[label_key].values
    y_valid_labels = valid_adata.obs[label_key].values

    top_k_distances, top_k_indices = k_neighbors_transformer.kneighbors(
        X=valid_adata.X)

    stds = np.std(top_k_distances, axis=1)
    stds = (2. / stds)**2
    stds = stds.reshape(-1, 1)

    top_k_distances_tilda = np.exp(-np.true_divide(top_k_distances, stds))

    weights = top_k_distances_tilda / np.sum(
        top_k_distances_tilda, axis=1, keepdims=True)

    uncertainties = []
    pred_labels = []
    for i in range(len(weights)):
        # labels = y_train_labels[top_k_indices[i]]
        most_common_label, _ = Counter(
            y_train_labels[top_k_indices[i]]).most_common(n=1)[0]
        most_prob = weights[i, y_train_labels[top_k_indices[i]] ==
                            most_common_label].sum()
        if pred_unknown:
            if most_prob >= threshold:
                pred_label = most_common_label
            else:
                pred_label = 'Unknown'
        else:
            pred_label = most_common_label

        if pred_label == y_valid_labels[i]:
            uncertainties.append(1 - most_prob)
        else:
            true_prob = weights[i, y_train_labels[top_k_indices[i]] ==
                                y_valid_labels[i]].sum()
            uncertainties.append(1 - true_prob)

        pred_labels.append(pred_label)

    pred_labels = np.array(pred_labels).reshape(-1, 1)
    uncertainties = np.array(uncertainties).reshape(-1, 1)

    print('finished!')
    if return_uncertainty:
        return pred_labels, uncertainties
    else:
        return pred_labels