예제 #1
0
def test_warm_start_validation():
    """Refitting a warm-started NCA on data with fewer features must fail.

    The estimator is fitted on 5-feature data and then refitted on 4-feature
    data; the second ``fit`` is expected to raise a ``ValueError`` whose
    message names both dimensionalities.
    """
    # Keyword arguments shared by both synthetic datasets.
    common = dict(n_samples=30, n_classes=4, n_redundant=0, random_state=0)

    X, y = make_classification(n_features=5, n_informative=5, **common)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_narrow, y_narrow = make_classification(
        n_features=4, n_informative=4, **common
    )

    new_dim = X_narrow.shape[1]
    learned_dim = nca.components_.shape[1]
    expected = (
        "The new inputs dimensionality ({}) "
        "does not match the input dimensionality of the previously learned "
        "transformation ({})."
    ).format(new_dim, learned_dim)
    with pytest.raises(ValueError, match=re.escape(expected)):
        nca.fit(X_narrow, y_narrow)
예제 #2
0
def test_no_verbose(capsys):
    """Fitting with the default settings (verbose=0) prints nothing."""
    estimator = NeighborhoodComponentsAnalysis()
    estimator.fit(iris_data, iris_target)
    # readouterr() yields (stdout, stderr); only stdout is checked.
    captured_stdout = capsys.readouterr()[0]
    assert captured_stdout == ''
예제 #3
0
def test_n_components():
    """Check how ``fit`` validates ``n_components`` against ``init`` and X."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]

    # Precomputed transformation with one output dimension fewer than X has.
    init = rng.rand(n_features - 1, 3)

    # Case 1: n_components == n_features, which differs from init.shape[0].
    n_components = n_features
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    mismatch_msg = (
        'The preferred dimensionality of the '
        'projected space `n_components` ({}) does not match '
        'the output dimensionality of the given '
        'linear transformation `init` ({})!'
    ).format(n_components, init.shape[0])
    assert_raise_message(ValueError, mismatch_msg, nca.fit, X, y)

    # Case 2: n_components larger than the number of features.
    n_components = n_features + 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    too_large_msg = (
        'The preferred dimensionality of the '
        'projected space `n_components` ({}) cannot '
        'be greater than the given data '
        'dimensionality ({})!'
    ).format(n_components, n_features)
    assert_raise_message(ValueError, too_large_msg, nca.fit, X, y)

    # Case 3: n_components smaller than the number of features must fit.
    NeighborhoodComponentsAnalysis(n_components=2, init='identity').fit(X, y)
예제 #4
0
def test_one_class():
    """Fit NCA on the class-0 iris samples only, starting from identity.

    The test asserts that transforming the data afterwards returns it
    unchanged.
    """
    is_class_zero = iris_target == 0
    X = iris_data[is_class_zero]
    y = iris_target[is_class_zero]

    n_features = X.shape[1]
    nca = NeighborhoodComponentsAnalysis(
        init='identity', n_components=n_features, max_iter=30
    )
    nca.fit(X, y)

    transformed = nca.transform(X)
    assert_array_equal(X, transformed)
예제 #5
0
def test_parameters_valid_types(param, value):
    """Fit NCA on iris with the single constructor argument ``param=value``.

    Checks that no error is raised when parameters have numpy integer or
    floating types; the test passes as long as ``fit`` completes.
    """
    constructor_kwargs = {param: value}
    estimator = NeighborhoodComponentsAnalysis(**constructor_kwargs)
    estimator.fit(iris_data, iris_target)
예제 #6
0
파일: compare.py 프로젝트: vu-minh/hc-tsne
def run_nca(args):
    """Fit a 2-component NCA on the training data and embed both splits.

    Reads ``X_train``, ``y_train`` and ``X_test`` from module scope.

    :param args: object whose ``nca_init`` attribute is passed as ``init``.
    :return: tuple ``(Z, Z_test)`` with the embedded train and test data.
    """
    embedder = NeighborhoodComponentsAnalysis(
        n_components=2,
        init=args.nca_init,
        max_iter=100,
        verbose=2,
        random_state=42,
    )
    embedder.fit(X_train, y_train)
    return embedder.transform(X_train), embedder.transform(X_test)
예제 #7
0
def KPPVNCA(X_train, y_train, X_test, y_test, k):
    """Score a k-NN classifier trained in the space learned by NCA.

    NCA is fitted on the training split, a ``k``-neighbour classifier is
    fitted on the transformed training data, and its ``score`` on the
    transformed test split is returned.
    """
    projector = NeighborhoodComponentsAnalysis()
    projector.fit(X_train, y_train)

    classifier = KNeighborsClassifier(n_neighbors=k)
    train_embedded = projector.transform(X_train)
    classifier.fit(train_embedded, y_train)

    test_embedded = projector.transform(X_test)
    return classifier.score(test_embedded, y_test)
예제 #8
0
def test_warm_start_effectiveness():
    """Compare a one-iteration refit with and without warm starting.

    A 1-iteration second fit on the same data should give almost the same
    result with warm starting, and a quite different result without it.
    """

    def one_iteration_shift(warm_start):
        # Fit fully, refit for a single iteration, and return the summed
        # absolute change of the learned components between the two fits.
        nca = NeighborhoodComponentsAnalysis(warm_start=warm_start,
                                             random_state=0)
        nca.fit(iris_data, iris_target)
        before = nca.components_
        nca.max_iter = 1
        nca.fit(iris_data, iris_target)
        after = nca.components_
        return np.sum(np.abs(after - before))

    diff_warm = one_iteration_shift(True)
    diff_cold = one_iteration_shift(False)

    assert diff_warm < 3.0, ("Transformer changed significantly after one "
                             "iteration even though it was warm-started.")

    assert diff_cold > diff_warm, ("Cold-started transformer changed less "
                                   "significantly than warm-started "
                                   "transformer after one iteration.")
예제 #9
0
def test_simple_example():
    """Test on a simple example.

    Four points are placed so that, in the input space, each point's
    closest neighbour carries the opposite label. After the learned
    transform, each point's closest neighbour must be the other sample of
    its own class.
    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])

    nca = NeighborhoodComponentsAnalysis(
        n_components=2, init="identity", random_state=42
    )
    X_t = nca.fit(X, y).transform(X)

    # Column 0 of the argsort is each point itself (distance 0), so column 1
    # is the index of its nearest other point.
    nearest_other = pairwise_distances(X_t).argsort()[:, 1]
    assert_array_equal(nearest_other, np.array([2, 3, 0, 1]))
예제 #10
0
def nca_clustering(X_train, X_test, y_train, y_test, parameters):
    """Train and time an NCA + k-NN classifier, then score it.

    :param parameters: mapping providing ``"k"`` (number of neighbours) and
        ``"distance"`` (k-NN metric).
    :return: tuple ``(accuracy, train_cpu_time, test_cpu_time)`` where the
        times are ``time.process_time()`` differences.
    """
    nca = NeighborhoodComponentsAnalysis()
    knn = KNeighborsClassifier(n_jobs=-1,
                               n_neighbors=parameters["k"],
                               metric=parameters["distance"])

    # Training time covers the NCA fit plus the k-NN fit on transformed data.
    train_begin = time.process_time()
    nca.fit(X_train, y_train)
    classifier = knn.fit(nca.transform(X_train), y_train)
    train_finish = time.process_time()

    # The predictions themselves are not used; the call is made so that the
    # transform + predict cost can be measured.
    test_begin = time.process_time()
    y_pred = classifier.predict(nca.transform(X_test))
    test_finish = time.process_time()

    accuracy = classifier.score(nca.transform(X_test), y_test)

    train_cpu_time = train_finish - train_begin
    test_cpu_time = test_finish - test_begin
    return accuracy, train_cpu_time, test_cpu_time
예제 #11
0
def test_verbose(init_name, capsys):
    """Validate the log printed by ``fit`` when ``verbose=1``.

    Runs for every initialization except auto, because auto will call one
    of the others. ``"precomputed"`` is replaced by a random square matrix.
    """
    tag = "[NeighborhoodComponentsAnalysis]"
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    if init_name == "precomputed":
        n_features = X.shape[1]
        init = rng.randn(n_features, n_features)
    else:
        init = init_name

    nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    nca.fit(X, y)
    captured_stdout = capsys.readouterr()[0]
    lines = re.split("\n+", captured_stdout)

    # pca and lda print one extra leading line; check it and drop it so the
    # remaining checks are the same for every initialization.
    done_in = r"... done in \ *\d+\.\d{2}s"
    banner_by_init = {
        "pca": "Finding principal components" + done_in,
        "lda": "Finding most discriminative components" + done_in,
    }
    banner = banner_by_init.get(init_name)
    if banner is not None:
        assert re.match(banner, lines[0])
        lines = lines[1:]

    header = "{:>10} {:>20} {:>10}".format(
        "Iteration", "Objective Value", "Time(s)"
    )
    assert lines[0] == tag
    assert lines[1] == tag + " " + header
    assert lines[2] == tag + " " + "-" * len(header)

    # Matches for instance:
    # '[NeighborhoodComponentsAnalysis]  0    6.988936e+01   0.01'
    iteration_row = re.compile(
        r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e"
        r"[+|-]\d+\ *\d+\.\d{2}"
    )
    for row in lines[3:-2]:
        assert iteration_row.match(row)

    training_took = (
        r"\[NeighborhoodComponentsAnalysis\] Training took\ *"
        r"\d+\.\d{2}s\."
    )
    assert re.match(training_took, lines[-2])
    assert lines[-1] == ""
def _neighborhoodcomponentsanalysis(*,
                                    train,
                                    test,
                                    x_predict=None,
                                    metrics,
                                    n_components=None,
                                    init='auto',
                                    warm_start=False,
                                    max_iter=50,
                                    tol=1e-05,
                                    callback=None,
                                    verbose=0,
                                    random_state=None):
    """
    Fit NCA followed by a k-nearest-neighbours classifier and score it.

    ``NeighborhoodComponentsAnalysis`` only learns a linear transformation:
    it provides ``transform`` but no ``predict``. Predictions are therefore
    made by a pipeline that applies NCA and then a ``KNeighborsClassifier``
    with default settings.

    :param train: pair ``(X_train, y_train)``.
    :param test: pair ``(X_test, y_test)``.
    :param x_predict: optional extra samples to predict.
    :param metrics: one of ``'accuracy'``, ``'f1'`` or ``'jaccard'``.
    :return: tuple ``(model_name, score, y_predict)``; ``y_predict`` is
        ``None`` when ``x_predict`` is ``None``.
    :raises ValueError: if ``metrics`` is not one of the supported names.

    The remaining keyword arguments are forwarded unchanged to
    ``NeighborhoodComponentsAnalysis``. For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NeighborhoodComponentsAnalysis.html#sklearn.neighbors.NeighborhoodComponentsAnalysis
    """
    # Reject unknown metric names before doing any work; previously the
    # score variable was simply left unbound in that case.
    supported_metrics = ('accuracy', 'f1', 'jaccard')
    if metrics not in supported_metrics:
        raise ValueError('Unsupported metrics {!r}; expected one of {}.'.format(
            metrics, ', '.join(supported_metrics)))

    # Imported locally so this function does not depend on the module's
    # top-level import block providing these two names.
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline

    nca = NeighborhoodComponentsAnalysis(n_components=n_components,
                                         init=init,
                                         warm_start=warm_start,
                                         max_iter=max_iter,
                                         tol=tol,
                                         callback=callback,
                                         verbose=verbose,
                                         random_state=random_state)
    model = make_pipeline(nca, KNeighborsClassifier())
    model.fit(train[0], train[1])
    model_name = 'Neighborhood Components Analysis'
    y_hat = model.predict(test[0])

    if metrics == 'accuracy':
        score = accuracy_score(test[1], y_hat)
    elif metrics == 'f1':
        score = f1_score(test[1], y_hat)
    else:
        score = jaccard_score(test[1], y_hat)

    y_predict = None if x_predict is None else model.predict(x_predict)
    return (model_name, score, y_predict)
def nca_clustering(X_train, X_test, y_train, y_test, parameters,
                   evaluation_metrics):
    """Train an NCA + k-NN classifier and record its test accuracy.

    :param parameters: raw parameters; passed through ``prepare_parameters``
        and then read for the keys ``"k"``, ``"distance"`` and
        ``"minkowski_p"``.
    :param evaluation_metrics: mapping updated in place with the key
        ``"accuracy"``.
    :return: the same ``evaluation_metrics`` mapping.
    """
    # modify parameters to call the clustering algorithm with modified ones,
    # this mainly concerns the distance parameter
    modified_parameters = prepare_parameters(parameters)

    # Learn the transformation first: k-NN operates on the transformed data,
    # so anything derived from the data (the Mahalanobis covariance below)
    # has to be computed in that space too.
    nca = NeighborhoodComponentsAnalysis()
    nca.fit(X_train, y_train)
    X_train_nca = nca.transform(X_train)
    X_test_nca = nca.transform(X_test)

    knn_kwargs = {
        "n_jobs": -1,
        "n_neighbors": modified_parameters["k"],
        "metric": modified_parameters["distance"],
        "p": modified_parameters["minkowski_p"],
    }

    if modified_parameters["distance"] == "mahalanobis":
        try:
            # rowvar=False: rows are samples and columns are features, so
            # the result is the (n_features, n_features) matrix that the
            # Mahalanobis metric expects as VI after inversion.
            inverse_covariance = np.linalg.inv(
                np.cov(X_train_nca, rowvar=False))
        except np.linalg.LinAlgError:
            print_warning(
                "[Generator & Evaluator] <Warning> Error happened while running NCA, setting distance to euclidean & running again ..."
            )
            knn_kwargs["metric"] = "euclidean"
        else:
            knn_kwargs["algorithm"] = "brute"
            knn_kwargs["metric_params"] = {"VI": inverse_covariance}

    classifier = KNeighborsClassifier(**knn_kwargs).fit(X_train_nca, y_train)

    evaluation_metrics["accuracy"] = classifier.score(X_test_nca, y_test)

    return evaluation_metrics
예제 #14
0
def test_verbose(init_name, capsys):
    """Check the stdout log of ``fit`` with ``verbose=1`` line by line.

    Covers every initialization except auto, because auto will call one of
    the others. For ``'precomputed'`` a random square matrix is used as
    ``init``.
    """
    prefix = '[NeighborhoodComponentsAnalysis]'
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    init = init_name
    if init_name == 'precomputed':
        side = X.shape[1]
        init = rng.randn(side, side)

    estimator = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    estimator.fit(X, y)
    stdout_text = capsys.readouterr()[0]
    lines = re.split('\n+', stdout_text)

    # With pca or lda init an additional first line is printed: verify it,
    # then remove it so the rest is tested equally among initializations.
    timing_tail = r'... done in \ *\d+\.\d{2}s'
    first_line_patterns = {
        'pca': "Finding principal components" + timing_tail,
        'lda': "Finding most discriminative components" + timing_tail,
    }
    if init_name in first_line_patterns:
        assert re.match(first_line_patterns[init_name], lines[0])
        lines = lines[1:]

    header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value',
                                           'Time(s)')
    underline = '-' * len(header)
    assert lines[0] == prefix
    assert lines[1] == prefix + ' ' + header
    assert lines[2] == prefix + ' ' + underline

    # Example of a matching line:
    # '[NeighborhoodComponentsAnalysis]  0    6.988936e+01   0.01'
    row_pattern = (r'\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e'
                   r'[+|-]\d+\ *\d+\.\d{2}')
    for row in lines[3:-2]:
        assert re.match(row_pattern, row)

    summary_pattern = (r'\[NeighborhoodComponentsAnalysis\] Training took\ *'
                       r'\d+\.\d{2}s\.')
    assert re.match(summary_pattern, lines[-2])
    assert lines[-1] == ''
예제 #15
0
def test_singleton_class():
    """Fit NCA on label sets that contain classes with a single sample.

    Three scenarios are exercised in turn: one singleton class, one
    non-singleton class, and only singleton classes. In the last one, with
    identity initialization, the transform must return the data unchanged.
    """
    # Work on copies: the labels are rewritten in place below, and the
    # module-level iris fixtures are shared with the other tests.
    X = iris_data.copy()
    y = iris_target.copy()

    # one singleton class
    singleton_class = 1
    ind_singleton, = np.where(y == singleton_class)
    y[ind_singleton] = 2
    y[ind_singleton[0]] = singleton_class

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # One non-singleton class
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    y[ind_1] = 0
    y[ind_1[0]] = 1
    y[ind_2] = 0
    y[ind_2[0]] = 2

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # Only singleton classes: keep the first sample of each class
    ind_0, = np.where(y == 0)
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    first_of_each = [ind_0[0], ind_1[0], ind_2[0]]
    X = X[first_of_each]
    y = y[first_of_each]

    nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30)
    nca.fit(X, y)
    assert_array_equal(X, nca.transform(X))
예제 #16
0
def test_n_components():
    """Check how ``fit`` validates ``n_components`` against ``init`` and X."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]

    # Precomputed transformation with one output dimension fewer than X has.
    init = rng.rand(n_features - 1, 3)

    def expect_fit_error(estimator, message):
        # fit must raise ValueError with exactly this (escaped) message.
        with pytest.raises(ValueError, match=re.escape(message)):
            estimator.fit(X, y)

    # n_components == n_features, which differs from init.shape[0]
    n_components = n_features
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expect_fit_error(
        nca,
        "The preferred dimensionality of the projected space "
        "`n_components` ({}) does not match the output "
        "dimensionality of the given linear transformation "
        "`init` ({})!".format(n_components, init.shape[0]),
    )

    # n_components > n_features
    n_components = n_features + 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expect_fit_error(
        nca,
        "The preferred dimensionality of the projected space "
        "`n_components` ({}) cannot be greater than "
        "the given data dimensionality ({})!".format(n_components,
                                                     n_features),
    )

    # n_components < n_features is valid and must fit without error
    NeighborhoodComponentsAnalysis(n_components=2, init='identity').fit(X, y)
예제 #17
0
def test_callback(capsys):
    """Check callback validation and that a valid callback is invoked."""
    # A string is not a valid callback: fit is expected to raise ValueError.
    bad_nca = NeighborhoodComponentsAnalysis(callback="my_cb")
    with pytest.raises(ValueError):
        bad_nca.fit(iris_data, iris_target)

    max_iter = 10

    def my_cb(transformation, n_iter):
        # The callback receives the transformation as a flat vector.
        n_features = iris_data.shape[1]
        assert transformation.shape == (n_features ** 2,)
        print("{} iterations remaining...".format(max_iter - n_iter))

    # assert that my_cb is called
    good_nca = NeighborhoodComponentsAnalysis(
        max_iter=max_iter, callback=my_cb, verbose=1
    )
    good_nca.fit(iris_data, iris_target)
    printed = capsys.readouterr()[0]

    # check output
    expected_line = "{} iterations remaining...".format(max_iter - 1)
    assert expected_line in printed
예제 #18
0
def test_warm_start_validation():
    """Refitting a warm-started NCA on data with fewer features must fail.

    After a first fit on 5-feature data, fitting 4-feature data is expected
    to raise a ``ValueError`` that reports both dimensionalities.
    """
    # Settings shared by the two synthetic datasets.
    dataset_kwargs = dict(n_samples=30, n_classes=4, n_redundant=0,
                          random_state=0)

    X, y = make_classification(n_features=5, n_informative=5,
                               **dataset_kwargs)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(n_features=4, n_informative=4,
                                             **dataset_kwargs)

    expected_message = (
        'The new inputs dimensionality ({}) does not '
        'match the input dimensionality of the '
        'previously learned transformation ({}).'
    ).format(X_less_features.shape[1], nca.components_.shape[1])
    assert_raise_message(ValueError, expected_message, nca.fit,
                         X_less_features, y)
예제 #19
0
def test_expected_transformation_shape():
    """Test that the transformation has the expected shape."""
    X, y = iris_data, iris_target

    class TransformationStorer:
        """Remember the most recent transformation passed to ``callback``."""

        def __init__(self, X, y):
            # Build a fake NCA and the variables needed to call the loss
            # function.
            fake_nca = NeighborhoodComponentsAnalysis()
            fake_nca.n_iter_ = np.inf
            self.fake_nca = fake_nca
            self.X, y, _ = fake_nca._validate_params(X, y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Store the transformation given as input by the optimizer."""
            self.transformation = transformation

    storer = TransformationStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=storer.callback)
    nca.fit(X, y)

    n_features = X.shape[1]
    assert storer.transformation.size == n_features**2
예제 #20
0
                            axis=1)  # x为数据,y为标签
x_test, y_test = np.split(test1, indices_or_sections=(7680 * 2, ),
                          axis=1)  # x holds the data, y holds the labels

# Keep every second feature column.
x_train = x_train[:, ::2]
x_test = x_test[:, ::2]

nca = NeighborhoodComponentsAnalysis(random_state=42,
                                     n_components=100,
                                     init='pca')

# Fit the scaler on the training data only and reuse the learned min/max for
# the test data. Re-fitting on the test split would scale the two splits
# differently, so the test data would no longer live in the space NCA is
# fitted on.
std = MinMaxScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

nca.fit(x_train, y_train)
x_train1 = nca.transform(x_train)
x_test1 = nca.transform(x_test)

# Handle class imbalance by oversampling the (transformed) training set only.
sm = SMOTE(random_state=0)
x_train1, y_train = sm.fit_resample(x_train1, y_train)

# # 3. Train the KNN classifier
# C_list = [1]
# x = [['kernel', 'c', 'gamma', 'acc']]
# for C_index in range(len(C_list)):
#     classifier = KNeighborsClassifier(n_neighbors=C_list[C_index])
#     classifier.fit(x_train1, y_train.ravel())
#     score = balanced_accuracy_score(y_test, classifier.predict(x_test1))
#     aaa = [C_list[C_index], score]
예제 #21
0
# NOTE(review): X_test and ar_test are appended to below but are created
# above this excerpt — presumably as empty lists; confirm.
val_test = []

# Collect fields 5, 2 and 3 of every record in `data` into the test features
# and the two test targets (`ar`, `val`) respectively.
for d in data:

    X_test.append(d[5])
    ar_test.append(d[2])
    val_test.append(d[3])

# Convert the accumulated Python lists to NumPy arrays.
X_test = np.array(X_test)
ar_test = np.array(ar_test)
val_test = np.array(val_test)

# Fit one NCA transformation per target, both on the training features.
nca_ar = NeighborhoodComponentsAnalysis(random_state=0)
nca_val = NeighborhoodComponentsAnalysis(random_state=0)
nca_ar.fit(X_train, ar_train)
nca_val.fit(X_train, val_train)

# Apply each learned transformation to both the training and test features.
X_train_ar = nca_ar.transform(X_train)
X_train_val = nca_val.transform(X_train)
X_test_ar = nca_ar.transform(X_test)
X_test_val = nca_val.transform(X_test)

# Alternative kept for reference: use the raw features without NCA.
# X_train_ar = X_train
# X_train_val = X_train
# X_test_ar = X_test
# X_test_val = X_test

# Candidate hyper-parameter values; presumably consumed by a search over the
# regressor below — the usage is outside this excerpt.
parameters = {"n_estimators": [50, 75, 100], "learning_rate": [0.1, 0.5, 1.]}
# AdaBoost ensemble built on depth-5 extra-tree regressors.
reg_ar = AdaBoostRegressor(ExtraTreeRegressor(max_depth=5, random_state=0),
                           random_state=0)
예제 #22
0
# Inspect the data: column summary and total number of missing values.
# The result of the second expression is not stored, so it is only visible
# when run interactively.
df.info()
df.isnull().values.sum()

# neighbors
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.neighbors import NearestNeighbors

# Feature matrix (columns 1..20 by position) and target vector.
X = df.iloc[:, 1:21].values
y = df['Target'].values

# Learn an NCA transformation from the labels and apply it to the features.
nca = NeighborhoodComponentsAnalysis(random_state=1234)
nca.fit(X, y)
X = nca.transform(X)

# Rescale every transformed feature with min-max scaling.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X = mms.fit_transform(X)

# fit neighbors
# metrics minkowski p 2, 'cosine'
n_size = 50
nbrs = NearestNeighbors(n_neighbors=n_size, metric='minkowski', p=2).fit(X)

# Let's find the k-neighbors of each point in object X. To do that we call the kneighbors() function on object X.
distances, indices = nbrs.kneighbors(X)

# Let's print out the indices of neighbors for each record in object X.
예제 #23
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import (KNeighborsClassifier,
                               NeighborhoodComponentsAnalysis)

n_neighbors = 3

# Load the Iris dataset: features, labels and class names.
iris = datasets.load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

# Stratified 50/50 split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

# Reduce dimension to 2 with NeighborhoodComponentsAnalysis.
# NOTE(review): the transformation is fitted on the full X / y rather than
# on the training split only — confirm this is intended.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(X, y)
X = nca.transform(X)
예제 #24
0
# Project the test data with the previously fitted PCA and evaluate the
# previously fitted classifier `neigh` on it.
XX_test = pca.transform(X_test)

# NOTE(review): the name `re` would shadow the standard-library `re` module
# if that module is imported elsewhere in this file — confirm.
re = neigh.predict(XX_test)
print("Test Accuracy: ",metrics.accuracy_score(y_test, re))
print("--- %s seconds to predict ---" % (time.time() - start_time))

"""# **Neighbourhood Component Analysis (NCA)**"""

# Accuracy vs # of Dimensions with k=4 neighbours
print('Plotting Accuracy vs Dimensions for k=4 neighbours')
nca_list = []
# For each dimensionality 1..19: fit NCA, train k-NN on the transformed
# training data and record the test accuracy.
for i in range(1,20,1):
  print('Dimensions =', i)
  # A new estimator is created on every iteration and fitted once, so
  # warm_start has no earlier fit of this object to continue from.
  nca = NeighborhoodComponentsAnalysis(n_components=i,random_state=42, warm_start=True)
  nca.fit(X_train, y_train)
  neigh = KNeighborsClassifier(n_neighbors=4, weights='distance', algorithm='kd_tree')
  neigh.fit(nca.transform(X_train), y_train)
  re = neigh.predict(nca.transform(X_test))
  nca_list.append(metrics.accuracy_score(y_test, re))

plt.ylabel('Accuracy')
plt.xlabel('# of Dimensions')
plt.title('NCA+K-NN')
plt.plot(list(range(1,20,1)),nca_list)
plt.show()
# argmax is a 0-based position in nca_list, hence the +1 to report the
# number of components.
print("Maximum accurcay is " + str(nca_list[np.argmax(np.array(nca_list))]) + 
      " with " + str(np.argmax(np.array(nca_list))+1) + " components.")

print("Plotting Accuracy vs # of neighbours with Dimensions = 5")
nca = NeighborhoodComponentsAnalysis(n_components=5, random_state=42, warm_start=True)
예제 #25
0
# Give the values in Z the same shape as the xx grid and draw them as a
# coloured mesh.
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Title reports the number of classes and the best k / weights found by the
# PCA grid search.
plt.title("%i-Class classification (k = %i, weights = '%s')" %
          (len(np.unique(y)), grid_pca.best_estimator_.n_neighbors,
           grid_pca.best_estimator_.weights))

#%% NCA

# Learn a 2-component NCA transformation on the scaled data and apply it.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(x_scaled, y)
X_reduced_nca = nca.transform(x_scaled)
# Scatter plot of the two NCA components coloured by target.
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1", "p2"])
nca_data["target"] = y
sns.scatterplot(x="p1", y="p2", hue="target", data=nca_data)
plt.title("NCA: p1 vs p2")

# Split the already-transformed data into train and test sets.
# NOTE(review): NCA above was fitted on all of x_scaled / y before this
# split — confirm this is intended.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(
    X_reduced_nca, y, test_size=test_size, random_state=42)

grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

# visualize
cmap_light = ListedColormap(['orange', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])
예제 #26
0
def test_init_transformation():
    """Exercise every ``init`` option and the validation of a precomputed one.

    The named strategies and a square precomputed matrix must fit without
    error; precomputed matrices with inconsistent shapes must make ``fit``
    raise ``ValueError`` with the expected message.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    n_features = X.shape[1]

    # Start from identity, random, auto, PCA and LDA initializations in turn.
    for strategy in ('identity', 'random', 'auto', 'pca', 'lda'):
        NeighborhoodComponentsAnalysis(init=strategy).fit(X, y)

    # A square precomputed transformation is valid.
    init = rng.rand(n_features, n_features)
    NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    def expect_fit_error(estimator, message):
        # fit must raise ValueError with exactly this (escaped) message.
        with pytest.raises(ValueError, match=re.escape(message)):
            estimator.fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(n_features, n_features + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    expect_fit_error(
        nca,
        "The input dimensionality ({}) of the given "
        "linear transformation `init` must match the "
        "dimensionality of the given inputs `X` ({}).".format(
            init.shape[1], n_features),
    )

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(n_features + 1, n_features)
    nca = NeighborhoodComponentsAnalysis(init=init)
    expect_fit_error(
        nca,
        "The output dimensionality ({}) of the given "
        "linear transformation `init` cannot be "
        "greater than its input dimensionality ({}).".format(
            init.shape[0], init.shape[1]),
    )

    # init.shape[0] must match n_components
    init = rng.rand(n_features, n_features)
    n_components = n_features - 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expect_fit_error(
        nca,
        "The preferred dimensionality of the "
        "projected space `n_components` ({}) "
        "does not match the output dimensionality of the given "
        "linear transformation `init` ({})!".format(
            n_components, init.shape[0]),
    )
                    linewidth=5*thickness[j])


# we consider only point 3
i = 3

# Plot bonds linked to sample i in the original space
relate_point(X, i, ax)
ax.set_title("Original points")
# Hide both axes and use equal scaling on x and y.
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.axis('equal')

# Learn an embedding with NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state)
nca = nca.fit(X, y)

# Plot the points after transformation with NeighborhoodComponentsAnalysis
plt.figure()
ax2 = plt.gca()

# Get the embedding and find the new nearest neighbors
X_embedded = nca.transform(X)

relate_point(X_embedded, i, ax2)

# Label and draw every embedded point, coloured by its class.
# Note: this loop reuses the name `i`, overwriting the sample index chosen
# above.
for i in range(len(X)):
    ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i),
             va='center', ha='center')
    ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[i]),
                alpha=0.4)
예제 #28
0
def test_convergence_warning():
    """A fit limited to 2 iterations must emit a ``ConvergenceWarning``."""
    nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
    # The warning text is prefixed with the estimator's class name.
    expected = '[{}] NCA did not converge'.format(type(nca).__name__)
    with pytest.warns(ConvergenceWarning, match=re.escape(expected)):
        nca.fit(iris_data, iris_target)
예제 #29
0
def test_init_transformation():
    """Exercise every ``init`` option and the validation of a precomputed one.

    The named strategies and a square precomputed matrix must fit without
    error; precomputed matrices with inconsistent shapes must make ``fit``
    raise ``ValueError`` with the expected message.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    n_features = X.shape[1]

    # Start from identity, random, auto, PCA and LDA initializations in turn.
    for strategy in ('identity', 'random', 'auto', 'pca', 'lda'):
        NeighborhoodComponentsAnalysis(init=strategy).fit(X, y)

    # A square precomputed transformation is valid.
    init = rng.rand(n_features, n_features)
    NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(n_features, n_features + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    input_dim_msg = (
        'The input dimensionality ({}) of the given '
        'linear transformation `init` must match the '
        'dimensionality of the given inputs `X` ({}).'
    ).format(init.shape[1], n_features)
    assert_raise_message(ValueError, input_dim_msg, nca.fit, X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(n_features + 1, n_features)
    nca = NeighborhoodComponentsAnalysis(init=init)
    output_dim_msg = (
        'The output dimensionality ({}) of the given '
        'linear transformation `init` cannot be '
        'greater than its input dimensionality ({}).'
    ).format(init.shape[0], init.shape[1])
    assert_raise_message(ValueError, output_dim_msg, nca.fit, X, y)

    # init.shape[0] must match n_components
    init = rng.rand(n_features, n_features)
    n_components = n_features - 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    n_components_msg = (
        'The preferred dimensionality of the '
        'projected space `n_components` ({}) does not match '
        'the output dimensionality of the given '
        'linear transformation `init` ({})!'
    ).format(n_components, init.shape[0])
    assert_raise_message(ValueError, n_components_msg, nca.fit, X, y)
예제 #30
0
            ax.plot(*line, c=cm.Set1(y[j]), linewidth=5 * thickness[j])


# we consider only point 3
i = 3

# Plot bonds linked to sample i in the original space
relate_point(X, i, ax)
ax.set_title("Original points")
# Hide both axes and use equal scaling on x and y.
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.axis('equal')

# Learn an embedding with NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state)
nca = nca.fit(X, y)

# Plot the points after transformation with NeighborhoodComponentsAnalysis
plt.figure()
ax2 = plt.gca()

# Get the embedding and find the new nearest neighbors
X_embedded = nca.transform(X)

# Draw the bonds of the same sample i, now in the embedded space.
relate_point(X_embedded, i, ax2)

for i in range(len(X)):
    ax2.text(X_embedded[i, 0],
             X_embedded[i, 1],
             str(i),
             va='center',
예제 #31
0
def _cluster_pseudo_labels(X, clusterer_cls, n_components, kwargs_clust):
    """Cluster X with ``clusterer_cls`` and return the cluster assignments.

    When the clusterer's constructor accepts ``n_clusters`` it is asked for
    ``n_components + 1`` clusters; otherwise only ``kwargs_clust`` is passed.
    """
    if 'n_clusters' in clusterer_cls.__init__.__code__.co_varnames:
        clusterer = clusterer_cls(n_clusters=n_components + 1, **kwargs_clust)
    else:
        clusterer = clusterer_cls(**kwargs_clust)
    return clusterer.fit_predict(X)


def learn_spect_proj(X, y=None, spectral_proj_name='pca',
                     clustering_meth='KMeans',
                     clustering_options=CLUSTERING_OPTIONS,
                     kwargs_feat=None,
                     kwargs_clust=None):
    """
    Function to learn each of the important spectral projection

    :param X: the fvs, an array of size n*k
    :param y: the classes, an array of size n; ignored (and replaced by
        cluster assignments) for the ``unsupervised_*`` projections
    :param spectral_proj_name: a string of the name of the featurizer, one of
        'keep_features', 'pca', 'lda', 'unsupervised_lda', 'nca',
        'unsupervised_nca', 'linear regression'
    :param clustering_meth: name of a class in ``sklearn.cluster``; it is only
        used by the ``unsupervised_*`` projections but is always validated
        against ``clustering_options``
    :param clustering_options: iterable of allowed ``clustering_meth`` names
    :param kwargs_feat: keyword arguments for the featurizer class; defaults
        to ``{'n_components': 10}``. 'keep_features' reads its ``'indices'``
        entry; 'lda' and the ``unsupervised_*`` projections read its
        ``'n_components'`` entry
    :param kwargs_clust: keyword arguments for the clusterer class
    :return: a matrix in the form of a numpy array
    :raises ValueError: if ``spectral_proj_name`` is not a supported name
    """

    clustering_options = set(clustering_options)
    kwargs_feat = kwargs_feat or {'n_components': 10}
    kwargs_clust = kwargs_clust or {}

    assert clustering_meth in clustering_options, 'clustering options must one of {}'.format(
        ', '.join(map(str, clustering_options)))
    clusterer_m = getattr(importlib.import_module('sklearn.cluster'), clustering_meth)

    if spectral_proj_name == 'keep_features':
        # Selection matrix: output column j picks input feature indices[j].
        indices = kwargs_feat['indices']
        proj_matrix = np.zeros((X.shape[1], len(indices)))
        for column, feature_index in enumerate(indices):
            proj_matrix[feature_index, column] = 1

    elif spectral_proj_name == 'pca':
        pca = PCA(**kwargs_feat)
        pca.fit(X)
        proj_matrix = pca.components_.T

    elif spectral_proj_name == 'lda':
        lda = LDA(**kwargs_feat)
        lda.fit(X, y)
        n_components = kwargs_feat['n_components']
        proj_matrix = lda.scalings_[:, :n_components]

    elif spectral_proj_name == 'nca':
        nca = NCA(**kwargs_feat)
        nca.fit(X, y)
        proj_matrix = nca.components_.T

    elif spectral_proj_name in ('unsupervised_lda', 'unsupervised_nca'):
        # Same as 'lda' / 'nca', except the class labels come from
        # clustering X instead of from the caller.
        n_components = kwargs_feat['n_components']
        if y is not None:
            print('y will be replaced by classes found by the chosen clusterer')
        y = _cluster_pseudo_labels(X, clusterer_m, n_components, kwargs_clust)
        if spectral_proj_name == 'unsupervised_lda':
            lda = LDA(**kwargs_feat)
            lda.fit(X, y)
            proj_matrix = lda.scalings_[:, :n_components]
        else:
            nca = NCA(**kwargs_feat)
            nca.fit(X, y)
            proj_matrix = nca.components_.T

    elif spectral_proj_name == 'linear regression':
        lr = LinearRegression(**kwargs_feat)
        lr.fit(X, y)
        proj_matrix = lr.coef_.T

    else:
        # 'pseudo_pca' is intentionally not listed: it has no implementation
        # here, so advertising it would send callers straight back to this
        # error.
        all_spectral_proj = ', '.join(['keep_features', 'pca',
                                       'lda',
                                       'unsupervised_lda',
                                       'unsupervised_nca',
                                       'nca',
                                       'linear regression'])
        raise ValueError(f'the spectral projector must be one of: {all_spectral_proj}')

    return proj_matrix