def test_warm_start_validation():
    """A warm-started NCA must reject data whose feature count changed.

    The estimator is first fitted on 5-feature data; refitting it on
    4-feature data has to raise a ``ValueError`` with the exact message
    checked below.
    """
    common = dict(n_samples=30, n_classes=4, n_redundant=0, random_state=0)

    X, y = make_classification(n_features=5, n_informative=5, **common)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(
        n_features=4, n_informative=4, **common
    )
    n_new = X_less_features.shape[1]
    n_learned = nca.components_.shape[1]
    msg = (f"The new inputs dimensionality ({n_new}) "
           "does not match the input dimensionality of the previously learned "
           f"transformation ({n_learned}).")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X_less_features, y)
def test_no_verbose(capsys):
    """With the default ``verbose=0`` fitting must not print to stdout."""
    estimator = NeighborhoodComponentsAnalysis()
    estimator.fit(iris_data, iris_target)

    # Only the captured stdout matters; stderr is ignored.
    captured_stdout, _ = capsys.readouterr()
    assert captured_stdout == ''
def test_n_components():
    """Check the validation of ``n_components`` against ``init`` and ``X``."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]
    init = rng.rand(n_features - 1, 3)

    # Case 1: n_components == n_features, but init has fewer rows.
    n_components = n_features
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expected_msg = ('The preferred dimensionality of the '
                    'projected space `n_components` ({}) does not match '
                    'the output dimensionality of the given '
                    'linear transformation `init` ({})!'
                    .format(n_components, init.shape[0]))
    assert_raise_message(ValueError, expected_msg, nca.fit, X, y)

    # Case 2: n_components exceeds the data dimensionality.
    n_components = n_features + 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expected_msg = ('The preferred dimensionality of the '
                    'projected space `n_components` ({}) cannot '
                    'be greater than the given data '
                    'dimensionality ({})!'.format(n_components, n_features))
    assert_raise_message(ValueError, expected_msg, nca.fit, X, y)

    # Case 3: n_components below the data dimensionality is accepted.
    nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity')
    nca.fit(X, y)
def test_one_class():
    """With a single class and identity init, the transform leaves X as is."""
    class_zero = iris_target == 0
    X = iris_data[class_zero]
    y = iris_target[class_zero]

    nca = NeighborhoodComponentsAnalysis(
        max_iter=30,
        n_components=X.shape[1],
        init='identity',
    )
    nca.fit(X, y)

    embedded = nca.transform(X)
    assert_array_equal(X, embedded)
def test_parameters_valid_types(param, value):
    """Fitting must succeed when ``param`` is given as ``value``.

    Guards against errors being raised for parameters supplied as numpy
    integer or floating types.
    """
    estimator = NeighborhoodComponentsAnalysis(**{param: value})
    estimator.fit(iris_data, iris_target)
def run_nca(args):
    """Fit a 2-component NCA on the training set and embed train and test.

    Reads ``X_train``, ``y_train`` and ``X_test`` from the enclosing module
    scope; ``args.nca_init`` selects the initialisation.

    Returns
    -------
    tuple
        ``(Z, Z_test)``: the embedded training and test samples.
    """
    nca = NeighborhoodComponentsAnalysis(
        n_components=2,
        init=args.nca_init,
        max_iter=100,
        verbose=2,
        random_state=42,
    )
    nca.fit(X_train, y_train)
    return nca.transform(X_train), nca.transform(X_test)
def KPPVNCA(X_train, y_train, X_test, y_test, k):
    """Score a k-nearest-neighbours classifier in the NCA-embedded space.

    An NCA with default settings is fitted on the training data, a
    ``k``-neighbour classifier is trained on the embedded training samples,
    and its ``score`` on the embedded test samples is returned.
    """
    projector = NeighborhoodComponentsAnalysis()
    projector.fit(X_train, y_train)

    classifier = KNeighborsClassifier(n_neighbors=k)
    train_embedded = projector.transform(X_train)
    classifier.fit(train_embedded, y_train)

    test_embedded = projector.transform(X_test)
    return classifier.score(test_embedded, y_test)
def test_warm_start_effectiveness():
    """One extra iteration should barely move a warm-started transformation.

    A 1-iteration second fit on the same data should give almost the same
    result with warm starting, and a quite different result without it.
    """

    def drift_after_one_more_iteration(warm_start):
        # Fit fully, then refit for a single iteration and measure how far
        # the learned components moved (sum of absolute differences).
        nca = NeighborhoodComponentsAnalysis(warm_start=warm_start,
                                             random_state=0)
        nca.fit(iris_data, iris_target)
        before = nca.components_
        nca.max_iter = 1
        nca.fit(iris_data, iris_target)
        after = nca.components_
        return np.sum(np.abs(after - before))

    diff_warm = drift_after_one_more_iteration(True)
    diff_cold = drift_after_one_more_iteration(False)

    assert diff_warm < 3.0, ("Transformer changed significantly after one "
                             "iteration even though it was warm-started.")
    assert diff_cold > diff_warm, ("Cold-started transformer changed less "
                                   "significantly than warm-started "
                                   "transformer after one iteration.")
def test_simple_example():
    """Check NCA on a four-point toy problem.

    The points are laid out so that each sample sits next to a sample of the
    opposite label. After the transform, the nearest neighbour of every
    sample must be the other sample of its own class.
    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])

    nca = NeighborhoodComponentsAnalysis(
        n_components=2, init="identity", random_state=42
    )
    nca.fit(X, y)
    X_t = nca.transform(X)

    # Column 0 of the argsort is each point itself (distance 0), so column 1
    # holds the index of its nearest other point.
    nearest_neighbor = pairwise_distances(X_t).argsort()[:, 1]
    assert_array_equal(nearest_neighbor, np.array([2, 3, 0, 1]))
def nca_clustering(X_train, X_test, y_train, y_test, parameters):
    """Run NCA followed by KNN and report accuracy plus CPU timings.

    ``parameters`` supplies ``"k"`` (number of neighbours) and
    ``"distance"`` (KNN metric).

    Returns
    -------
    tuple
        ``(accuracy, train_cpu_time, test_cpu_time)`` where the training
        time covers the NCA fit and the KNN fit, and the test time covers a
        single ``predict`` on the embedded test set.
    """
    nca = NeighborhoodComponentsAnalysis()
    knn = KNeighborsClassifier(
        n_jobs=-1,
        n_neighbors=parameters["k"],
        metric=parameters["distance"],
    )

    train_t0 = time.process_time()
    nca.fit(X_train, y_train)
    classifier = knn.fit(nca.transform(X_train), y_train)
    train_t1 = time.process_time()

    test_t0 = time.process_time()
    # The predictions are not used; the call exists so it can be timed.
    y_pred = classifier.predict(nca.transform(X_test))
    test_t1 = time.process_time()

    accuracy = classifier.score(nca.transform(X_test), y_test)
    train_cpu_time = train_t1 - train_t0
    test_cpu_time = test_t1 - test_t0
    return accuracy, train_cpu_time, test_cpu_time
def test_verbose(init_name, capsys):
    """Check the layout of the report printed when ``verbose=1``.

    Run for every initialization except ``auto``, since ``auto`` delegates
    to one of the others.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    n_features = X.shape[1]
    init = (rng.randn(n_features, n_features)
            if init_name == "precomputed" else init_name)

    nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
    nca.fit(X, y)
    out, _ = capsys.readouterr()
    lines = re.split("\n+", out)

    # The pca and lda initializations print one extra leading line; check it
    # and drop it so the remaining checks are identical for every init.
    regexp_init = r"... done in \ *\d+\.\d{2}s"
    msgs = {
        "pca": "Finding principal components" + regexp_init,
        "lda": "Finding most discriminative components" + regexp_init,
    }
    if init_name in msgs:
        assert re.match(msgs[init_name], lines[0])
        lines = lines[1:]

    header = "{:>10} {:>20} {:>10}".format("Iteration", "Objective Value",
                                           "Time(s)")
    assert lines[0] == "[NeighborhoodComponentsAnalysis]"
    assert lines[1] == "[NeighborhoodComponentsAnalysis] {}".format(header)
    assert lines[2] == ("[NeighborhoodComponentsAnalysis] {}".format(
        "-" * len(header)))

    # Matches for instance:
    # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01'
    iteration_row = re.compile(
        r"\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e"
        r"[+|-]\d+\ *\d+\.\d{2}"
    )
    for row in lines[3:-2]:
        assert iteration_row.match(row)

    assert re.match(
        r"\[NeighborhoodComponentsAnalysis\] Training took\ *"
        r"\d+\.\d{2}s\.",
        lines[-2],
    )
    assert lines[-1] == ""
def _neighborhoodcomponentsanalysis(*, train, test, x_predict=None, metrics, n_components=None, init='auto', warm_start=False, max_iter=50, tol=1e-05, callback=None, verbose=0, random_state=None): """ For more info visit : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NeighborhoodComponentsAnalysis.html#sklearn.neighbors.NeighborhoodComponentsAnalysis """ model = NeighborhoodComponentsAnalysis(n_components=n_components, init=init, warm_start=warm_start, max_iter=max_iter, tol=tol, callback=callback, verbose=verbose, random_state=random_state) model.fit(train[0], train[1]) model_name = 'Neighborhood Components Analysis' y_hat = model.predict(test[0]) if metrics == 'accuracy': accuracy = accuracy_score(test[1], y_hat) if metrics == 'f1': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard': accuracy = jaccard_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
def nca_clustering(X_train, X_test, y_train, y_test, parameters,
                   evaluation_metrics):
    """Run NCA followed by KNN and record the test accuracy.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Training / test samples and labels.
    parameters
        Raw settings; passed through ``prepare_parameters`` to obtain the
        ``"k"``, ``"distance"`` and ``"minkowski_p"`` entries used here.
    evaluation_metrics : dict
        Updated in place with the ``"accuracy"`` entry and returned.

    Notes
    -----
    For the ``"mahalanobis"`` distance the inverse covariance is estimated
    from the NCA-embedded training samples with features as variables
    (``rowvar=False``). The previous ``np.cov(X_train)`` treated each
    *sample* as a variable and was computed in the untransformed space, so
    the matrix matched neither the dimensionality nor the space in which the
    neighbours are searched. If the covariance cannot be inverted, a warning
    is printed and the Euclidean distance is used instead.
    """
    # Modify parameters to call the clustering algorithm with modified ones;
    # this mainly concerns the distance parameter.
    modified_parameters = prepare_parameters(parameters)

    # Embed once up front: the classifier (and the Mahalanobis covariance)
    # operate on the transformed data.
    nca = NeighborhoodComponentsAnalysis()
    nca.fit(X_train, y_train)
    X_train_embedded = nca.transform(X_train)
    X_test_embedded = nca.transform(X_test)

    knn_kwargs = dict(
        n_jobs=-1,
        n_neighbors=modified_parameters["k"],
        metric=modified_parameters["distance"],
        p=modified_parameters["minkowski_p"],
    )
    if modified_parameters["distance"] == "mahalanobis":
        try:
            inverse_covariance = np.linalg.inv(
                np.cov(X_train_embedded, rowvar=False))
        except np.linalg.LinAlgError:
            print_warning(
                "[Generator & Evaluator] <Warning> Error happened while running NCA, setting distance to euclidean & running again ..."
            )
            knn_kwargs["metric"] = "euclidean"
        else:
            knn_kwargs.update(algorithm="brute",
                              metric_params={"VI": inverse_covariance})

    classifier = KNeighborsClassifier(**knn_kwargs).fit(X_train_embedded,
                                                        y_train)
    evaluation_metrics["accuracy"] = classifier.score(X_test_embedded, y_test)
    return evaluation_metrics
def test_verbose(init_name, capsys):
    """Assert the printed report is well formed when ``verbose=1``.

    Exercised for every initialization except ``auto``, because ``auto``
    ends up calling one of the others.
    """
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)

    if init_name == 'precomputed':
        init = rng.randn(X.shape[1], X.shape[1])
    else:
        init = init_name

    NeighborhoodComponentsAnalysis(verbose=1, init=init).fit(X, y)
    captured, _ = capsys.readouterr()
    lines = re.split('\n+', captured)

    # pca / lda print an additional first line: verify it, then strip it so
    # that the rest is tested the same way for all initializations.
    regexp_init = r'... done in \ *\d+\.\d{2}s'
    msgs = {
        'pca': "Finding principal components" + regexp_init,
        'lda': "Finding most discriminative components" + regexp_init
    }
    if init_name in ('pca', 'lda'):
        first_line, lines = lines[0], lines[1:]
        assert re.match(msgs[init_name], first_line)

    assert lines[0] == '[NeighborhoodComponentsAnalysis]'
    header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value',
                                           'Time(s)')
    assert lines[1] == '[NeighborhoodComponentsAnalysis] {}'.format(header)
    underline = '-' * len(header)
    assert lines[2] == ('[NeighborhoodComponentsAnalysis] {}'.format(
        underline))

    # The following regex will match for instance:
    # '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01'
    row_regex = (r'\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e'
                 r'[+|-]\d+\ *\d+\.\d{2}')
    for line in lines[3:-2]:
        assert re.match(row_regex, line)

    footer_regex = (r'\[NeighborhoodComponentsAnalysis\] Training took\ *'
                    r'\d+\.\d{2}s\.')
    assert re.match(footer_regex, lines[-2])
    assert lines[-1] == ''
def test_singleton_class():
    """NCA must cope with classes that contain a single sample.

    Three label layouts are exercised in turn: one singleton class, a single
    non-singleton class, and only singleton classes. In the last case, with
    the identity initialization, the transform must leave ``X`` unchanged.
    """
    X = iris_data
    # The labels are rewritten in place below. Work on a copy so the
    # module-level ``iris_target`` used by the other tests is not modified.
    y = iris_target.copy()

    # one singleton class
    singleton_class = 1
    ind_singleton, = np.where(y == singleton_class)
    y[ind_singleton] = 2
    y[ind_singleton[0]] = singleton_class

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # One non-singleton class
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    y[ind_1] = 0
    y[ind_1[0]] = 1
    y[ind_2] = 0
    y[ind_2[0]] = 2

    nca = NeighborhoodComponentsAnalysis(max_iter=30)
    nca.fit(X, y)

    # Only singleton classes: keep the first sample of each label.
    ind_0, = np.where(y == 0)
    ind_1, = np.where(y == 1)
    ind_2, = np.where(y == 2)
    keep = [ind_0[0], ind_1[0], ind_2[0]]
    X = X[keep]
    y = y[keep]

    nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30)
    nca.fit(X, y)
    assert_array_equal(X, nca.transform(X))
def test_n_components():
    """Validate the ``n_components`` checks against ``init`` and the data."""
    rng = np.random.RandomState(42)
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]
    n_features = X.shape[1]
    init = rng.rand(n_features - 1, 3)
    n_init_rows = init.shape[0]

    # n_components equals the data dimensionality but not init's row count.
    n_components = n_features
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = ("The preferred dimensionality of the projected space "
           f"`n_components` ({n_components}) does not match the output "
           "dimensionality of the given linear transformation "
           f"`init` ({n_init_rows})!")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # n_components larger than the data dimensionality.
    n_components = n_features + 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = ("The preferred dimensionality of the projected space "
           f"`n_components` ({n_components}) cannot be greater than "
           f"the given data dimensionality ({n_features})!")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # n_components smaller than the data dimensionality is fine.
    NeighborhoodComponentsAnalysis(n_components=2, init='identity').fit(X, y)
def test_callback(capsys):
    """A non-callable callback is rejected; a callable one is invoked."""
    max_iter = 10

    # A string is not a valid callback.
    bad_nca = NeighborhoodComponentsAnalysis(callback="my_cb")
    with pytest.raises(ValueError):
        bad_nca.fit(iris_data, iris_target)

    def my_cb(transformation, n_iter):
        # The optimizer hands over the flattened square transformation.
        expected_shape = (iris_data.shape[1] ** 2,)
        assert transformation.shape == expected_shape
        rem_iter = max_iter - n_iter
        print("{} iterations remaining...".format(rem_iter))

    # Assert that my_cb is called by looking for its printed output.
    nca = NeighborhoodComponentsAnalysis(max_iter=max_iter, callback=my_cb,
                                         verbose=1)
    nca.fit(iris_data, iris_target)
    out, _ = capsys.readouterr()

    expected_line = "{} iterations remaining...".format(max_iter - 1)
    assert expected_line in out
def test_warm_start_validation():
    """Refitting a warm-started NCA with fewer features must raise."""
    dataset_kwargs = dict(n_samples=30, n_classes=4, n_redundant=0,
                          random_state=0)

    X, y = make_classification(n_features=5, n_informative=5,
                               **dataset_kwargs)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(n_features=4, n_informative=4,
                                             **dataset_kwargs)
    expected_msg = ('The new inputs dimensionality ({}) does not '
                    'match the input dimensionality of the '
                    'previously learned transformation ({}).'.format(
                        X_less_features.shape[1], nca.components_.shape[1]))
    assert_raise_message(ValueError, expected_msg,
                         nca.fit, X_less_features, y)
def test_expected_transformation_shape():
    """Test that the transformation has the expected shape."""
    X, y = iris_data, iris_target

    class TransformationStorer:
        """Callback holder that remembers the last transformation it saw."""

        def __init__(self, X, y):
            # Build a fake NCA and the variables needed to call the loss
            # function.
            fake = NeighborhoodComponentsAnalysis()
            fake.n_iter_ = np.inf
            self.fake_nca = fake
            self.X, y, _ = fake._validate_params(X, y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the transformation taken as input by
            the optimizer"""
            self.transformation = transformation

    storer = TransformationStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(max_iter=5,
                                         callback=storer.callback)
    nca.fit(X, y)

    n_features = X.shape[1]
    assert storer.transformation.size == n_features**2
axis=1) # x为数据,y为标签 x_test, y_test = np.split(test1, indices_or_sections=(7680 * 2, ), axis=1) # x为数据,y为标签 x_train = x_train[:, ::2] x_test = x_test[:, ::2] nca = NeighborhoodComponentsAnalysis(random_state=42, n_components=100, init='pca') std = MinMaxScaler() x_train = std.fit_transform(x_train) x_test = std.fit_transform(x_test) nca.fit(x_train, y_train) x_train1 = nca.transform(x_train) x_test1 = nca.transform(x_test) # imbalance sm = SMOTE(random_state=0) x_train1, y_train = sm.fit_resample(x_train1, y_train) # # 3.训练KNN分类器 # C_list = [1] # x = [['kernel', 'c', 'gamma', 'acc']] # for C_index in range(len(C_list)): # classifier = KNeighborsClassifier(n_neighbors=C_list[C_index]) # classifier.fit(x_train1, y_train.ravel()) # score = balanced_accuracy_score(y_test, classifier.predict(x_test1)) # aaa = [C_list[C_index], score]
# Collect the test-set features and the two regression targets from `data`.
# NOTE(review): `X_test` and `ar_test` are appended to here but created
# outside this view -- presumably as empty lists; confirm.
val_test = []
for d in data:
    X_test.append(d[5])
    ar_test.append(d[2])
    val_test.append(d[3])
X_test = np.array(X_test)
ar_test = np.array(ar_test)
val_test = np.array(val_test)

# Feature projection: fit one NCA per target (`ar` and `val`) on the training
# data, then transform both the training and the test samples with each.
nca_ar = NeighborhoodComponentsAnalysis(random_state=0)
nca_val = NeighborhoodComponentsAnalysis(random_state=0)
nca_ar.fit(X_train, ar_train)
nca_val.fit(X_train, val_train)
X_train_ar = nca_ar.transform(X_train)
X_train_val = nca_val.transform(X_train)
X_test_ar = nca_ar.transform(X_test)
X_test_val = nca_val.transform(X_test)

# Disabled alternative: use the raw features without the NCA projection.
# X_train_ar = X_train
# X_train_val = X_train
# X_test_ar = X_test
# X_test_val = X_test

# Hyper-parameter grid and base regressor for the `ar` target.
parameters = {"n_estimators": [50, 75, 100], "learning_rate": [0.1, 0.5, 1.]}
reg_ar = AdaBoostRegressor(ExtraTreeRegressor(max_depth=5, random_state=0), random_state=0)
# check data df.info() df.isnull().values.sum() # neighbors from sklearn.neighbors import NeighborhoodComponentsAnalysis from sklearn.neighbors import NearestNeighbors # mtx X = df.iloc[:, 1:21].values y = df['Target'].values # demension reduction and classify nca = NeighborhoodComponentsAnalysis(random_state=1234) nca.fit(X, y) X = nca.transform(X) from sklearn.preprocessing import MinMaxScaler mms = MinMaxScaler() X = mms.fit_transform(X) # fit neighbors # metrics minkowski p 2, 'cosine' n_size = 50 nbrs = NearestNeighbors(n_neighbors=n_size, metric='minkowski', p=2).fit(X) # Let's find the k-neighbors of each point in object X. To do that we call the kneighbors() function on object X. distances, indices = nbrs.kneighbors(X) # Let's print out the indices of neighbors for each record in object X.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import (KNeighborsClassifier, NeighborhoodComponentsAnalysis)

n_neighbors = 3

# Load Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

# Split into train/test (stratified 50/50 split with a fixed seed)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.5, stratify = y, random_state = 42)

# Reduce dimension to 2 with NeighborhoodComponentsAnalysis.
# NOTE(review): the projection is fitted on the full X, not on X_train --
# confirm this is intended given the split above.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
X = nca.fit(X, y).transform(X)
# Evaluate the previously fitted classifier on the PCA-projected test set.
# NOTE(review): the name `re` would shadow the standard-library `re` module
# if that module is imported elsewhere in this file -- confirm.
XX_test = pca.transform(X_test)
re = neigh.predict(XX_test)
print("Test Accuracy: ",metrics.accuracy_score(y_test, re))
print("--- %s seconds to predict ---" % (time.time() - start_time))

"""# **Neighbourhood Component Analysis (NCA)**"""

# Accuracy vs # of Dimensions with k=4 neighbours
print('Plotting Accuracy vs Dimensions for k=4 neighbours')
nca_list = []
# For each target dimensionality 1..19: fit a fresh NCA, train a 4-neighbour
# distance-weighted KNN on the embedded training data, record test accuracy.
for i in range(1,20,1):
    print('Dimensions =', i)
    nca = NeighborhoodComponentsAnalysis(n_components=i,random_state=42, warm_start=True)
    nca.fit(X_train, y_train)
    neigh = KNeighborsClassifier(n_neighbors=4, weights='distance', algorithm='kd_tree')
    neigh.fit(nca.transform(X_train), y_train)
    re = neigh.predict(nca.transform(X_test))
    nca_list.append(metrics.accuracy_score(y_test, re))

# Plot the accuracy recorded for each number of dimensions.
plt.ylabel('Accuracy')
plt.xlabel('# of Dimensions')
plt.title('NCA+K-NN')
plt.plot(list(range(1,20,1)),nca_list)
plt.show()
# The list index is zero-based while dimensions start at 1, hence the +1.
print("Maximum accurcay is " + str(nca_list[np.argmax(np.array(nca_list))]) + " with " + str(np.argmax(np.array(nca_list))+1) + " components.")

print("Plotting Accuracy vs # of neighbours with Dimensions = 5")
nca = NeighborhoodComponentsAnalysis(n_components=5, random_state=42, warm_start=True)
# Draw the predicted regions over the mesh grid, then overlay the samples.
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# The title reports the class count and the best estimator's settings found
# by the PCA grid search.
plt.title("%i-Class classification (k = %i, weights = '%s')" % (len(np.unique(y)), grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights))

#%% NCA
# Project the scaled data onto two NCA components and plot them by target.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(x_scaled, y)
X_reduced_nca = nca.transform(x_scaled)
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1", "p2"])
nca_data["target"] = y
sns.scatterplot(x="p1", y="p2", hue="target", data=nca_data)
plt.title("NCA: p1 vs p2")

# Split the already-projected data and search for the best KNN parameters.
X_train_nca, X_test_nca, Y_train_nca, Y_test_nca = train_test_split(X_reduced_nca, y, test_size=test_size, random_state=42)
grid_nca = KNN_Best_Params(X_train_nca, X_test_nca, Y_train_nca, Y_test_nca)

# visualize
cmap_light = ListedColormap(['orange', 'cornflowerblue'])
cmap_bold = ListedColormap(['darkorange', 'darkblue'])
def test_init_transformation():
    """Exercise every supported ``init`` and the shape checks on arrays."""
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    n_features = X.shape[1]

    # Named initializations: from scratch (identity), random, auto, PCA, LDA.
    for init_name in ('identity', 'random', 'auto', 'pca', 'lda'):
        NeighborhoodComponentsAnalysis(init=init_name).fit(X, y)

    # A square precomputed initialization is accepted.
    init = rng.rand(n_features, n_features)
    NeighborhoodComponentsAnalysis(init=init).fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(n_features, n_features + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    msg = (f"The input dimensionality ({init.shape[1]}) of the given "
           "linear transformation `init` must match the "
           f"dimensionality of the given inputs `X` ({n_features}).")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(n_features + 1, n_features)
    nca = NeighborhoodComponentsAnalysis(init=init)
    msg = (f"The output dimensionality ({init.shape[0]}) of the given "
           "linear transformation `init` cannot be "
           f"greater than its input dimensionality ({init.shape[1]}).")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)

    # init.shape[0] must match n_components
    init = rng.rand(n_features, n_features)
    n_components = n_features - 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    msg = ("The preferred dimensionality of the "
           f"projected space `n_components` ({n_components}) "
           "does not match the output dimensionality of the given "
           f"linear transformation `init` ({init.shape[0]})!")
    with pytest.raises(ValueError, match=re.escape(msg)):
        nca.fit(X, y)
linewidth=5*thickness[j])  # tail of a call that begins outside this view

# we consider only point 3
i = 3

# Plot bonds linked to sample i in the original space
relate_point(X, i, ax)
ax.set_title("Original points")
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.axis('equal')

# Learn an embedding with NeighborhoodComponentsAnalysis
nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state)
nca = nca.fit(X, y)

# Plot the points after transformation with NeighborhoodComponentsAnalysis
plt.figure()
ax2 = plt.gca()

# Get the embedding and find the new nearest neighbors
X_embedded = nca.transform(X)

relate_point(X_embedded, i, ax2)

# Label and draw every embedded sample; note that this loop reuses `i`,
# which held the selected point index until here.
for i in range(len(X)):
    ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), va='center', ha='center')
    ax2.scatter(X_embedded[i, 0], X_embedded[i, 1], s=300, c=cm.Set1(y[i]), alpha=0.4)
def test_convergence_warning():
    """Stopping after two iterations must emit a ``ConvergenceWarning``."""
    nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
    # The warning text is prefixed with the estimator's class name.
    expected = '[{}] NCA did not converge'.format(type(nca).__name__)
    with pytest.warns(ConvergenceWarning, match=re.escape(expected)):
        nca.fit(iris_data, iris_target)
def test_init_transformation():
    """Check all ``init`` options and the shape validation of array inits."""
    rng = np.random.RandomState(42)
    X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
    dim = X.shape[1]

    # Start from scratch (identity), then random, auto, PCA and LDA.
    for name in ('identity', 'random', 'auto', 'pca', 'lda'):
        estimator = NeighborhoodComponentsAnalysis(init=name)
        estimator.fit(X, y)

    # A valid precomputed square transformation.
    init = rng.rand(dim, dim)
    nca = NeighborhoodComponentsAnalysis(init=init)
    nca.fit(X, y)

    # init.shape[1] must match X.shape[1]
    init = rng.rand(dim, dim + 1)
    nca = NeighborhoodComponentsAnalysis(init=init)
    expected_msg = ('The input dimensionality ({}) of the given '
                    'linear transformation `init` must match the '
                    'dimensionality of the given inputs `X` ({}).'.format(
                        init.shape[1], dim))
    assert_raise_message(ValueError, expected_msg, nca.fit, X, y)

    # init.shape[0] must be <= init.shape[1]
    init = rng.rand(dim + 1, dim)
    nca = NeighborhoodComponentsAnalysis(init=init)
    expected_msg = ('The output dimensionality ({}) of the given '
                    'linear transformation `init` cannot be '
                    'greater than its input dimensionality ({}).'.format(
                        init.shape[0], init.shape[1]))
    assert_raise_message(ValueError, expected_msg, nca.fit, X, y)

    # init.shape[0] must match n_components
    init = rng.rand(dim, dim)
    n_components = dim - 2
    nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
    expected_msg = ('The preferred dimensionality of the '
                    'projected space `n_components` ({}) does not match '
                    'the output dimensionality of the given '
                    'linear transformation `init` ({})!'
                    .format(n_components, init.shape[0]))
    assert_raise_message(ValueError, expected_msg, nca.fit, X, y)
ax.plot(*line, c=cm.Set1(y[j]), linewidth=5 * thickness[j]) # we consider only point 3 i = 3 # Plot bonds linked to sample i in the original space relate_point(X, i, ax) ax.set_title("Original points") ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) ax.axis('equal') # Learn an embedding with NeighborhoodComponentsAnalysis nca = NeighborhoodComponentsAnalysis(max_iter=30, random_state=random_state) nca = nca.fit(X, y) # Plot the points after transformation with NeighborhoodComponentsAnalysis plt.figure() ax2 = plt.gca() # Get the embedding and find the new nearest neighbors X_embedded = nca.transform(X) relate_point(X_embedded, i, ax2) for i in range(len(X)): ax2.text(X_embedded[i, 0], X_embedded[i, 1], str(i), va='center',
def _labels_from_clustering(clusterer_cls, X, n_components, kwargs_clust):
    """Return cluster assignments of ``X`` to use in place of class labels."""
    # When the clusterer exposes `n_clusters`, ask for one more cluster than
    # the number of requested components.
    if 'n_clusters' in clusterer_cls.__init__.__code__.co_varnames:
        clusterer = clusterer_cls(n_clusters=n_components + 1, **kwargs_clust)
    else:
        clusterer = clusterer_cls(**kwargs_clust)
    return clusterer.fit_predict(X)


def learn_spect_proj(X, y=None, spectral_proj_name='pca',
                     clustering_meth='KMeans',
                     clustering_options=CLUSTERING_OPTIONS,
                     kwargs_feat=None, kwargs_clust=None):
    """
    Function to learn each of the important spectral projection

    :param X: the fvs, an array of size n*k
    :param y: the classes, an array of size n; ignored (replaced by cluster
        assignments) for the ``unsupervised_*`` projections
    :param spectral_proj_name: a string of the name of the featurizer, one of
        'keep_features', 'pca', 'lda', 'unsupervised_lda', 'nca',
        'unsupervised_nca', 'linear regression'
    :param clustering_meth: name of the class to fetch from
        ``sklearn.cluster``; must be one of ``clustering_options``
    :param clustering_options: the allowed values for ``clustering_meth``
    :param kwargs_feat: keyword arguments passed to the featurizer class
        (defaults to ``{'n_components': 10}``); for 'keep_features' it must
        contain ``'indices'``
    :param kwargs_clust: keyword arguments passed to the clusterer
    :return: a matrix in the form of a numpy array
    :raises ValueError: if ``spectral_proj_name`` is not supported
    """
    clustering_options = set(clustering_options)
    kwargs_feat = kwargs_feat or {'n_components': 10}
    kwargs_clust = kwargs_clust or {}

    # NOTE: kept as an assert so callers see the same exception type as
    # before; be aware that it is skipped when Python runs with -O.
    assert clustering_meth in clustering_options, 'clustering options must one of {}'.format(
        ', '.join(map(str, clustering_options)))
    clusterer_m = getattr(importlib.import_module('sklearn.cluster'), clustering_meth)

    if spectral_proj_name == 'keep_features':
        # Selection matrix: column `col` picks out original feature
        # `indices[col]`.
        indices = kwargs_feat['indices']
        proj_matrix = np.zeros((X.shape[1], len(indices)))
        for col, feature_idx in enumerate(indices):
            proj_matrix[feature_idx, col] = 1

    elif spectral_proj_name == 'pca':
        pca = PCA(**kwargs_feat)
        pca.fit(X)
        proj_matrix = pca.components_.T

    # elif spectral_proj_name == 'pseudo_pca':
    #     # make the pseudo pca proj matrix
    #     ppca = PseudoPca(**kwargs_feat)
    #     ppca.fit(X)
    #     proj_matrix = ppca.proj_mat.T

    elif spectral_proj_name == 'lda':
        lda = LDA(**kwargs_feat)
        lda.fit(X, y)
        n_components = kwargs_feat['n_components']
        proj_matrix = lda.scalings_[:, :n_components]

    elif spectral_proj_name == 'unsupervised_lda':
        n_components = kwargs_feat['n_components']
        if y is not None:
            print('y will be replaced by classes found by the chosen clusterer')
        y = _labels_from_clustering(clusterer_m, X, n_components, kwargs_clust)
        lda = LDA(**kwargs_feat)
        lda.fit(X, y)
        proj_matrix = lda.scalings_[:, :n_components]

    elif spectral_proj_name == 'nca':
        nca = NCA(**kwargs_feat)
        nca.fit(X, y)
        proj_matrix = nca.components_.T

    elif spectral_proj_name == 'unsupervised_nca':
        n_components = kwargs_feat['n_components']
        if y is not None:
            print('y will be replaced by classes found by the chosen clusterer')
        y = _labels_from_clustering(clusterer_m, X, n_components, kwargs_clust)
        nca = NCA(**kwargs_feat)
        nca.fit(X, y)
        proj_matrix = nca.components_.T

    elif spectral_proj_name == 'linear regression':
        lr = LinearRegression(**kwargs_feat)
        lr.fit(X, y)
        proj_matrix = lr.coef_.T

    else:
        # Only list projectors that have an active branch above
        # ('pseudo_pca' is currently disabled).
        all_spectral_proj = ', '.join(['keep_features',
                                       'pca',
                                       'lda',
                                       'unsupervised_lda',
                                       'unsupervised_nca',
                                       'nca',
                                       'linear regression'])
        raise ValueError(f'the spectral projector must be one of: {all_spectral_proj}')

    return proj_matrix