def test_n_neighbors():
    X = np.arange(12).reshape(4, 3)
    y = [1, 1, 2, 2]

    lmnn = LargeMarginNearestNeighbor(n_neighbors=2)
    assert_warns_message(UserWarning,
                         '`n_neighbors` (=2) is not less than the number of '
                         'samples in the smallest non-singleton class (=2). '
                         '`n_neighbors_` will be set to 1 for estimation.',
                         lmnn.fit, X, y)
def test_neighbors_params():
    from scipy.spatial.distance import hamming

    params = {'algorithm': 'brute', 'metric': hamming}
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, neighbors_params=params)
    lmnn.fit(iris_data, iris_target)
    components_hamming = lmnn.components_

    lmnn = LargeMarginNearestNeighbor(n_neighbors=3)
    lmnn.fit(iris_data, iris_target)
    components_euclidean = lmnn.components_

    assert not np.allclose(components_hamming, components_euclidean)
def test_warm_start_validation():
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_classes=4, n_redundant=0,
                                        n_informative=5, random_state=0)

    lmnn = LargeMarginNearestNeighbor(warm_start=True, max_iter=5)
    lmnn.fit(X, y)

    X_less_features, y = datasets.make_classification(n_samples=30,
                                                      n_features=4,
                                                      n_classes=4,
                                                      n_redundant=0,
                                                      n_informative=4,
                                                      random_state=0)
    assert_raise_message(ValueError,
                         'The new inputs dimensionality ({}) does not '
                         'match the input dimensionality of the '
                         'previously learned transformation ({}).'.format(
                             X_less_features.shape[1],
                             lmnn.components_.shape[1]),
                         lmnn.fit, X_less_features, y)
def test_pipeline_equivalency():
    X = iris_data
    y = iris_target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # Use init='identity' to ensure reproducibility
    lmnn_params = dict(n_neighbors=3, max_iter=10, init='identity',
                       random_state=42)
    n_neighbors = 3

    lmnn = LargeMarginNearestNeighbor(**lmnn_params)
    lmnn.fit(X_train, y_train)

    lmnn_pipe = make_lmnn_pipeline(**lmnn_params)
    lmnn_pipe.fit(X_train, y_train)

    pipe_transformation = lmnn_pipe.named_steps.lmnn.components_
    assert_array_almost_equal(lmnn.components_, pipe_transformation)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(lmnn.transform(X_train), y_train)
    score = knn.score(lmnn.transform(X_test), y_test)
    score_pipe = lmnn_pipe.score(X_test, y_test)

    assert score == score_pipe
def test_callback():
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, callback='my_cb')
    assert_raise_message(ValueError, '`callback` is not callable.',
                         lmnn.fit, iris_data, iris_target)

    max_iter = 10

    def my_cb(transformation, n_iter):
        rem_iter = max_iter - n_iter
        print('{} iterations remaining...'.format(rem_iter))

    # assert that my_cb is called
    old_stdout = sys.stdout
    sys.stdout = StringIO()

    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, callback=my_cb,
                                      max_iter=max_iter, verbose=1)
    try:
        lmnn.fit(iris_data, iris_target)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # check output
    assert '{} iterations remaining...'.format(max_iter - 1) in out
def test_singleton_class():
    X = iris_data
    y = iris_target
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y)

    # One singleton class: fitting should still succeed
    singleton_class = 1
    ind_singleton, = np.where(y_tr == singleton_class)
    y_tr[ind_singleton] = 2
    y_tr[ind_singleton[0]] = singleton_class

    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=30)
    lmnn.fit(X_tr, y_tr)

    # Only one non-singleton class: fitting should raise
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y)
    ind_1, = np.where(y_tr == 1)
    ind_2, = np.where(y_tr == 2)
    y_tr[ind_1] = 0
    y_tr[ind_1[0]] = 1
    y_tr[ind_2] = 0
    y_tr[ind_2[0]] = 2

    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=30)
    assert_raise_message(ValueError,
                         'LargeMarginNearestNeighbor needs at least 2 '
                         'non-singleton classes, got 1.',
                         lmnn.fit, X_tr, y_tr)
def test_same_lmnn_parallel():
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    lmnn = LargeMarginNearestNeighbor(n_neighbors=3)
    lmnn.fit(X_train, y_train)
    components = lmnn.components_

    lmnn.set_params(n_jobs=3)
    lmnn.fit(X_train, y_train)
    components_parallel = lmnn.components_

    assert_array_almost_equal(components, components_parallel)
def test_neighbors_iris():
    # Sanity checks on the iris dataset: a nearest neighbors classifier
    # fitted in the space learned by LMNN should separate the classes well.
    lmnn = LargeMarginNearestNeighbor(n_neighbors=1)
    lmnn.fit(iris_data, iris_target)

    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    LX = lmnn.transform(iris_data)
    knn.fit(LX, iris_target)
    y_pred = knn.predict(LX)

    assert_array_equal(y_pred, iris_target)

    lmnn.set_params(n_neighbors=9)
    lmnn.fit(iris_data, iris_target)
    LX = lmnn.transform(iris_data)  # re-transform with the refitted solution

    knn = KNeighborsClassifier(n_neighbors=lmnn.n_neighbors_)
    knn.fit(LX, iris_target)

    assert knn.score(LX, iris_target) > 0.95
def test_impostor_store():
    k = 3
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k, init='identity',
                                      impostor_store='list')
    lmnn.fit(iris_data, iris_target)
    components_list = lmnn.components_

    lmnn = LargeMarginNearestNeighbor(n_neighbors=k, init='identity',
                                      impostor_store='sparse')
    lmnn.fit(iris_data, iris_target)
    components_sparse = lmnn.components_

    assert_array_almost_equal(components_list, components_sparse,
                              err_msg='Toggling `impostor_store` results in '
                                      'a different solution.')
def test_neighbors_digits():
    # Sanity check on the digits dataset:
    # the 'brute' algorithm has been observed to fail if the input
    # dtype is uint8 due to overflow in distance calculations.
    X = digits_data.astype('uint8')
    y = digits_target
    n_samples, n_features = X.shape
    train_test_boundary = int(n_samples * 0.8)
    train = np.arange(0, train_test_boundary)
    test = np.arange(train_test_boundary, n_samples)
    X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]

    k = 1
    lmnn = LargeMarginNearestNeighbor(n_neighbors=k, max_iter=30)
    lmnn.fit(X_train, y_train)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(lmnn.transform(X_train), y_train)
    score_uint8 = knn.score(lmnn.transform(X_test), y_test)

    knn.fit(lmnn.transform(X_train.astype(float)), y_train)
    score_float = knn.score(lmnn.transform(X_test.astype(float)), y_test)

    assert score_uint8 == score_float
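# A quick illustration (not part of the tests) of the uint8 wrap-around the
# comment above refers to: unsigned arithmetic is modulo 256, so differences
# that should be negative come out large and positive, silently corrupting
# squared distances.
import numpy as np

a = np.array([50], dtype=np.uint8)
b = np.array([200], dtype=np.uint8)
print(a - b)                          # [106] -- wrapped around, not -150
print(a.astype(int) - b.astype(int))  # [-150] -- the correct difference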
def test_convergence_warning():
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, max_iter=2, verbose=1)
    cls_name = lmnn.__class__.__name__
    assert_warns_message(ConvergenceWarning,
                         '[{}] LMNN did not converge'.format(cls_name),
                         lmnn.fit, iris_data, iris_target)
def test_random_state():
    """Assert that when there are more than `max_impostors` impostors
    (forcing sampling), the same impostors are sampled given the same
    `random_state`, while a different `random_state` leads to different
    impostors and hence a different transformation.
    """
    X = iris_data
    y = iris_target
    # Use init='identity' to ensure reproducibility
    params = {'n_neighbors': 3, 'max_impostors': 5, 'random_state': 1,
              'max_iter': 10, 'init': 'identity'}

    lmnn = LargeMarginNearestNeighbor(**params)
    lmnn.fit(X, y)
    transformation_1 = lmnn.components_

    lmnn = LargeMarginNearestNeighbor(**params)
    lmnn.fit(X, y)
    transformation_2 = lmnn.components_

    # This assertion fails on 32bit systems if init='pca'
    assert_allclose(transformation_1, transformation_2)

    params['random_state'] = 2
    lmnn = LargeMarginNearestNeighbor(**params)
    lmnn.fit(X, y)
    transformation_3 = lmnn.components_

    assert not np.allclose(transformation_2, transformation_3)
LMNN learns the metric by solving the following semidefinite program:

.. math::

    \min_{\mathbf{M}} \sum_{i,\, j \in N_i} d(\vec{x}_i, \vec{x}_j)
        + \lambda \sum_{i,j,l} \xi_{ijl}

    \text{s.t. } \forall\, i,\, j \in N_i,\, l : y_l \neq y_i

        d(\vec{x}_i, \vec{x}_j) + 1 - d(\vec{x}_i, \vec{x}_l) \leq \xi_{ijl}

        \xi_{ijl} \geq 0

        \mathbf{M} \succeq 0

For this coursework, the PyLMNN package is used to compute LMNN for metric
learning: https://pypi.org/project/PyLMNN/
"""
# needs: pip install pylmnn
from pylmnn import LargeMarginNearestNeighbor as LMNN

# Set up the hyperparameters
k_train, n_components, max_iter = 5, 25, 1000

# Instantiate the metric learner
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)

# Train the metric learner
lmnn_original = lmnn.fit(original_train_list, Y_train)

# Transform the test set with the learned metric
lmnn_test = lmnn_original.transform(original_test_list)
lmnn_test = lmnn_test.T
print(lmnn_test.shape)

rank_k = list(range(1, lmnn_test.shape[1]))

# Initialise mAP and accuracy scores
avg_prec = 0
rank1_prec = []
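# To make the objective above concrete, here is a minimal NumPy sketch that
# evaluates the pull term and the hinge-based push term of the LMNN loss for
# a fixed linear map L, where M = L.T @ L. The helper `lmnn_loss` and its
# `target_neighbors` argument are illustrative only, not part of PyLMNN.
import numpy as np

def lmnn_loss(L, X, y, target_neighbors, lam=1.0):
    """Evaluate the LMNN objective for a fixed L, with d(a, b) = ||La - Lb||^2."""
    LX = X @ L.T  # project all samples once

    def d(i, j):
        return np.sum((LX[i] - LX[j]) ** 2)

    pull, push = 0.0, 0.0
    for i in range(len(X)):
        impostor_candidates = np.where(y != y[i])[0]  # l with y_l != y_i
        for j in target_neighbors[i]:  # j in N_i: same-class target neighbors
            pull += d(i, j)
            for l in impostor_candidates:
                # slack xi_ijl is the hinge on the margin violation
                push += max(0.0, d(i, j) + 1.0 - d(i, l))
    return pull + lam * push

# Tiny usage example on made-up data:
rng = np.random.RandomState(0)
X_demo = rng.rand(6, 3)
y_demo = np.array([0, 0, 0, 1, 1, 1])
neighbors_demo = [[1], [0], [0], [4], [3], [3]]  # one target neighbor each
print(lmnn_loss(np.eye(3), X_demo, y_demo, neighbors_demo))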
def test_warm_start_effectiveness():
    # A 1-iteration second fit on the same data should give almost the same
    # result with warm starting, and a quite different result without it.
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    n_iter = 10

    lmnn_warm = LargeMarginNearestNeighbor(n_neighbors=3, warm_start=True,
                                           max_iter=n_iter, random_state=0)
    lmnn_warm.fit(X_train, y_train)
    transformation_warm = lmnn_warm.components_
    lmnn_warm.max_iter = 1
    lmnn_warm.fit(X_train, y_train)
    transformation_warm_plus_one = lmnn_warm.components_

    lmnn_cold = LargeMarginNearestNeighbor(n_neighbors=3, warm_start=False,
                                           max_iter=n_iter, random_state=0)
    lmnn_cold.fit(X_train, y_train)
    transformation_cold = lmnn_cold.components_
    lmnn_cold.max_iter = 1
    lmnn_cold.fit(X_train, y_train)
    transformation_cold_plus_one = lmnn_cold.components_

    diff_warm = np.sum(np.abs(transformation_warm_plus_one -
                              transformation_warm))
    diff_cold = np.sum(np.abs(transformation_cold_plus_one -
                              transformation_cold))

    assert diff_warm < 2.0, (
        "Transformer changed significantly after one iteration even "
        "though it was warm-started.")
    assert diff_cold > diff_warm, (
        "Cold-started transformer changed less significantly than "
        "warm-started transformer after one iteration.")
# Embedding: load precomputed training and test embeddings
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from pylmnn import LargeMarginNearestNeighbor  # assuming the PyLMNN package

loreal_data = np.load('/export/home//loreal_135_classification/em_training.npz')
X_train, y_train = loreal_data['X'], loreal_data['y']
loreal_data = np.load('/export/home//loreal_135_classification/em_test.npz')
X_test, y_test = loreal_data['X'], loreal_data['y']

knn = KNeighborsClassifier(n_neighbors=10)

# Train with no transformation (Euclidean metric)
knn.fit(X_train, y_train)

# Test with Euclidean metric
acc = knn.score(X_test, y_test)
print('KNN accuracy on test set: {}'.format(acc))

# LMNN is no longer a classifier but a transformer
lmnn = LargeMarginNearestNeighbor(n_neighbors=10, verbose=1, max_iter=300)
lmnn.fit(X_train, y_train)

# Train with the transformation learned by LMNN
knn.fit(lmnn.transform(X_train), y_train)

# Test with the transformation learned by LMNN
acc = knn.score(lmnn.transform(X_test), y_test)
print('LMNN accuracy on test set: {}'.format(acc))
def test_init_transformation():
    X, y = datasets.make_classification(n_samples=30, n_features=5,
                                        n_redundant=0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Initialize with identity
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init='identity')
    lmnn.fit(X_train, y_train)

    # Initialize with PCA
    lmnn_pca = LargeMarginNearestNeighbor(n_neighbors=3, init='pca')
    lmnn_pca.fit(X_train, y_train)

    # Initialize with a transformation given by the user
    init = np.random.rand(X.shape[1], X.shape[1])
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    lmnn.fit(X_train, y_train)

    # init.shape[1] must match X.shape[1]
    init = np.random.rand(X.shape[1], X.shape[1] + 1)
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    assert_raise_message(ValueError,
                         'The input dimensionality ({}) of the given '
                         'linear transformation `init` must match the '
                         'dimensionality of the given inputs `X` ({}).'.format(
                             init.shape[1], X.shape[1]),
                         lmnn.fit, X_train, y_train)

    # init.shape[0] must be <= init.shape[1]
    init = np.random.rand(X.shape[1] + 1, X.shape[1])
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init)
    assert_raise_message(ValueError,
                         'The output dimensionality ({}) of the given '
                         'linear transformation `init` cannot be '
                         'greater than its input dimensionality ({}).'.format(
                             init.shape[0], init.shape[1]),
                         lmnn.fit, X_train, y_train)

    # init.shape[0] must match n_components
    init = np.random.rand(X.shape[1], X.shape[1])
    n_components = X.shape[1] - 2
    lmnn = LargeMarginNearestNeighbor(n_neighbors=3, init=init,
                                      n_components=n_components)
    assert_raise_message(ValueError,
                         'The preferred embedding dimensionality '
                         '`n_components` ({}) does not match '
                         'the output dimensionality of the given '
                         'linear transformation `init` ({})!'.format(
                             n_components, init.shape[0]),
                         lmnn.fit, X_train, y_train)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from pylmnn import LargeMarginNearestNeighbor as LMNN

# Load a data set
X, y = load_iris(return_X_y=True)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7,
                                                    stratify=y,
                                                    random_state=42)

# Set up the hyperparameters
k_train, k_test, n_components, max_iter = 3, 3, X.shape[1], 180

# Instantiate the metric learner
lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)

# Train the metric learner
lmnn.fit(X_train, y_train)

# Fit the nearest neighbors classifier on the transformed training set
knn = KNeighborsClassifier(n_neighbors=k_test)
knn.fit(lmnn.transform(X_train), y_train)

# Compute the k-nearest neighbor test accuracy after applying
# the learned transformation
lmnn_acc = knn.score(lmnn.transform(X_test), y_test)
print('LMNN accuracy on test set of {} points: {:.4f}'.format(
    X_test.shape[0], lmnn_acc))
import pickle

import numpy as np
from pylmnn import LargeMarginNearestNeighbor as LMNN

csv = np.genfromtxt("data/numerical_train.csv", delimiter=',')
csv_test = np.genfromtxt("data/numerical_test.csv", delimiter=',')

n, d = csv.shape
X_train = csv[:, :d - 1]
y_train = csv[:, -1]
X_test = csv_test[:, :d - 1]
y_test = csv_test[:, -1]

k_train, n_components, max_iter = 7, d - 1, 180

lmnn = LMNN(n_neighbors=k_train, max_iter=max_iter, n_components=n_components)

print('learning the metric...')
# Train the metric learner
lmnn.fit(X_train, y_train)

X_train_transformed = lmnn.transform(X_train)
X_test_transformed = lmnn.transform(X_test)

pickle.dump(X_train_transformed,
            open("data/numerical_train_transformed.pkl", 'wb'))
pickle.dump(y_train, open("data/numerical_train_labels.pkl", 'wb'))
pickle.dump(X_test_transformed,
            open("data/numerical_test_transformed.pkl", 'wb'))
pickle.dump(y_test, open("data/numerical_test_labels.pkl", 'wb'))
acc1, acc2, acc3, acc4 = [], [], [], []
T, T1, T2, T3, T4 = [], [], [], [], []

for k in [9, 11, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29]:
    print('Running K={} ... ... '.format(k))
    t0 = time.time()

    # Learn the LMNN transformation for this k and transform both splits
    lmnn = LMNN(n_neighbors=k, max_iter=200, n_components=x.shape[1])
    lmnn.fit(x_train, y_train)
    x_train_ = lmnn.transform(x_train)
    x_test_ = lmnn.transform(x_test)

    t1 = time.time()
    T.append(t1 - t0)
    print('LMNN Cost:', t1 - t0)

    # k-NN with cosine distance in the learned space
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance',
                               metric='cosine', algorithm='brute')
    knn.fit(x_train_, y_train)
    lmnn_acc = knn.score(x_test_, y_test)
    acc1.append(lmnn_acc)
    t2 = time.time()