def test_sample_training():
    """Check query precision of a forest fitted on a sample of the data.

    For each ``(no_trees, expected_precision)`` pair an ``RPForest`` is fitted
    on the first quarter of the training rows, ``clear()`` is called, and
    every training row is then indexed under its row number. For each test
    vector the ten ids returned by ``query`` are compared with the ten
    training rows of largest dot product; the mean overlap fraction across
    the test vectors must be at least ``expected_precision``.
    """
    X_train, X_test = _get_mnist_data()

    for no_trees, expected_precision in ((1, 0.05), (5, 0.3), (10, 0.5), (50, 0.9)):
        tree = RPForest(leaf_size=10, no_trees=no_trees)

        # Fit on quarter of data. Floor division keeps the slice bound an
        # integer; true division produces a float, which cannot be used as a
        # slice index.
        X_sample = X_train[:X_train.shape[0] // 4]
        tree.fit(X_sample)

        # Clear and index everything
        tree.clear()
        for i, x in enumerate(X_train):
            tree.index(i, x)
        tree._X = X_train

        precision = 0.0
        # Divide every training row by its L2 norm so the dot products below
        # rank rows by cosine similarity to the query.
        # NOTE(review): this runs on every pass of the outer loop and rebinds
        # the same name that was assigned to tree._X above -- confirm that is
        # intended.
        X_train /= np.linalg.norm(X_train, axis=1)[:, np.newaxis]
        for x_test in X_test:
            # Brute-force reference: ten training rows with largest dot product.
            true_nns = np.argsort(-np.dot(X_train, x_test))[:10]
            nns = tree.query(x_test, 10)[:10]
            precision += len(set(nns) & set(true_nns)) / 10.0
        precision /= X_test.shape[0]

        assert precision >= expected_precision
## RPFOREST TEST
# Builds an RPForest over `features`, re-indexes every row under the label
# found in `dict_feat`, then queries each row for candidate neighbours.
# `features` and `dict_feat` are expected to exist before this section runs.
from rpforest import RPForest

leaf_size = 5
n_trees = 20
name = 'RPForest(leaf_size=%d, n_trees=%d)' % (leaf_size, n_trees)
model = RPForest(leaf_size=leaf_size, no_trees=n_trees)

# fitting
features = features.copy(order='C')  # something related to Cython error
model.fit(features)
model.clear()

# indexing
for i, x in enumerate(features):
    t = Timer()
    with t:
        model.index(dict_feat[i], x.tolist())

# querying
for i, feature in enumerate(features):
    t = Timer()
    with t:
        results = model.get_candidates(feature)
    # One pre-formatted string passed to print(...) emits the same text on
    # Python 2 and Python 3; the bare print statement is a syntax error on 3.
    # NOTE(review): the original layout did not show whether this line sat
    # inside the timed block; it is placed after it here -- confirm.
    print('queried %s results %s' % (dict_feat[i], results))

import timeit


# NOTE(review): Timer is instantiated above this definition; the class (and
# the timeit import) must already be defined when the loops run. Only the
# start of the class is visible in this section, so it is left as found.
class Timer:
    def __init__(self, timer=None, disable_gc=False, verbose=True):
        if timer is None:
            timer = timeit.default_timer