class ImageLSH(object):
    """LSH index built from every ``*.mat`` file found under a directory.

    Construction walks ``input_dir`` recursively, converts each matching file
    into a node via ``get_node`` and indexes ``node.feature`` in an ``LSHash``
    table with ``node.path`` stored as extra data. Progress and timing
    messages are printed to stdout.
    """

    def __init__(self, input_dir, get_node=default_get_node,
                 hash_size=8, input_dim=784):
        """Build the index from the ``.mat`` files below ``input_dir``.

        Args:
            input_dir: Root directory that is searched recursively.
            get_node: Callable taking a file path and returning an object
                exposing ``feature`` and ``path`` attributes.
            hash_size: First positional argument passed to ``LSHash``.
            input_dim: Second positional argument passed to ``LSHash``.
                NOTE(review): presumably must equal the length of each
                ``node.feature`` -- confirm against ``get_node``.
        """
        self.last_time = time.time()
        self.get_node = get_node
        self.hash_size = hash_size
        self.input_dim = input_dim
        self._create_from_dir_recur(input_dir)

    def record(self):
        """Print the seconds elapsed since the previous checkpoint and reset it."""
        last = self.last_time
        self.last_time = time.time()
        print("Spent: %s" % (self.last_time - last,))

    def _create_from_dir_recur(self, input_dir):
        """Collect all ``*.mat`` files under ``input_dir`` and index them."""
        print("Start searching for mat files in %s" % (input_dir,))
        matches = []
        for root, _dirnames, filenames in os.walk(input_dir):
            matches.extend(
                os.path.join(root, filename)
                for filename in fnmatch.filter(filenames, '*.mat')
            )
        # The original message never supplied the count for its %d placeholder.
        print("Found %d files." % len(matches))
        self.record()

        self.lsh = LSHash(self.hash_size, self.input_dim)
        for mat_path in matches:
            node = self.get_node(mat_path)
            self.lsh.index(node.feature, extra_data=node.path)
        print("Done constructing tree.")
        self.record()

    def get_nearest_neighbors(self, mat_file, k):
        """Return up to ``k`` indexed neighbours of the node for ``mat_file``.

        The file is converted with ``self.get_node`` and its ``feature`` is
        passed to ``self.lsh.query(..., num_results=k)``; whatever that call
        returns is returned unchanged. The elapsed time of the whole lookup
        is printed before returning.
        """
        # Timed with a local start value: the original elapsed-time message
        # was placed after ``return`` (never executed) and read an attribute,
        # ``self.start_time``, that is not set anywhere in this class.
        started = time.time()
        node = self.get_node(mat_file)
        neighbors = self.lsh.query(node.feature, num_results=k)
        print("Done searching. Elapsed: %s" % (time.time() - started,))
        return neighbors
def brute_force(train_points, test_point, k):
    """Return the ``k`` training points closest to ``test_point``.

    Distances are computed with ``l2(test_point, p)`` for every ``p`` in
    ``train_points``.

    Args:
        train_points: Iterable of points to search.
        test_point: Query point.
        k: Number of neighbours wanted; fewer are returned when
            ``train_points`` holds fewer than ``k`` items.

    Returns:
        A list of ``(point, distance)`` tuples ordered by ascending distance,
        with ties kept in their input order.
    """
    # Function-scope import keeps this block self-contained.
    import heapq

    # nsmallest(k, it, key) is documented as equivalent to
    # sorted(it, key=key)[:k] but only retains k candidates at a time, so no
    # full list of pairs is built and sorted just to keep the first few.
    scored = ((p, l2(test_point, p)) for p in train_points)
    return heapq.nsmallest(k, scored, key=lambda pair: pair[1])


if __name__ == "__main__":
    # Benchmark: time LSH queries against exhaustive search on random data.
    num_dimension = 2000
    num_samples = 100000
    num_test = 10
    k = 5

    X = np.random.uniform(0, 100, size=(num_samples, num_dimension))
    Y = np.random.uniform(0, 100, size=(num_test, num_dimension))
    # Iterating a 2-D array yields its rows, i.e. the same X[i, :] views.
    train_points = list(X)

    lsh = LSHash(8, num_dimension)
    start_cons = time.time()
    for point in X:
        lsh.index(point)
    print("done construction in %s" % (time.time() - start_cons,))

    for test_point in Y:
        start_time = time.time()
        lsh_neighbors = lsh.query(test_point, num_results=k,
                                  distance_func='true_euclidean')
        done_lsh = time.time()
        brute_force_neighbors = brute_force(train_points, test_point, k)
        done_brute_force = time.time()

        print("lsh in: %s" % (done_lsh - start_time,))
        print("brute-force in: %s" % (done_brute_force - done_lsh,))

        # Sanity checks: both searches are expected to yield exactly k results.
        assert len(lsh_neighbors) == k
        assert len(brute_force_neighbors) == k