def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Times approximate nearest-neighbor queries for one method and reports the
    average query time and the average distance to the neighbor found.
    @param method: str - "hashing" (LocalitySensitiveHash) or "kdtree" (KDTree)
    @param traindata: dict-like - documents to index, keyed by document id
    @param testdata: dict-like - query documents, keyed by document id
    @param m: int - number of projections handed to LocalitySensitiveHash
    @param alpha: float - alpha handed to KDTree.nearest
    @return TestResult with avg_time and avg_distance per query
    @raise ValueError: if method is neither "hashing" nor "kdtree"
    """
    # Reject unknown methods up front; otherwise the timer is never started
    # and the function dies with a NameError on t0 further down.
    if method not in ("hashing", "kdtree"):
        raise ValueError(
            "unknown method %r; expected 'hashing' or 'kdtree'" % (method,))

    # Float accumulator so the mean is never truncated by integer division.
    total_distance = 0.0

    if method == "hashing":
        # train
        # NOTE(review): the hash is built with D=1000 while the result below
        # reports the module-level D -- confirm the two agree.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            total_distance += lsh.nearest_neighbor(
                testdoc, depth=HW2_DEPTH).distance
    else:
        # train
        kdt = KDTree(D)
        for doc_id, document in traindata.iteritems():
            kdt.insert(make_dense(document), doc_id)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            neighbor = kdt.nearest(make_dense(testdoc), alpha)
            # The ids stored in the tree come from traindata, so resolve the
            # neighbor against traindata rather than an unrelated global.
            total_distance += EvalUtil.distance(testdoc, traindata[neighbor])

    # finish timing, report results
    elapsed = time.time() - t0
    n_queries = len(testdata)
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=elapsed / n_queries,
                      avg_distance=total_distance / n_queries)
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Benchmarks one approximate nearest-neighbor method: builds the index from
    traindata, then queries it with every document in testdata.
    @param method: str - "hashing" (LocalitySensitiveHash) or "kdtree" (KDTree)
    @param traindata: dict-like - documents to index, keyed by document id
    @param testdata: dict-like - query documents, keyed by document id
    @param m: int - number of projections handed to LocalitySensitiveHash
    @param alpha: float - alpha handed to KDTree.nearest
    @return TestResult with avg_time and avg_distance per query
    @raise ValueError: if method is neither "hashing" nor "kdtree"
    """
    # An unrecognised method used to skip both branches and then fail with a
    # NameError on t0; fail early with a clear message instead.
    if method not in ("hashing", "kdtree"):
        raise ValueError(
            "unknown method %r; expected 'hashing' or 'kdtree'" % (method,))

    # Float accumulator so the mean is never truncated by integer division.
    total_distance = 0.0

    if method == "hashing":
        # train
        # NOTE(review): the hash is built with D=1000 while the result below
        # reports the module-level D -- confirm the two agree.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            total_distance += lsh.nearest_neighbor(
                testdoc, depth=HW2_DEPTH).distance
    else:
        # train
        kdt = KDTree(D)
        for doc_id, document in traindata.iteritems():
            kdt.insert(make_dense(document), doc_id)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            neighbor = kdt.nearest(make_dense(testdoc), alpha)
            # Tree ids were taken from traindata, so look the neighbor up
            # there instead of in a global document table.
            total_distance += EvalUtil.distance(testdoc, traindata[neighbor])

    # finish timing, report results
    elapsed = time.time() - t0
    n_queries = len(testdata)
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=elapsed / n_queries,
                      avg_distance=total_distance / n_queries)
def check_bin(self, document, hashed_document, cur_nearest):
    """
    Scans the bin that `document` falls into and tightens `cur_nearest`
    in place whenever a strictly closer document is found.
    @param document: dict[int => int/float] - the query document
    @param hashed_document: [bool] - hashed document
    @param cur_nearest: NeighborDistance - the currently (approximately) nearest
                        neighbor; its doc_id and distance are overwritten
    """
    # NOTE(review): hashed_document is never read here; the bin is derived
    # from `document` via self.get_bin -- confirm that is intended.
    bins = self.hashed_documents
    candidate_ids = bins.get(self.get_bin(document), [])

    for candidate_id in candidate_ids:
        # Note: if the document is in the dataset it will be its own
        # nearest neighbor
        candidate_dist = EvalUtil.distance(
            document, self.documents[candidate_id])
        if candidate_dist < cur_nearest.distance:
            cur_nearest.doc_id = candidate_id
            cur_nearest.distance = candidate_dist
def test_kd_tree(n, D, n_test, alphas):
    """
    Tests the query time and distance for a random data set and test set
    @param n: int - the number of points of the dataset
    @param D: int - the dimension of the data points
    @param n_test: int - the number of points to test
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has
            the average time and distance for a single query
    """
    # Use the caller-supplied dimension D throughout; the previous version
    # ignored it and read the module-level DOCDIM instead.
    documents = RandomData.random_dataset(n, D)
    test_documents = RandomData.random_dataset(n_test, D)

    rand_tree = KDTree(D)
    for doc_id, document in documents.iteritems():
        key = [document.get(idx) for idx in xrange(0, D)]
        rand_tree.insert(key, doc_id)

    times = []
    for alpha in alphas:
        start_time = time.clock()
        cum_dist = 0.0
        for _, test_document in test_documents.iteritems():
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc_id = rand_tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        duration = time.clock() - start_time
        times.append(TestResult("KDTree", n, D, alpha,
                                duration / n_test, cum_dist / n_test))
    return times
def test_kd_tree(train_docs, test_docs, D, alphas): """ Tests the query time and distance for the given training and testing sets @param D: int - the dimension of the data points @param alphas: [float] - a set of alphas to test @return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query """ # Populate the tree with the training data print "Forming KD-tree" tree = KDTree(D) for i, document in train_docs.iteritems(): key = [document.get(idx,0) for idx in xrange(0, D)] tree.insert(key, i) print "Done" times = [] n = len(test_docs) for alpha in alphas: print "Computing average lookup time and distance to nearest neighbor for alpha = %d" %alpha start_time = time.clock() cum_dist = 0.0 for i, test_doc in test_docs.iteritems(): key = [test_doc.get(idx,0) for idx in xrange(0, D)] doc_id = tree.nearest(key, alpha) cum_dist += EvalUtil.distance(test_doc, train_docs[doc_id]) duration = time.clock() - start_time times.append(TestResult("KDTree", n, D, alpha, duration / n, cum_dist / n)) print "Average distance: %f" %(cum_dist / n) print "Average time: %f\n" %(duration / n) return times
def check_bin(self, document, hashed_document, cur_nearest):
    """
    Looks through the bin identified by `hashed_document` and returns the
    closest neighbor seen so far.
    @param document: dict[int => int/float] - a document
    @param hashed_document: [bool] - hashed document
    @param cur_nearest: NeighborDistance - the currently (approximately) nearest neighbor
    @return NeighborDistance - `cur_nearest` itself when the bin is missing or
            holds nothing closer, otherwise a new NeighborDistance for the
            closest document in the bin
    """
    bin_key = self.convert_boolean_array_to_integer(hashed_document)
    members = self.hashed_documents.get(bin_key)
    if members is None:
        # Nothing was hashed to this bin
        return cur_nearest

    best = cur_nearest
    best_dist = cur_nearest.distance
    for member_id in members:
        member_dist = EvalUtil.distance(document, self.documents[member_id])
        if member_dist < best_dist:
            best_dist = member_dist
            best = NeighborDistance(member_id, best_dist)
    return best
def test_lsh(n, D, n_test, alphas): """ Tests the query time and distance for a random data set and test set @param n: int - the number of points of the dataset @param D: int - the dimension of the data points @param n_test: int - the number of points to test @param alphas: [float] - a set of alphas to test @return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query """ documents = RandomData.random_dataset(n, D) test_documents = RandomData.random_dataset(n_test, D) times = [] for m in ms: lsh = LocalitySensitiveHash(documents, D, m) print "Finished making locally sensitive hash." print "Running for", m, "projections..." start_time = time.clock() cum_dist = 0.0 print "Running for the test documents..." for i, test_document in test_documents.iteritems(): key = [test_document.get(idx) for idx in xrange(0, D)] doc = lsh.nearest_neighbor(test_document, 3) doc_id = doc.doc_id cum_dist += EvalUtil.distance(test_document, documents[doc_id]) print "Finished." duration = time.clock() - start_time times.append( TestResult("LSH", n, D, m, duration / n_test, cum_dist / n_test)) return times
def test_kd_tree(documents, test_documents, D, alphas): n = len(documents) n_test = len(test_documents) tree = KDTree(D) for i, document in documents.iteritems(): key = [document.get(idx) for idx in xrange(0, D)] tree.insert(key, i) print "Finished making random tree." times = [] for alpha in alphas: print "Running for alpha", alpha start_time = time.clock() cum_dist = 0.0 print "Running for the test documents..." for i, test_document in test_documents.iteritems(): if i%50 == 0: print " ", i, "of", len(test_documents) key = [test_document.get(idx) for idx in xrange(0, D)] doc_id = tree.nearest(key, alpha) cum_dist += EvalUtil.distance(test_document, documents[doc_id]) print "Finished." duration = time.clock() - start_time times.append(TestResult("KDTree", n, D, alpha, duration / n_test, cum_dist / n_test)) return times
def nearest_neighbor(self, document, alpha):
    """
    Looks up the approximate nearest neighbor of `document`: the document is
    hashed, the hashed form is used to query self.kdt, and the distance is
    then computed between `document` and the stored document that was found.
    @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
    @param alpha: float - alpha for approximate k-nn
    @return NeighborDistance - id of the neighbor found and its distance to `document`
    """
    query_key = self.hash_document(document)
    match_id = self.kdt.nearest(query_key, alpha)
    matched_document = self.documents[match_id]
    match_distance = EvalUtil.distance(document, matched_document)
    return NeighborDistance(match_id, match_distance)
def check_bin(self, query_document, hashed_document, cur_nearest):
    """
    Checks the documents that are hashed to the given bin and returns the
    nearest neighbor found so far.
    @param query_document: dict[int => int/float] - list of word counts by token id
    @param hashed_document: [bool] - hashed document (bin id)
    @param cur_nearest: NeighborDistance - the current candidate for nearest neighbor
    @return NeighborDistance - `cur_nearest` if the bin is missing or holds
            nothing closer, otherwise a new NeighborDistance carrying the
            id and distance of the closest document in the bin
    """
    bin_id = self.convert_boolean_array_to_integer(hashed_document)
    # dict.get replaces the deprecated has_key and yields an empty bin when
    # no document was hashed to bin_id.
    for doc_id in self.hashed_documents.get(bin_id, ()):
        dist = EvalUtil.distance(self.documents[doc_id], query_document)
        if dist < cur_nearest.distance:
            # Store the document *id*; the previous version stored the
            # document itself where the id is expected.
            cur_nearest = NeighborDistance(doc_id, dist)
    return cur_nearest
def test_rptree(traindata, testdata, projdim):
    """
    Benchmarks GaussianRandomProjection: builds it from traindata, queries it
    with every document in testdata (alpha fixed at 1) and reports the
    average query time and average distance.
    @param traindata: dict-like - documents handed to GaussianRandomProjection
    @param testdata: dict-like - query documents, keyed by id
    @param projdim: int - passed to GaussianRandomProjection as m
    @return TestResult for method "rpkdt" with avg_time and avg_distance per query
    """
    distance_sum = 0

    # train (not included in the timing)
    projection = GaussianRandomProjection(traindata, D=DOCDIM, m=projdim)

    # start timer, test
    started = time.time()
    for _, query in testdata.iteritems():
        hit = projection.nearest_neighbor(query, alpha=1)
        matched_document = projection.documents[hit.doc_id]
        distance_sum += EvalUtil.distance(query, matched_document)

    # finish timing, report results
    per_query_time = (time.time() - started) / len(testdata)
    per_query_distance = distance_sum / len(testdata)
    return TestResult(
        method="rpkdt",
        m=projdim,
        D=DOCDIM,
        alpha=1,
        avg_time=per_query_time,
        avg_distance=per_query_distance,
    )
def test_grp(documents, test_documents, D, ms): times = [] n = len(documents) n_test = len(test_documents) for m in ms: grp = GaussianRandomProjection(documents, D, m) print "Finished making gaussian random projection." print "Running for", m, "projections..." start_time = time.clock() cum_dist = 0.0 print "Running for the test documents..." for i, test_document in test_documents.iteritems(): if i%50 == 0: print " ", i, "of", len(test_documents) key = [test_document.get(idx) for idx in xrange(0, D)] doc = grp.nearest_neighbor(test_document, 3) doc_id = doc.doc_id cum_dist += EvalUtil.distance(test_document, documents[doc_id]) print "Finished." duration = time.clock() - start_time times.append(TestResult("GRP", n, D, m, \ duration / n_test, \ cum_dist / n_test)) return times
def test_lsh(documents, test_documents, D, ms): times = [] n = len(documents) n_test = len(test_documents) for m in ms: lsh = LocalitySensitiveHash(documents, D, m) print "Finished making locally sensitive hash." print "Running for", m, "projections..." start_time = time.clock() cum_dist = 0.0 print "Running for the test documents..." for i, test_document in test_documents.iteritems(): if i%50 == 0: print " ", i, "of", len(test_documents) key = [test_document.get(idx) for idx in xrange(0, D)] doc = lsh.nearest_neighbor(test_document, 3) doc_id = doc.doc_id cum_dist += EvalUtil.distance(test_document, documents[doc_id]) print "Finished." duration = time.clock() - start_time times.append(TestResult("LSH", n, D, m, \ duration / n_test, \ cum_dist / n_test)) return times