Пример #1
0
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Times approximate nearest-neighbor queries over testdata and reports the
    mean query time and the mean distance to the neighbor that was found.
    @param method: str - "hashing" (LocalitySensitiveHash) or "kdtree" (KDTree)
    @param traindata: dict[id => document] - documents used to build the index
    @param testdata: dict[id => document] - documents used as queries
    @param m: passed as m to LocalitySensitiveHash and recorded in the result
    @param alpha: passed to KDTree.nearest and recorded in the result
    @return TestResult with avg_time and avg_distance per query
    @raise ValueError: if method is neither "hashing" nor "kdtree"
    """
    if method not in ("hashing", "kdtree"):
        # Fail early with a clear message; otherwise the timing code at the
        # bottom would trip over an unbound t0.
        raise ValueError("unknown method: %r" % (method,))

    avg_distance = 0
    if method == "hashing":
        # train
        # NOTE(review): the index is built with D=1000 while the result below
        # records the module-level D - confirm that these are meant to agree.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            avg_distance += lsh.nearest_neighbor(testdoc,
                                                 depth=HW2_DEPTH).distance
    else:
        # train; D is not a parameter here and is resolved from module scope
        kdt = KDTree(D)
        for i, document in traindata.iteritems():
            kdt.insert(make_dense(document), i)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            neighbor = kdt.nearest(make_dense(testdoc), alpha)
            # The tree was populated with ids taken from traindata, so the
            # neighbor has to be looked up in traindata as well.
            avg_distance += EvalUtil.distance(testdoc, traindata[neighbor])

    # finish timing, report results
    mean_time = (time.time() - t0) / len(testdata)
    mean_distance = avg_distance / len(testdata)
    return TestResult(method,
                      m=m,
                      D=D,
                      alpha=alpha,
                      avg_time=mean_time,
                      avg_distance=mean_distance)
Пример #2
0
def test_approx_nn(method, traindata, testdata, m, alpha):
    """
    Runs every document of testdata as a nearest-neighbor query against an
    index built from traindata, and reports the average query time and the
    average distance to the returned neighbor.
    @param method: str - "hashing" (LocalitySensitiveHash) or "kdtree" (KDTree)
    @param traindata: dict[id => document] - documents used to build the index
    @param testdata: dict[id => document] - documents used as queries
    @param m: passed as m to LocalitySensitiveHash and recorded in the result
    @param alpha: passed to KDTree.nearest and recorded in the result
    @return TestResult with avg_time and avg_distance per query
    @raise ValueError: if method is neither "hashing" nor "kdtree"
    """
    supported_methods = ("hashing", "kdtree")
    if method not in supported_methods:
        # Without this guard an unsupported method would only surface later
        # as an unbound t0 in the timing code.
        raise ValueError("unknown method: %r" % (method,))

    avg_distance = 0
    if method == "hashing":
        # train
        # NOTE(review): D=1000 is hard-coded here, yet the module-level D is
        # what gets recorded in the TestResult - confirm they should match.
        lsh = LocalitySensitiveHash(traindata, D=1000, m=m)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            nearest = lsh.nearest_neighbor(testdoc, depth=HW2_DEPTH)
            avg_distance += nearest.distance
    else:
        # train; D comes from module scope, it is not a parameter
        kdt = KDTree(D)
        for i, document in traindata.iteritems():
            key = make_dense(document)
            kdt.insert(key, i)
        # time test
        t0 = time.time()
        for _, testdoc in testdata.iteritems():
            key = make_dense(testdoc)
            neighbor = kdt.nearest(key, alpha)
            # Ids stored in the tree come from traindata, so resolve the
            # neighbor there rather than through an unrelated global.
            avg_distance += EvalUtil.distance(testdoc, traindata[neighbor])

    # finish timing, report results
    mean_time = (time.time() - t0) / len(testdata)
    mean_distance = avg_distance / len(testdata)
    return TestResult(method, m=m, D=D, alpha=alpha,
                      avg_time=mean_time, avg_distance=mean_distance)
Пример #3
0
    def check_bin(self, document, hashed_document, cur_nearest):
        """
        Scans the bin that the document maps to (via self.get_bin) and updates
        cur_nearest in place whenever a closer document is found.
        @param document: dict[int => int/float] - the query document
        @param hashed_document: [bool] - hashed document (not used by this implementation)
        @param cur_nearest: NeighborDistance - the currently (approximately) nearest neighbor; mutated in place
        """
        bin_members = self.hashed_documents.get(self.get_bin(document), [])
        for candidate_id in bin_members:
            # Note: if the document is in the dataset it will be its own nearest neighbor
            candidate = self.documents[candidate_id]
            candidate_dist = EvalUtil.distance(document, candidate)

            # Keep the candidate only if it beats the best distance so far
            if candidate_dist < cur_nearest.distance:
                cur_nearest.doc_id = candidate_id
                cur_nearest.distance = candidate_dist
Пример #4
0
def test_kd_tree(n, D, n_test, alphas):
    """
    Tests the query time and distance for a random data set and test set
    @param n: int - the number of points of the dataset
    @param D: int - the dimension of the data points
    @param n_test: int - the number of points to test
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query
    """
    # Use the D argument throughout; previously it was ignored in favour of a
    # module-level constant, so callers could not actually pick the dimension.
    documents = RandomData.random_dataset(n, D)
    test_documents = RandomData.random_dataset(n_test, D)

    rand_tree = KDTree(D)
    for i, document in documents.iteritems():
        # Default to 0 so that a missing coordinate never ends up as None in
        # the tree key.
        key = [document.get(idx, 0) for idx in xrange(0, D)]
        rand_tree.insert(key, i)

    times = []
    for alpha in alphas:
        start_time = time.clock()
        cum_dist = 0.0
        for i, test_document in test_documents.iteritems():
            key = [test_document.get(idx, 0) for idx in xrange(0, D)]
            doc_id = rand_tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        duration = time.clock() - start_time
        times.append(
            TestResult("KDTree", n, D, alpha, duration / n_test,
                       cum_dist / n_test))
    return times
Пример #5
0
def test_kd_tree(train_docs, test_docs, D, alphas):
	"""
	Tests the query time and distance for the given training and testing sets
	@param D: int - the dimension of the data points
	@param alphas: [float] - a set of alphas to test
	@return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query
	"""

	# Populate the tree with the training data
	print "Forming KD-tree"
	tree = KDTree(D)
	for i, document in train_docs.iteritems():
		key = [document.get(idx,0) for idx in xrange(0, D)]
		tree.insert(key, i)
	print "Done"

	times = []
	n = len(test_docs)
	for alpha in alphas:
		print "Computing average lookup time and distance to nearest neighbor for alpha = %d" %alpha
		start_time = time.clock()
		cum_dist = 0.0
		for i, test_doc in test_docs.iteritems():
			key = [test_doc.get(idx,0) for idx in xrange(0, D)]
			doc_id = tree.nearest(key, alpha)
			cum_dist += EvalUtil.distance(test_doc, train_docs[doc_id])
		duration = time.clock() - start_time
		times.append(TestResult("KDTree", n, D, alpha, duration / n, cum_dist / n))
		print "Average distance: %f" %(cum_dist / n)
		print "Average time: %f\n" %(duration / n)
	return times
Пример #6
0
    def check_bin(self, document, hashed_document, cur_nearest):
        """
        Looks through the documents stored in the bin identified by
        hashed_document and returns the closest neighbor known afterwards.
        @param document: dict[int => int/float] - a document
        @param hashed_document: [bool] - hashed document
        @param cur_nearest: NeighborDistance - the currently (approximately) nearest neighbor
        @return NeighborDistance - cur_nearest itself if nothing closer was found, otherwise a new NeighborDistance
        """
        bin_id = self.convert_boolean_array_to_integer(hashed_document)
        candidates = self.hashed_documents.get(bin_id)
        if candidates is None:
            # Nothing was hashed to this bin
            return cur_nearest

        best = cur_nearest
        best_dist = best.distance
        for candidate_id in candidates:
            candidate_dist = EvalUtil.distance(document,
                                               self.documents[candidate_id])
            if candidate_dist < best_dist:
                best_dist = candidate_dist
                best = NeighborDistance(candidate_id, best_dist)

        return best
Пример #7
0
def test_lsh(n, D, n_test, alphas):
    """
    Tests the query time and distance for a random data set and test set
    @param n: int - the number of points of the dataset
    @param D: int - the dimension of the data points
    @param n_test: int - the number of points to test
    @param alphas: [float] - a set of alphas to test
    @return [TestResult] array of objects of class TestResult, which has the average time and distance for a single query
    """
    documents = RandomData.random_dataset(n, D)
    test_documents = RandomData.random_dataset(n_test, D)

    times = []
    for m in ms:
        lsh = LocalitySensitiveHash(documents, D, m)
        print "Finished making locally sensitive hash."
        print "Running for", m, "projections..."
        start_time = time.clock()
        cum_dist = 0.0
        print "Running for the test documents..."
        for i, test_document in test_documents.iteritems():
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc = lsh.nearest_neighbor(test_document, 3)
            doc_id = doc.doc_id
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print "Finished."
        duration = time.clock() - start_time
        times.append(
            TestResult("LSH", n, D, m, duration / n_test, cum_dist / n_test))
    return times
Пример #8
0
def test_kd_tree(documents, test_documents, D, alphas):
    n = len(documents)
    n_test = len(test_documents)

    tree = KDTree(D)
    for i, document in documents.iteritems():
        key = [document.get(idx) for idx in xrange(0, D)]
        tree.insert(key, i)

    print "Finished making random tree."
    times = []
    for alpha in alphas:
        print "Running for alpha", alpha
        start_time = time.clock()
        cum_dist = 0.0
        print "Running for the test documents..."
        for i, test_document in test_documents.iteritems():
            if i%50 == 0:
                print "  ", i, "of", len(test_documents)
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc_id = tree.nearest(key, alpha)
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print "Finished."
        duration = time.clock() - start_time
        times.append(TestResult("KDTree", n, D, alpha, duration / n_test, cum_dist / n_test))
    return times
 def nearest_neighbor(self, document, alpha):
     """
     Looks up the approximate nearest neighbor of a document: the document
     is hashed, the hashed form is used to query self.kdt, and the distance
     to the returned document is measured on the original representation.
     @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
     @param alpha: float - alpha for approximate k-nn
     @return NeighborDistance - id of the neighbor found and its distance to document
     """
     query_key = self.hash_document(document)
     neighbor_id = self.kdt.nearest(query_key, alpha)
     neighbor_doc = self.documents[neighbor_id]
     return NeighborDistance(neighbor_id,
                             EvalUtil.distance(document, neighbor_doc))
 def nearest_neighbor(self, document, alpha):
     """
     Returns the approximate nearest neighbor of the given document. The
     search runs on the hashed document through self.kdt; the reported
     distance is computed between the unhashed documents.
     @param document: dict[int => int/float] - document represented as dictionary of word ids => counts
     @param alpha: float - alpha for approximate k-nn
     @return NeighborDistance - id of the neighbor found and its distance to document
     """
     match_id = self.kdt.nearest(self.hash_document(document), alpha)
     match_dist = EvalUtil.distance(document, self.documents[match_id])
     return NeighborDistance(match_id, match_dist)
Пример #11
0
 def check_bin(self, query_document, hashed_document, cur_nearest):
     """
     Checks the documents that are hashed to the given bin and returns the
     nearest neighbor known after the check.
     @param query_document: dict[int => int/float] - list of word counts by token id
     @param hashed_document: [bool] - hashed document (bin id)
     @param cur_nearest: NeighborDistance - the current candidate for nearest neighbor
     @return NeighborDistance - cur_nearest if the bin holds nothing closer, otherwise a new NeighborDistance
     """
     inthash = self.convert_boolean_array_to_integer(hashed_document)
     if inthash not in self.hashed_documents:
         # Nothing was hashed to this bin
         return cur_nearest
     for doc_id in self.hashed_documents[inthash]:
         doc = self.documents[doc_id]
         dist = EvalUtil.distance(doc, query_document)
         if dist < cur_nearest.distance:
             # Store the document id, not the document itself: the result
             # identifies the neighbor by id.
             cur_nearest = NeighborDistance(doc_id, dist)
     return cur_nearest
Пример #12
0
 def check_bin(self, query_document, hashed_document, cur_nearest):
     """
     Compares the query document with every document stored in the bin
     identified by hashed_document and returns the best neighbor so far.
     @param query_document: dict[int => int/float] - list of word counts by token id
     @param hashed_document: [bool] - hashed document (bin id)
     @param cur_nearest: NeighborDistance - the current candidate for nearest neighbor
     @return NeighborDistance - cur_nearest if the bin holds nothing closer, otherwise a new NeighborDistance
     """
     inthash = self.convert_boolean_array_to_integer(hashed_document)
     if inthash not in self.hashed_documents:
         # Empty bin: the current candidate stays the best one
         return cur_nearest
     this_bin = self.hashed_documents[inthash]
     for doc_id in this_bin:
         dist = EvalUtil.distance(self.documents[doc_id], query_document)
         if dist < cur_nearest.distance:
             # The neighbor is recorded by its id; passing the document dict
             # here would leave callers without a usable doc id.
             cur_nearest = NeighborDistance(doc_id, dist)
     return cur_nearest
Пример #13
0
def test_rptree(traindata, testdata, projdim):
    """
    Builds a GaussianRandomProjection over traindata, queries it with every
    document of testdata (alpha fixed at 1) and reports the mean query time
    and the mean distance to the neighbor returned.
    @param traindata: documents handed to GaussianRandomProjection
    @param testdata: dict[id => document] - documents used as queries
    @param projdim: passed as m to GaussianRandomProjection and recorded in the result
    @return TestResult with avg_time and avg_distance per query
    """
    # train
    rptree = GaussianRandomProjection(traindata, D=DOCDIM, m=projdim)

    # start timer, test
    total_distance = 0
    started_at = time.time()
    for _, query in testdata.iteritems():
        hit = rptree.nearest_neighbor(query, alpha=1)
        matched_doc = rptree.documents[hit.doc_id]
        total_distance += EvalUtil.distance(query, matched_doc)

    # finish timing, report results
    per_query_time = (time.time() - started_at) / len(testdata)
    per_query_distance = total_distance / len(testdata)
    return TestResult(method="rpkdt",
                      m=projdim,
                      D=DOCDIM,
                      alpha=1,
                      avg_time=per_query_time,
                      avg_distance=per_query_distance)
Пример #14
0
def test_grp(documents, test_documents, D, ms):
    times = []
    n = len(documents)
    n_test = len(test_documents)
    for m in ms:
        grp = GaussianRandomProjection(documents, D, m)
        print "Finished making gaussian random projection."
        print "Running for", m, "projections..."
        start_time = time.clock()
        cum_dist = 0.0
        print "Running for the test documents..."
        for i, test_document in test_documents.iteritems():
            if i%50 == 0:
                print "  ", i, "of", len(test_documents)
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc = grp.nearest_neighbor(test_document, 3)
            doc_id = doc.doc_id
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print "Finished."
        duration = time.clock() - start_time
        times.append(TestResult("GRP", n, D, m, \
                                duration / n_test, \
                                cum_dist / n_test))
    return times
Пример #15
0
def test_lsh(documents, test_documents, D, ms):
    times = []
    n = len(documents)
    n_test = len(test_documents)
    for m in ms:
        lsh = LocalitySensitiveHash(documents, D, m)
        print "Finished making locally sensitive hash."
        print "Running for", m, "projections..."
        start_time = time.clock()
        cum_dist = 0.0
        print "Running for the test documents..."
        for i, test_document in test_documents.iteritems():
            if i%50 == 0:
                print "  ", i, "of", len(test_documents)
            key = [test_document.get(idx) for idx in xrange(0, D)]
            doc = lsh.nearest_neighbor(test_document, 3)
            doc_id = doc.doc_id
            cum_dist += EvalUtil.distance(test_document, documents[doc_id])
        print "Finished."
        duration = time.clock() - start_time
        times.append(TestResult("LSH", n, D, m, \
                                duration / n_test, \
                                cum_dist / n_test))
    return times