示例#1
0
    def test_insert(self):
        """Check ``add()`` bookkeeping and the effect of ``index()``.

        Two weighted MinHashes are added under the keys "a" and "b". The
        test then expects:

        * ``is_empty()`` to be true before ``index()`` and false after it;
        * every table in ``forest.hashtables`` to be non-empty and to hold
          both keys;
        * both keys to be reported by the ``in`` operator on the forest;
        * each hash in ``forest.keys["a"]`` to lead back to "a" in the hash
          table at the same position;
        * ``add()`` to raise ``ValueError`` for a MinHash produced by
          ``WeightedMinHashGenerator(10, 5)``.
        """
        forest = MinHashLSHForest()
        generator = WeightedMinHashGenerator(10)
        # NOTE(review): the vectors come from the global NumPy RNG and no
        # seed is set in this method, so inputs differ between runs.
        m1 = generator.minhash(np.random.uniform(1, 10, 10))
        m2 = generator.minhash(np.random.uniform(1, 10, 10))
        forest.add("a", m1)
        forest.add("b", m2)

        # Keys were added but index() has not run yet.
        self.assertTrue(forest.is_empty())
        for table in forest.hashtables:
            self.assertGreaterEqual(len(table), 1)
            stored_keys = [key for H in table for key in table[H]]
            self.assertIn("a", stored_keys)
            self.assertIn("b", stored_keys)
        self.assertIn("a", forest)
        self.assertIn("b", forest)
        for i, H in enumerate(forest.keys["a"]):
            self.assertIn("a", forest.hashtables[i][H])

        forest.index()
        self.assertFalse(forest.is_empty())

        # NOTE(review): the second generator argument is presumably the
        # sample size, making m3 too short for this forest - confirm.
        small_generator = WeightedMinHashGenerator(10, 5)
        m3 = small_generator.minhash(np.random.uniform(1, 10, 10))
        with self.assertRaises(ValueError):
            forest.add("c", m3)
示例#2
0
 def test_pickle(self):
     """Check that an indexed forest still answers queries after pickling.

     Two weighted MinHashes are added as "a" and "b", the forest is
     indexed, and a copy is made with ``pickle.dumps``/``pickle.loads``.
     ``query(m1, 2)`` on the copy must return both keys.
     """
     forest = MinHashLSHForest()
     generator = WeightedMinHashGenerator(10)
     # NOTE(review): the vectors come from the global NumPy RNG and no
     # seed is set in this method, so inputs differ between runs.
     m1 = generator.minhash(np.random.uniform(1, 10, 10))
     m2 = generator.minhash(np.random.uniform(1, 10, 10))
     forest.add("a", m1)
     forest.add("b", m2)
     forest.index()

     # The pickled bytes are produced right here, not read from outside.
     restored = pickle.loads(pickle.dumps(forest))
     result = restored.query(m1, 2)
     self.assertIn("a", result)
     self.assertIn("b", result)
示例#3
0
 def _setup(self):
     """Build and index a forest over 3-letter windows of the alphabet.

     One MinHash is created for each of the 24 consecutive 3-letter
     windows of "a".."z" ("abc", "bcd", ..., "xyz"), updated with the
     UTF-8 bytes of each letter in the window, and added under the
     window's first letter.

     Returns:
         The ``MinHashLSHForest`` after ``index()`` has been called.
     """
     alphabet = "abcdefghijklmnopqrstuvwxyz"
     window = 3
     forest = MinHashLSHForest()
     # Dropping the last two letters leaves exactly the window start points.
     for start, key in enumerate(alphabet[:-2]):
         m = MinHash()
         for letter in alphabet[start:start + window]:
             m.update(letter.encode("utf8"))
         forest.add(key, m)
     forest.index()
     return forest
示例#4
0
 def test_pickle(self):
     """Check that an indexed forest still answers queries after pickling.

     Two MinHashes, updated with b"a" and b"b", are added as "a" and "b".
     The forest is indexed and copied with ``pickle.dumps``/
     ``pickle.loads``. Querying the copy with each MinHash and a result
     size of 1 must return that MinHash's own key.
     """
     forest = MinHashLSHForest()
     m1 = MinHash()
     m1.update("a".encode("utf8"))
     m2 = MinHash()
     m2.update("b".encode("utf8"))
     forest.add("a", m1)
     forest.add("b", m2)
     forest.index()

     # The pickled bytes are produced right here, not read from outside.
     restored = pickle.loads(pickle.dumps(forest))
     self.assertIn("a", restored.query(m1, 1))
     self.assertIn("b", restored.query(m2, 1))
示例#5
0
    def test_query(self):
        """Check ``query()`` results and its rejection of a bad MinHash.

        Two weighted MinHashes are added as "a" and "b" and the forest is
        indexed. ``query(m1, 2)`` must return both keys, and ``query()``
        must raise ``ValueError`` for a MinHash produced by
        ``WeightedMinHashGenerator(10, 5)``.
        """
        forest = MinHashLSHForest()
        generator = WeightedMinHashGenerator(10)
        # NOTE(review): the vectors come from the global NumPy RNG and no
        # seed is set in this method, so inputs differ between runs.
        m1 = generator.minhash(np.random.uniform(1, 10, 10))
        m2 = generator.minhash(np.random.uniform(1, 10, 10))
        forest.add("a", m1)
        forest.add("b", m2)
        forest.index()

        result = forest.query(m1, 2)
        self.assertIn("a", result)
        self.assertIn("b", result)

        # NOTE(review): the second generator argument is presumably the
        # sample size, making m3 too short for this forest - confirm.
        small_generator = WeightedMinHashGenerator(10, 5)
        m3 = small_generator.minhash(np.random.uniform(1, 10, 10))
        with self.assertRaises(ValueError):
            forest.query(m3, 1)
示例#6
0
 def test_add_index(self):
     """Check ``add()`` bookkeeping and the effect of ``index()``.

     Two MinHashes, updated with b"a" and b"b", are added as "a" and "b".
     The test then expects:

     * ``is_empty()`` to be true before ``index()`` and false after it;
     * every table in ``forest.hashtables`` to be non-empty and to hold
       both keys;
     * both keys to be reported by the ``in`` operator on the forest;
     * each hash in ``forest.keys["a"]`` to lead back to "a" in the hash
       table at the same position;
     * ``add()`` to raise ``ValueError`` for ``MinHash(18)``.
     """
     forest = MinHashLSHForest()
     m1 = MinHash()
     m1.update("a".encode("utf8"))
     m2 = MinHash()
     m2.update("b".encode("utf8"))
     forest.add("a", m1)
     forest.add("b", m2)

     # Keys were added but index() has not run yet.
     self.assertTrue(forest.is_empty())
     for table in forest.hashtables:
         self.assertGreaterEqual(len(table), 1)
         stored_keys = [key for H in table for key in table[H]]
         self.assertIn("a", stored_keys)
         self.assertIn("b", stored_keys)
     self.assertIn("a", forest)
     self.assertIn("b", forest)
     for i, H in enumerate(forest.keys["a"]):
         self.assertIn("a", forest.hashtables[i][H])

     # NOTE(review): 18 is presumably the number of permutations, too few
     # for the default forest - confirm against the MinHash constructor.
     m3 = MinHash(18)
     with self.assertRaises(ValueError):
         forest.add("c", m3)

     forest.index()
     self.assertFalse(forest.is_empty())