async def test_insert_mongo(self):
    """Insert two weighted MinHashes into a Mongo-configured async LSH index.

    After inserting keys "a" and "b", every hashtable must be non-empty
    and must hold both keys, ``has_key`` must report both keys, and each
    bucket recorded under ``lsh.keys`` for "a" must contain "a" in the
    hashtable at the same position. Finally, inserting a MinHash built
    by a generator constructed with 5 instead of 4 as its second
    argument must raise ``ValueError``.
    """
    async with AsyncMinHashLSH(
        storage_config=self._storage_config_mongo, threshold=0.5, num_perm=4
    ) as lsh:
        generator = WeightedMinHashGenerator(10, 4)
        first = generator.minhash(np.random.uniform(1, 10, 10))
        second = generator.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", first)
        await lsh.insert("b", second)

        for table in lsh.hashtables:
            self.assertGreaterEqual(await table.size(), 1)
            stored = []
            for bucket in await table.keys():
                stored.extend(await table.get(bucket))
            self.assertIn("a", stored)
            self.assertIn("b", stored)

        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        for position, bucket in enumerate(await lsh.keys.get("a")):
            self.assertIn("a", await lsh.hashtables[position].get(bucket))

        # The index was created with num_perm=4; this MinHash comes from a
        # generator built with 5, so the insert is expected to be rejected.
        mismatched = WeightedMinHashGenerator(10, 5).minhash(
            np.random.uniform(1, 10, 10)
        )
        with self.assertRaises(ValueError):
            await lsh.insert("c", mismatched)
def eg2():
    """Demo: weighted Jaccard estimates and a weighted LSH query.

    Builds weighted MinHashes for ``v1``, ``v2`` and ``v3`` (names that
    are defined outside this function), prints the estimated Jaccard
    similarity of the first against the other two, then indexes the
    second and third under the keys "m2" and "m3" and prints the result
    of querying the index with the first.
    """
    generator = WeightedMinHashGenerator(10, 5)
    query_hash = generator.minhash(v1)
    second_hash = generator.minhash(v2)
    third_hash = generator.minhash(v3)

    for label, other in (("m2", second_hash), ("m3", third_hash)):
        if label == "m2":
            print("Estimated Jaccard m1, m2", query_hash.jaccard(other))
        else:
            print("Estimated Jaccard m1, m3", query_hash.jaccard(other))

    # Build the LSH index from the two non-query signatures.
    index = WeightedMinHashLSH(threshold=0.1, sample_size=5)
    index.insert("m2", second_hash)
    index.insert("m3", third_hash)

    neighbours = index.query(query_hash)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1",
          neighbours)
def test_query(self):
    """Query a weighted LSH index with the MinHashes that were inserted.

    Each inserted MinHash must return its own key when used as a query.
    Querying with a MinHash from a generator constructed with 5 instead
    of 4 as its second argument must raise ``ValueError``.
    """
    index = WeightedMinHashLSH(threshold=0.5, sample_size=4)
    generator = WeightedMinHashGenerator(10, 4)
    first = generator.minhash(np.random.uniform(1, 10, 10))
    second = generator.minhash(np.random.uniform(1, 10, 10))
    index.insert("a", first)
    index.insert("b", second)

    self.assertIn("a", index.query(first))
    self.assertIn("b", index.query(second))

    mismatched = WeightedMinHashGenerator(10, 5).minhash(
        np.random.uniform(1, 10, 10)
    )
    with self.assertRaises(ValueError):
        index.query(mismatched)
def test_minhash(self):
    """Check the type, length and dtype of a generated weighted MinHash.

    A generator built with ``(2, 4, 1)`` must produce a
    ``WeightedMinHash`` whose ``hashvalues`` array and whose own
    ``len()`` are both 4, and whose ``hashvalues`` use NumPy's default
    integer dtype.
    """
    mg = WeightedMinHashGenerator(2, 4, 1)
    m = mg.minhash([1, 3])
    self.assertIsInstance(m, WeightedMinHash)
    self.assertEqual(len(m.hashvalues), 4)
    self.assertEqual(len(m), 4)
    # ``np.int`` was merely an alias of the builtin ``int``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, where accessing it
    # raises AttributeError. ``np.dtype(int)`` is the same dtype the
    # alias compared against and works on every NumPy version.
    self.assertEqual(m.hashvalues.dtype, np.dtype(int))
def test_minhash(self):
    """Check the type, length and dtype of a generated weighted MinHash.

    A generator built with ``(2, 4, 1)`` must produce a
    ``WeightedMinHash`` whose ``hashvalues`` array and whose own
    ``len()`` are both 4, and whose ``hashvalues`` use NumPy's default
    integer dtype.
    """
    mg = WeightedMinHashGenerator(2, 4, 1)
    m = mg.minhash([1, 3])
    self.assertIsInstance(m, WeightedMinHash)
    self.assertEqual(len(m.hashvalues), 4)
    self.assertEqual(len(m), 4)
    # ``np.int`` was merely an alias of the builtin ``int``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, where accessing it
    # raises AttributeError. ``np.dtype(int)`` is the same dtype the
    # alias compared against and works on every NumPy version.
    self.assertEqual(m.hashvalues.dtype, np.dtype(int))
async def test_query_mongo(self):
    """Query a Mongo-configured async LSH index with inserted MinHashes.

    Each inserted MinHash must return its own key when used as a query.
    Querying with a MinHash from a generator constructed with 5 instead
    of 4 as its second argument must raise ``ValueError``.
    """
    async with AsyncMinHashLSH(
        storage_config=self._storage_config_mongo, threshold=0.5, num_perm=4
    ) as lsh:
        generator = WeightedMinHashGenerator(10, 4)
        first = generator.minhash(np.random.uniform(1, 10, 10))
        second = generator.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", first)
        await lsh.insert("b", second)

        self.assertIn("a", await lsh.query(first))
        self.assertIn("b", await lsh.query(second))

        mismatched = WeightedMinHashGenerator(10, 5).minhash(
            np.random.uniform(1, 10, 10)
        )
        with self.assertRaises(ValueError):
            await lsh.query(mismatched)
def test_insert(self):
    """Insert two weighted MinHashes into a weighted LSH index.

    Every hashtable must be non-empty and must hold both inserted keys.
    Inserting a MinHash from a generator constructed with 5 instead of 4
    as its second argument must raise ``ValueError``.
    """
    index = WeightedMinHashLSH(threshold=0.5, sample_size=4)
    generator = WeightedMinHashGenerator(10, 4)
    first = generator.minhash(np.random.uniform(1, 10, 10))
    second = generator.minhash(np.random.uniform(1, 10, 10))
    index.insert("a", first)
    index.insert("b", second)

    for table in index.hashtables:
        self.assertGreaterEqual(len(table), 1)
        stored = []
        for bucket in table:
            stored.extend(table[bucket])
        self.assertIn("a", stored)
        self.assertIn("b", stored)

    mismatched = WeightedMinHashGenerator(10, 5).minhash(
        np.random.uniform(1, 10, 10)
    )
    with self.assertRaises(ValueError):
        index.insert("c", mismatched)
def test_pickle(self):
    """Check that a weighted LSH index still answers queries after pickling.

    Two MinHashes are inserted under "a" and "b", the index is
    round-tripped through ``pickle.dumps`` / ``pickle.loads``, and the
    restored index must return each key for its own MinHash.
    """
    lsh = WeightedMinHashLSH(threshold=0.5, sample_size=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)

    lsh2 = pickle.loads(pickle.dumps(lsh))

    # Query the restored copy, not the original: querying ``lsh`` would
    # pass even if pickling lost the index contents.
    result = lsh2.query(m1)
    self.assertIn("a", result)
    result = lsh2.query(m2)
    self.assertIn("b", result)
def test__H(self):
    """Check that all hashtable keys of an index have the same length.

    For several random weighted MinHashes (sample size 128), a fresh
    ``MinHashLSH(num_perm=128)`` is filled with one entry and the
    ``len()`` of every key across all of its hashtables is compared.
    """
    generator = WeightedMinHashGenerator(100, sample_size=128)
    # The range only fixes the number of repetitions; its values are unused.
    for _ in range(2, generator.sample_size + 1, 16):
        weighted = generator.minhash(np.random.randint(1, 99999999, 100))
        index = MinHashLSH(num_perm=128)
        index.insert("m", weighted)
        key_lengths = {
            len(bucket_key)
            for table in index.hashtables
            for bucket_key in table
        }
        # At most one distinct length means every key has the same size.
        self.assertTrue(len(key_lengths) <= 1)
def test_pickle(self):
    """Check that a MinHashLSH holding weighted MinHashes survives pickling.

    Two weighted MinHashes are inserted under "a" and "b", the index is
    round-tripped through ``pickle.dumps`` / ``pickle.loads``, and the
    restored index must return each key for its own MinHash.
    """
    # Local import keeps this test self-contained; previously the test
    # never pickled anything despite its name.
    import pickle

    lsh = MinHashLSH(threshold=0.5, num_perm=4)
    mg = WeightedMinHashGenerator(10, 4)
    m1 = mg.minhash(np.random.uniform(1, 10, 10))
    m2 = mg.minhash(np.random.uniform(1, 10, 10))
    lsh.insert("a", m1)
    lsh.insert("b", m2)

    lsh2 = pickle.loads(pickle.dumps(lsh))

    result = lsh2.query(m1)
    self.assertIn("a", result)
    result = lsh2.query(m2)
    self.assertIn("b", result)
def test_insert(self):
    """Insert two weighted MinHashes and verify the index bookkeeping.

    Every hashtable must be non-empty and must hold both inserted keys,
    both keys must be reported by ``in`` on the index, and each bucket
    recorded under ``lsh.keys["a"]`` must contain "a" in the hashtable
    at the same position. Inserting a MinHash from a generator
    constructed with 5 instead of 4 as its second argument must raise
    ``ValueError``.
    """
    index = WeightedMinHashLSH(threshold=0.5, sample_size=4)
    generator = WeightedMinHashGenerator(10, 4)
    first = generator.minhash(np.random.uniform(1, 10, 10))
    second = generator.minhash(np.random.uniform(1, 10, 10))
    index.insert("a", first)
    index.insert("b", second)

    for table in index.hashtables:
        self.assertGreaterEqual(len(table), 1)
        stored = []
        for bucket in table:
            stored.extend(table[bucket])
        self.assertIn("a", stored)
        self.assertIn("b", stored)

    self.assertIn("a", index)
    self.assertIn("b", index)
    for position, bucket in enumerate(index.keys["a"]):
        self.assertIn("a", index.hashtables[position][bucket])

    mismatched = WeightedMinHashGenerator(10, 5).minhash(
        np.random.uniform(1, 10, 10)
    )
    with self.assertRaises(ValueError):
        index.insert("c", mismatched)
async def test__H_mongo(self):
    """Check hashtable key consistency for a Mongo-configured async index.

    For several random weighted MinHashes (sample size 128), a fresh
    ``AsyncMinHashLSH(num_perm=128)`` is filled with one entry, the
    ``keys()`` of all hashtables are gathered concurrently, and the
    ``len()`` of each gathered result is required to be the same.
    """
    generator = WeightedMinHashGenerator(100, sample_size=128)
    # The range only fixes the number of repetitions; its values are unused.
    for _ in range(2, generator.sample_size + 1, 16):
        weighted = generator.minhash(np.random.randint(1, 99999999, 100))
        async with AsyncMinHashLSH(
            storage_config=self._storage_config_mongo, num_perm=128
        ) as lsh:
            await lsh.insert("m", weighted)
            pending = [table.keys() for table in lsh.hashtables]
            keys_per_table = await asyncio.gather(*pending)
            # NOTE(review): len() is applied to each table's keys() result,
            # which presumably is the collection of keys (so this compares
            # key counts per table, not the byte length of each key) --
            # confirm this is the intended check.
            lengths = {len(table_keys) for table_keys in keys_per_table}
            self.assertTrue(len(lengths) <= 1)
async def test_pickle_mongo(self):
    """Check that a pickled async LSH index can be restored and queried.

    Two weighted MinHashes are inserted under "a" and "b" and the index
    is serialised with ``pickle.dumps``. The object returned by
    ``pickle.loads`` is then entered as an async context manager and
    must return each key for its own MinHash.
    """
    async with AsyncMinHashLSH(
        storage_config=self._storage_config_mongo, threshold=0.5, num_perm=4
    ) as lsh:
        generator = WeightedMinHashGenerator(10, 4)
        first = generator.minhash(np.random.uniform(1, 10, 10))
        second = generator.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", first)
        await lsh.insert("b", second)
        serialised = pickle.dumps(lsh)

    async with pickle.loads(serialised) as restored:
        self.assertIn("a", await restored.query(first))
        self.assertIn("b", await restored.query(second))
def test_remove(self):
    """Remove a key from a weighted LSH index and verify it is gone.

    After removing "a", the key must be absent from ``lsh.keys``, no
    hashtable bucket may be left empty, and no bucket may still contain
    "a". Removing a key that was never inserted must raise
    ``ValueError``.
    """
    index = WeightedMinHashLSH(threshold=0.5, sample_size=4)
    generator = WeightedMinHashGenerator(10, 4)
    first = generator.minhash(np.random.uniform(1, 10, 10))
    second = generator.minhash(np.random.uniform(1, 10, 10))
    index.insert("a", first)
    index.insert("b", second)

    index.remove("a")

    self.assertNotIn("a", index.keys)
    for table in index.hashtables:
        for bucket in table:
            self.assertGreater(len(table[bucket]), 0)
            self.assertNotIn("a", table[bucket])

    with self.assertRaises(ValueError):
        index.remove("c")
async def test_remove_mongo(self):
    """Remove a key from a Mongo-configured async LSH index.

    After removing "a", ``has_key`` must report it as missing, no
    hashtable bucket may be left empty, and no bucket may still contain
    "a". Removing a key that was never inserted must raise
    ``ValueError``.
    """
    async with AsyncMinHashLSH(
        storage_config=self._storage_config_mongo, threshold=0.5, num_perm=4
    ) as lsh:
        generator = WeightedMinHashGenerator(10, 4)
        first = generator.minhash(np.random.uniform(1, 10, 10))
        second = generator.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", first)
        await lsh.insert("b", second)

        await lsh.remove("a")

        self.assertFalse(await lsh.has_key("a"))
        for table in lsh.hashtables:
            for bucket in await table.keys():
                self.assertGreater(len(await table.get(bucket)), 0)
                self.assertNotIn("a", await table.get(bucket))

        with self.assertRaises(ValueError):
            await lsh.remove("c")
def test_pickle(self):
    """Check that a weighted MinHash is unchanged by a pickle round trip.

    The restored object must have the same ``seed`` and element-wise
    equal ``hashvalues`` as the original.
    """
    generator = WeightedMinHashGenerator(4, 10, 1)
    original = generator.minhash([1, 2, 3, 4])
    serialised = pickle.dumps(original)
    restored = pickle.loads(serialised)

    self.assertEqual(restored.seed, original.seed)
    same_values = np.array_equal(restored.hashvalues, original.hashvalues)
    self.assertTrue(same_values)
def test_pickle(self):
    """Check that a weighted MinHash is unchanged by a pickle round trip.

    The restored object must have the same ``seed`` and element-wise
    equal ``hashvalues`` as the original.
    """
    generator = WeightedMinHashGenerator(4, 10, 1)
    original = generator.minhash([1, 2, 3, 4])
    serialised = pickle.dumps(original)
    restored = pickle.loads(serialised)

    self.assertEqual(restored.seed, original.seed)
    same_values = np.array_equal(restored.hashvalues, original.hashvalues)
    self.assertTrue(same_values)