async def test_init_mongo(self):
    """Shifting weight toward recall must increase b and decrease r."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8) as lsh:
        # A freshly created index starts empty.
        self.assertTrue(await lsh.is_empty())
        b1, r1 = lsh.b, lsh.r
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8, weights=(0.2, 0.8)) as lsh:
        b2, r2 = lsh.b, lsh.r
    # More recall weight => more bands (b), fewer rows per band (r).
    self.assertTrue(b1 < b2)
    self.assertTrue(r1 > r2)
async def test_insert_redis(self):
    """Insert two keys into a Redis-backed LSH and verify the index state."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        # Every hash table must contain both keys; the Redis backend stores
        # keys pickled, so compare against pickle.dumps(...).
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            self.assertTrue(pickle.dumps("a") in stored)
            self.assertTrue(pickle.dumps("b") in stored)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        # The key index must point back into every hash table.
        for i, bucket in enumerate(await lsh.keys.get(pickle.dumps("a"))):
            res = await lsh.hashtables[i].get(bucket)
            self.assertTrue(pickle.dumps("a") in res)
        # A MinHash with a mismatched permutation count is rejected.
        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.insert("c", m3)
async def test_insert_mongo(self):
    """Insert several string keys into a Mongo-backed LSH and verify them."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        seq = ['aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow',
               'ppi', 'eer']
        # One minhash per word, built character by character.
        objs = [MinHash(16) for _ in range(len(seq))]
        for word, mh in zip(seq, objs):
            for ch in word:
                mh.update(ch.encode('utf-8'))
        data = [(word, mh) for word, mh in zip(seq, objs)]
        for key, minhash in data:
            await lsh.insert(key, minhash)
        # Mongo backend stores keys as plain strings.
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            self.assertTrue('aahh' in stored)
            self.assertTrue('bhg' in stored)
        self.assertTrue(await lsh.has_key('aahh'))
        self.assertTrue(await lsh.has_key('bhg'))
        for i, bucket in enumerate(await lsh.keys.get('aahhb')):
            self.assertTrue('aahhb' in await lsh.hashtables[i].get(bucket))
        # Mismatched permutation count is rejected.
        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.insert("c", m3)
async def test_remove_mongo(self):
    """remove() must scrub a key everywhere while keeping other keys intact."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        # 'a1' uses the same content as 'a', so they share buckets.
        m3 = MinHash(16)
        m3.update("a".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        await lsh.insert("a1", m3)
        await lsh.remove("a")
        self.assertTrue(not await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key('a1'))
        # 'a' must be gone from every bucket, but 'a1' must survive somewhere.
        found_a1 = False
        for ht in lsh.hashtables:
            for bucket in await ht.keys():
                vals = await ht.get(bucket)
                self.assertGreater(len(vals), 0)
                self.assertTrue("a" not in vals)
                if 'a1' in vals:
                    found_a1 = True
        self.assertTrue(found_a1, 'Hashtable broken')
        # Removing an unknown key raises.
        with self.assertRaises(ValueError):
            await lsh.remove("c")
async def test_insertion_session_mongo(self):
    """A batched insertion session must index every generated key."""
    def chunk(it, size):
        # Yield successive `size`-tuples from `it` until exhausted.
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())

    _chunked_str = chunk((random.choice(string.ascii_lowercase)
                          for _ in range(10000)), 4)
    # Random 4-letter words plus a few known sentinels.
    seq = frozenset(chain((''.join(s) for s in _chunked_str),
                          ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
                           'kkd', 'yow', 'ppi', 'eer')))
    objs = [MinHash(16) for _ in range(len(seq))]
    for word, mh in zip(seq, objs):
        for ch in word:
            mh.update(ch.encode('utf-8'))
    data = [(word, mh) for word, mh in zip(seq, objs)]
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            coros = (session.insert(key, minhash, check_duplication=False)
                     for key, minhash in data)
            await asyncio.gather(*coros)
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            self.assertTrue('aahhb' in stored)
            self.assertTrue('kld' in stored)
        self.assertTrue(await lsh.has_key('aahhb'))
        self.assertTrue(await lsh.has_key('kld'))
        for i, bucket in enumerate(await lsh.keys.get('aahh')):
            self.assertTrue('aahh' in await lsh.hashtables[i].get(bucket))
async def test_insert_mongo(self):
    """Weighted-minhash inserts into a Mongo-backed LSH are indexed fully."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        gen = WeightedMinHashGenerator(10, 4)
        wm_a = gen.minhash(np.random.uniform(1, 10, 10))
        wm_b = gen.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", wm_a)
        await lsh.insert("b", wm_b)
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            self.assertTrue("a" in stored)
            self.assertTrue("b" in stored)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        for i, bucket in enumerate(await lsh.keys.get("a")):
            self.assertTrue("a" in await lsh.hashtables[i].get(bucket))
        # A generator with a different sample size must be rejected.
        bad_gen = WeightedMinHashGenerator(10, 5)
        wm_c = bad_gen.minhash(np.random.uniform(1, 10, 10))
        with self.assertRaises(ValueError):
            await lsh.insert("c", wm_c)
async def queries():
    """Stream worker-computed (key, minhash) pairs into an insertion session.

    A multiprocessing pool materializes minhashes in batches of 1000 while the
    previous batch is being inserted, overlapping CPU work with storage I/O.
    """
    storage = {
        'type': 'aiomongo',
        'basename': 'k'.encode('utf8'),
        'mongo': {'host': 'localhost', 'port': 27017, 'db': 'lsh'},
    }
    async with AsyncMinHashLSH(threshold=0.01, num_perm=256,
                               storage_config=storage) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            pool = Pool(6)
            try:
                keys = next_key()
                # pool.map already returns a list; no copy needed.
                batch = pool.map(next_real_key, itertools.islice(keys, 1000))
                while batch:
                    # Generator binds the current `batch` list eagerly, so
                    # fetching the next batch below does not affect it.
                    fs = (session.insert(key, minhash, check_duplication=False)
                          for key, minhash in batch)
                    batch = pool.map(next_real_key,
                                     itertools.islice(keys, 1000))
                    await asyncio.gather(*fs)
            finally:
                # Fix: the original leaked the pool when an insert raised;
                # close/join now run on every exit path.
                pool.close()
                pool.join()
async def test_remove_session_mongo(self):
    """A batched delete session removes exactly the requested keys."""
    def chunk(it, size):
        # Yield successive `size`-tuples from `it` until exhausted.
        it = iter(it)
        return iter(lambda: tuple(islice(it, size)), ())

    _chunked_str = chunk((random.choice(string.ascii_lowercase)
                          for _ in range(10000)), 4)
    seq = frozenset(chain((''.join(s) for s in _chunked_str),
                          ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg',
                           'kkd', 'yow', 'ppi', 'eer')))
    objs = [MinHash(16) for _ in range(len(seq))]
    for word, mh in zip(seq, objs):
        for ch in word:
            mh.update(ch.encode('utf-8'))
    data = [(word, mh) for word, mh in zip(seq, objs)]
    keys_to_remove = ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd',
                      'yow', 'ppi', 'eer')
    keys_left = frozenset(seq) - frozenset(keys_to_remove)
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            coros = (session.insert(key, minhash, check_duplication=False)
                     for key, minhash in data)
            await asyncio.gather(*coros)
        async with lsh.delete_session(batch_size=3) as session:
            coros = (session.remove(key) for key in keys_to_remove)
            await asyncio.gather(*coros)
        # Removed keys are gone from every bucket; the rest survive.
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            for key in keys_to_remove:
                self.assertTrue(
                    key not in stored,
                    '{0} in items, but should not be'.format(key))
            for key in keys_left:
                self.assertTrue(
                    key in stored,
                    '{0} not in items, but should be'.format(key))
        for key in keys_to_remove:
            self.assertTrue(
                not (await lsh.has_key(key)),
                '<{0}> key should not be in LSH index'.format(key))
        for key in keys_left:
            self.assertTrue(await lsh.has_key(key),
                            '<{0}> key should be in LSH index'.format(key))
async def test_get_counts_mongo(self):
    """get_counts() returns one counter per band, each summing to key count."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        counts = await lsh.get_counts()
        # One counter per band; two keys were inserted in total.
        self.assertEqual(len(counts), lsh.b)
        for band_counts in counts:
            self.assertEqual(sum(band_counts.values()), 2)
async def test__H_mongo(self):
    """
    Check _H output consistent bytes length given the same concatenated
    hash value size.
    """
    mg = WeightedMinHashGenerator(100, sample_size=128)
    # Fix: the original loop variable `l` was ambiguous (PEP 8 E741) and
    # never used — the loop just repeats the check with fresh random input.
    for _ in range(2, mg.sample_size + 1, 16):
        m = mg.minhash(np.random.randint(1, 99999999, 100))
        # Fresh index per iteration, so re-inserting key "m" never collides.
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   num_perm=128) as lsh:
            await lsh.insert("m", m)
            fs = (ht.keys() for ht in lsh.hashtables)
            hashtables = await asyncio.gather(*fs)
            # All per-table key lists must have the same length.
            sizes = [len(H) for H in hashtables]
            self.assertTrue(all(sizes[0] == s for s in sizes))
async def test_pickle_mongo(self):
    """A pickling round-trip must preserve weighted-minhash query results."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        gen = WeightedMinHashGenerator(10, 4)
        wm_a = gen.minhash(np.random.uniform(1, 10, 10))
        wm_b = gen.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", wm_a)
        await lsh.insert("b", wm_b)
        # Restore from pickle and query through the restored instance.
        restored = pickle.loads(pickle.dumps(lsh))
        async with restored as lsh2:
            self.assertTrue("a" in await lsh2.query(wm_a))
            self.assertTrue("b" in await lsh2.query(wm_b))
async def run_async_test(data: list, batch_size: int):
    """Exercise sync-Redis-via-executor and native aioredis code paths.

    Runs the same data through (1) a synchronous MinHashLSH driven from a
    thread pool, (2) plain aioredis inserts, and (3) a batched aioredis
    insertion session, then flushes the Redis test database.
    """
    # Fix: the original never shut the executor down, leaking its threads;
    # the context manager guarantees shutdown on every exit path.
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        lsh = MinHashLSH(storage_config=syncSTORAGE_CONFIG_REDIS,
                         threshold=0.5, num_perm=16)
        await aioinsert_syncredis_with_executor(lsh, data, executor)
        await aioquery_syncredis(lsh, data, executor)
    async with AsyncMinHashLSH(storage_config=aioSTORAGE_CONFIG_REDIS,
                               threshold=0.5, num_perm=16) as lsh2:
        await insert_aioredis(lsh2, data)
    async with AsyncMinHashLSH(storage_config=aioSTORAGE_CONFIG_REDIS,
                               threshold=0.5, num_perm=16) as lsh3:
        await insertion_session_aioredis(lsh3, data, batch_size)
        await query_aioredis(lsh3, data)
    # Clean up the shared Redis database used by all three runs.
    dsn = 'redis://{host}:{port}'.format(**aioSTORAGE_CONFIG_REDIS['redis'])
    redis = await aioredis.create_redis(dsn)
    await redis.flushdb()
    redis.close()
    await redis.wait_closed()
async def test_pickle_mongo(self):
    """An LSH pickled and restored must answer queries like the original."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        pickled = pickle.dumps(lsh)
        # The async-with closes lsh2 on exit; the original's trailing
        # ``await lsh2.close()`` was a redundant double-close (the sibling
        # pickle test has no such call) and has been removed.
        async with pickle.loads(pickled) as lsh2:
            result = await lsh2.query(m1)
            self.assertTrue("a" in result)
            result = await lsh2.query(m2)
            self.assertTrue("b" in result)
async def test__H_redis(self):
    """
    Check _H output consistent bytes length given the same concatenated
    hash value size.
    """
    for _ in range(2, 128 + 1, 16):
        m = MinHash()
        m.update("abcdefg".encode("utf8"))
        m.update("1234567".encode("utf8"))
        # New index each iteration, so inserting key "m" never collides.
        async with AsyncMinHashLSH(
                storage_config=self._storage_config_redis,
                num_perm=128) as lsh:
            await lsh.insert("m", m)
            # Collect the byte length of every stored hash key and check
            # that they are all identical.
            lengths = []
            for ht in lsh.hashtables:
                for bucket in await ht.keys():
                    lengths.append(len(bucket))
            self.assertTrue(all(lengths[0] == s for s in lengths))
async def test_query_mongo(self):
    """Querying with an inserted weighted minhash must return its key."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        gen = WeightedMinHashGenerator(10, 4)
        wm_a = gen.minhash(np.random.uniform(1, 10, 10))
        wm_b = gen.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", wm_a)
        await lsh.insert("b", wm_b)
        self.assertTrue("a" in await lsh.query(wm_a))
        self.assertTrue("b" in await lsh.query(wm_b))
        # A minhash with the wrong sample size must be rejected.
        bad_gen = WeightedMinHashGenerator(10, 5)
        wm_c = bad_gen.minhash(np.random.uniform(1, 10, 10))
        with self.assertRaises(ValueError):
            await lsh.query(wm_c)
async def test_arbitrary_collection(self):
    """A user-supplied collection_name must be honored by the Mongo backend."""
    self._storage_config_mongo["mongo"][
        "collection_name"] = "unit_test_collection"
    try:
        async with AsyncMinHashLSH(
                storage_config=self._storage_config_mongo,
                threshold=0.5, num_perm=16) as lsh:
            m1 = MinHash(16)
            m1.update(b"a")
            await lsh.insert("a", m1)
        # Verify directly through motor that the named collection was used.
        dsn = MONGO_URL or "mongodb://{host}:{port}/{db}".format(
            **self._storage_config_mongo["mongo"])
        collection = AsyncIOMotorClient(dsn).get_default_database(
            "lsh_test").get_collection("unit_test_collection")
        count = await collection.count_documents({})
        self.assertGreaterEqual(count, 1)
    finally:
        # Fix: always restore the shared config; the original skipped this
        # cleanup when an assertion failed, polluting later tests.
        del self._storage_config_mongo["mongo"]["collection_name"]
async def test_remove_mongo(self):
    """remove() scrubs a weighted-minhash key; unknown keys raise."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        gen = WeightedMinHashGenerator(10, 4)
        wm_a = gen.minhash(np.random.uniform(1, 10, 10))
        wm_b = gen.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", wm_a)
        await lsh.insert("b", wm_b)
        await lsh.remove("a")
        self.assertTrue(not await lsh.has_key("a"))
        # 'a' must be gone from every remaining bucket.
        for ht in lsh.hashtables:
            for bucket in await ht.keys():
                vals = await ht.get(bucket)
                self.assertGreater(len(vals), 0)
                self.assertTrue("a" not in vals)
        with self.assertRaises(ValueError):
            await lsh.remove("c")
async def test_arbitrary_url(self):
    """A full MongoDB URL in storage_config must be accepted and used."""
    config = {
        "type": "aiomongo",
        "mongo": {"url": MONGO_URL or "mongodb://localhost/lsh_test"}
    }
    async with AsyncMinHashLSH(storage_config=config, threshold=0.5,
                               num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update(b"a")
        await lsh.insert("a", m1)
    database = AsyncIOMotorClient(
        config["mongo"]["url"]).get_default_database("lsh_test")
    try:
        # The insert must have created at least one collection.
        collection_names = await database.list_collection_names()
        self.assertGreater(len(collection_names), 0)
    finally:
        # Fix: drop the test database even when the assertion fails, so
        # reruns start clean (the original leaked it on failure).
        await database.client.drop_database(database.name)
async def test_query_mongo(self):
    """Concurrent duplicate-tolerant inserts, then query; bad sizes raise."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        m3 = MinHash(16)
        m3.update("b".encode("utf8"))
        # Insert concurrently, including a duplicate key for "b".
        await asyncio.gather(
            lsh.insert("a", m1, check_duplication=False),
            lsh.insert("b", m2, check_duplication=False),
            lsh.insert("b", m3, check_duplication=False))
        self.assertTrue("a" in await lsh.query(m1))
        self.assertTrue("b" in await lsh.query(m2))
        # Querying with a mismatched permutation count is rejected.
        m3 = MinHash(18)
        with self.assertRaises(ValueError):
            await lsh.query(m3)
async def test_remove_mongo(self):
    """remove() must scrub a key from the key index and every hash table."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        await lsh.remove("a")
        self.assertTrue(not await lsh.has_key("a"))
        # 'a' must be absent from every remaining bucket.
        for ht in lsh.hashtables:
            for bucket in await ht.keys():
                vals = await ht.get(bucket)
                self.assertGreater(len(vals), 0)
                self.assertTrue("a" not in vals)
        # Removing an unknown key raises.
        with self.assertRaises(ValueError):
            await lsh.remove("c")
async def func():
    """Query an aiomongo-backed LSH with minhashes built from a CSV file."""
    storage = {
        'type': 'aiomongo',
        'basename': 'k'.encode('utf8'),
        'mongo': {'host': 'localhost', 'port': 27017, 'db': 'lsh'},
    }
    async with AsyncMinHashLSH(threshold=0.3, num_perm=256,
                               storage_config=storage) as lsh:
        # Fix: built-in open() with an explicit encoding replaces the legacy
        # codecs.open() call (the codecs docs recommend open() for text I/O).
        with open('dataset.csv', 'r', encoding='utf-8') as file:
            for line in file:
                record = line.split(',', 1)
                # NOTE(review): assumes every line contains a comma —
                # record[1] raises IndexError otherwise; confirm data format.
                test = record[0].lower().split()
                mh = MinHash(num_perm=256)
                for d in test:
                    mh.update(d.encode('utf8'))
                result = await lsh.query(mh)
                print(record[0], record[1], result)
async def test_insertion_session_redis(self):
    """Sequential insertion session on Redis; keys are stored pickled."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_redis,
                               threshold=0.5, num_perm=16) as lsh:
        m1 = MinHash(16)
        m1.update("a".encode("utf8"))
        m2 = MinHash(16)
        m2.update("b".encode("utf8"))
        data = [("a", m1), ("b", m2)]
        async with lsh.insertion_session() as session:
            for key, minhash in data:
                await session.insert(key, minhash)
        # Redis backend stores keys pickled.
        for ht in lsh.hashtables:
            self.assertTrue(await ht.size() >= 1)
            stored = []
            for bucket in await ht.keys():
                stored.extend(await ht.get(bucket))
            self.assertTrue(pickle.dumps("a") in stored)
            self.assertTrue(pickle.dumps("b") in stored)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        for i, bucket in enumerate(await lsh.keys.get(pickle.dumps("a"))):
            res = await lsh.hashtables[i].get(bucket)
            self.assertTrue(pickle.dumps("a") in res)
async def insert_aioredis(aiolsh: AsyncMinHashLSH, data: list):
    """Concurrently insert every (key, minhash) pair in *data* into *aiolsh*."""
    await asyncio.gather(
        *(aiolsh.insert(key, minhash, check_duplication=False)
          for key, minhash in data))
async def query_aioredis(aiolsh: AsyncMinHashLSH, data: list):
    """Concurrently query *aiolsh* with every minhash in *data*.

    Returns the list of per-minhash query results, in input order.
    """
    # Fix: the key half of each pair was bound to an unused variable;
    # the underscore makes that explicit.
    fs = (aiolsh.query(minhash) for _, minhash in data)
    return await asyncio.gather(*fs)
async def insertion_session_aioredis(aiolsh: AsyncMinHashLSH, data: list,
                                     batch_size: int):
    """Insert every (key, minhash) pair through a batched insertion session."""
    async with aiolsh.insertion_session(batch_size=batch_size) as session:
        coros = [session.insert(key, minhash, check_duplication=False)
                 for key, minhash in data]
        await asyncio.gather(*coros)