def test_serialize(self):
    """Serialize LeanMinHash objects into a dedicated and a shared buffer.

    The first half only checks that ``serialize`` accepts a buffer sized
    with ``bytesize()``.  The second half packs two objects back to back
    into one buffer and checks that both halves were actually written.
    """
    m1 = MinHash(2, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    buf = bytearray(lm1.bytesize())
    # Only test for syntax
    lm1.serialize(buf)

    m2 = MinHash(2, 1, hashfunc=fake_hash_func)
    lm2 = LeanMinHash(m2)
    size = lm1.bytesize()
    buf = bytearray(size * 2)
    lm1.serialize(buf)
    # Slicing a bytearray produces a copy, so ``buf[size:]`` would take the
    # write and then be discarded.  A memoryview slice aliases ``buf``, so
    # the second object really lands in the upper half of the buffer.
    lm2.serialize(memoryview(buf)[size:])
    # m1 and m2 are built from the same arguments and never updated, so
    # both halves are expected to hold identical bytes.
    # NOTE(review): assumes serialize() is deterministic and fills exactly
    # bytesize() bytes -- confirm against LeanMinHash.
    self.assertEqual(buf[:size], buf[size:])
def test_deserialize(self):
    """Round-trip a LeanMinHash through ``serialize`` and ``deserialize``."""
    source = MinHash(10, 1, hashfunc=fake_hash_func)
    source.update(123)
    original = LeanMinHash(source)
    payload = bytearray(original.bytesize())
    original.serialize(payload)

    # The object rebuilt from the raw bytes must carry the same seed and
    # the same hash values as the one that was serialized.
    restored = LeanMinHash.deserialize(payload)
    self.assertEqual(restored.seed, original.seed)
    self.assertEqual(len(restored.hashvalues), len(original.hashvalues))
    value_pairs = zip(original.hashvalues, restored.hashvalues)
    self.assertTrue(all(after == before for before, after in value_pairs))
def test_deserialize_byteorder(self):
    """Round-trip a LeanMinHash once per byte-order flag in ``"@=<>!"``."""
    for byteorder in "@=<>!":
        source = MinHash(10, 1, hashobj=FakeHash)
        source.update(123)
        original = LeanMinHash(source)
        payload = bytearray(original.bytesize(byteorder))
        original.serialize(payload, byteorder)

        # Rebuild from the raw bytes using the same flag that was used to
        # write them, then compare against the serialized object.
        restored = LeanMinHash.deserialize(payload, byteorder)
        # NOTE(review): presumably deserialize() does not restore the hash
        # object, hence the manual re-attachment -- confirm.
        restored.hashobj = FakeHash
        self.assertEqual(restored.seed, original.seed)
        self.assertEqual(len(restored.hashvalues), len(original.hashvalues))
        value_pairs = zip(original.hashvalues, restored.hashvalues)
        self.assertTrue(
            all(after == before for before, after in value_pairs))
def serialize_min_hash(columns, override=False):
    """
    Serializes one LeanMinHash per column into a local binary file

    For each entry the values obtained from
    ``queryDatabase.get_distnct_column_values`` are passed through
    ``tokenize``; every token is UTF-8 encoded and fed into a MinHash built
    with ``NUM_PERM`` permutations. The lean form of that MinHash is written
    as raw bytes under ``$WORKING_DIRECTORY/results/minhashes/``.

    @param columns: iterable of mappings providing the "table" and "column" keys
    @param override: when falsy, entries whose output file already exists are skipped
    @return: None
    """
    for column in columns:
        target_path = f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/{column["table"]}.{column["column"]}.txt'
        already_written = os.path.isfile(target_path)
        if already_written and not override:
            continue

        # The whole column mapping is handed to the query helper as its
        # second argument, alongside the table name.
        distinct_values = queryDatabase.get_distnct_column_values(column['table'], column)
        sketch = MinHash(num_perm=NUM_PERM)
        for token in tokenize(distinct_values):
            sketch.update(token.encode('utf8'))

        # Convert to the lean representation and dump it into a buffer of
        # exactly the size it reports.
        lean_sketch = LeanMinHash(sketch)
        payload = bytearray(lean_sketch.bytesize())
        lean_sketch.serialize(payload)

        with open(target_path, 'wb') as sink:
            sink.write(payload)
        print(f'Serialization is complete for {column["table"]}.{column["column"]}.')
def test_bytesize(self):
    """Check the byte size reported by a LeanMinHash built from MinHash(4, 1)."""
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    # assertEqual reports both values on failure, whereas
    # assertTrue(a == b) only says "False is not true".
    # NOTE(review): presumably four 4-byte hash values + a 4-byte count +
    # an 8-byte seed -- confirm against LeanMinHash.bytesize.
    self.assertEqual(lm1.bytesize(), (4 * 4) + 4 + 8)
def test_bytesize(self):
    """Check the byte size reported by a LeanMinHash built from MinHash(4, 1)."""
    m1 = MinHash(4, 1, hashobj=FakeHash)
    lm1 = LeanMinHash(m1)
    # assertEqual reports both values on failure, whereas
    # assertTrue(a == b) only says "False is not true".
    # NOTE(review): presumably four 4-byte hash values + a 4-byte count +
    # an 8-byte seed -- confirm against LeanMinHash.bytesize.
    self.assertEqual(lm1.bytesize(), (4 * 4) + 4 + 8)