예제 #1
0
    def test_deserialize(self):
        """Round-trip a LeanMinHash through serialize() and deserialize().

        The object rebuilt from the byte buffer must carry the same seed and
        the same hash values as the object that was serialized.
        """
        source = MinHash(10, 1, hashfunc=fake_hash_func)
        source.update(123)
        lm1 = LeanMinHash(source)

        # Allocate exactly as many bytes as the object reports it needs.
        buf = bytearray(lm1.bytesize())
        lm1.serialize(buf)

        # Rebuild from the raw bytes and compare against the original.
        lm1d = LeanMinHash.deserialize(buf)
        self.assertEqual(lm1d.seed, lm1.seed)
        self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
        matches = [
            hvd == hv for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)
        ]
        self.assertTrue(all(matches))
예제 #2
0
    def test_deserialize_byteorder(self):
        """Round-trip a LeanMinHash under each byte-order flag in "@=<>!".

        For every flag, the buffer is sized, written and read back using that
        same flag, and the rebuilt object must match the original's seed and
        hash values.
        """
        for byteorder in "@=<>!":
            source = MinHash(10, 1, hashobj=FakeHash)
            source.update(123)
            lm1 = LeanMinHash(source)

            # Size and fill the buffer with the byte order under test.
            buf = bytearray(lm1.bytesize(byteorder))
            lm1.serialize(buf, byteorder)

            # Rebuild from the raw bytes and compare against the original.
            lm1d = LeanMinHash.deserialize(buf, byteorder)
            lm1d.hashobj = FakeHash
            self.assertEqual(lm1d.seed, lm1.seed)
            self.assertEqual(len(lm1d.hashvalues), len(lm1.hashvalues))
            matches = [
                hvd == hv
                for hv, hvd in zip(lm1.hashvalues, lm1d.hashvalues)
            ]
            self.assertTrue(all(matches))
예제 #3
0
def serialize_min_hash(columns, override=False):
    """
    Compute a MinHash for each column and write it, serialized, to a local file.

    For every column the distinct values are fetched, tokenized and fed into a
    MinHash; the result is converted to a LeanMinHash and its serialized bytes
    are written to
    ``$WORKING_DIRECTORY/results/minhashes/<table>.<column>.txt``.
    The target directory is created if it does not exist yet.

    @param columns: iterable of mappings, each providing a "table" and a
        "column" key
    @param override: when False (default), a column whose output file already
        exists is skipped; when True it is recomputed and the file overwritten
    @return: None
    @raise KeyError: if the WORKING_DIRECTORY environment variable is not set
        (only looked up when there is at least one column to process)
    """
    for column in columns:
        output_file = f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/{column["table"]}.{column["column"]}.txt'
        if os.path.isfile(output_file) and not override:
            continue

        # Make sure the destination exists *before* the expensive query and
        # hashing, so a missing directory cannot throw that work away.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # NOTE(review): the whole column mapping is passed as the second
        # argument (not column["column"]) -- confirm this is what the helper
        # expects.
        values = queryDatabase.get_distnct_column_values(column['table'], column)

        minhash = MinHash(num_perm=NUM_PERM)
        for token in tokenize(values):
            minhash.update(token.encode('utf8'))

        lean_min_hash = LeanMinHash(minhash)
        buf = bytearray(lean_min_hash.bytesize())
        lean_min_hash.serialize(buf)

        with open(output_file, 'wb') as file:
            file.write(buf)
        # Report success only after the file has been closed (and flushed).
        print(f'Serialization is complete for {column["table"]}.{column["column"]}.')
예제 #4
0
    def test_serialize(self):
        """Smoke-test LeanMinHash.serialize().

        No values are asserted; the test passes when serialize() accepts a
        buffer of exactly bytesize() bytes, and when two objects can be
        written back to back into a single buffer twice that size.
        """
        m1 = MinHash(2, 1, hashfunc=fake_hash_func)
        lm1 = LeanMinHash(m1)
        buf = bytearray(lm1.bytesize())
        # Only test for syntax
        lm1.serialize(buf)

        m2 = MinHash(2, 1, hashfunc=fake_hash_func)
        lm2 = LeanMinHash(m2)
        size = lm1.bytesize()
        buf = bytearray(size * 2)
        lm1.serialize(buf)
        # Slicing a bytearray returns a copy, so buf[size:] would make lm2
        # write into a throwaway object. A memoryview slice is a writable
        # zero-copy view, so lm2 really lands in the second half of buf.
        lm2.serialize(memoryview(buf)[size:])