예제 #1
0
def deserialize_minhash(column):
    """
    Load the LeanMinHash stored in the binary minhash file of the given column.

    The file is looked up under
    ``$WORKING_DIRECTORY/results/minhashes/<table>.<column>.txt``. When no such
    file exists, ``serialize_min_hash`` is called for this column before the
    file is opened (presumably it creates the file -- verify against its
    definition).
    @param column: mapping providing the "table" and "column" keys used to
        build the file name
    @return: the LeanMinHash deserialized from the raw bytes of the file
    """
    minhash_path = f'{os.environ["WORKING_DIRECTORY"]}/results/minhashes/{column["table"]}.{column["column"]}.txt'

    already_serialized = os.path.isfile(minhash_path)
    if not already_serialized:
        serialize_min_hash([column])

    with open(minhash_path, 'rb') as handle:
        raw_bytes = bytearray(handle.read())
    return LeanMinHash.deserialize(raw_bytes)
예제 #2
0
    def test_deserialize(self):
        """Round-trip a LeanMinHash through serialize/deserialize and compare."""
        source = MinHash(10, 1, hashfunc=fake_hash_func)
        source.update(123)
        original = LeanMinHash(source)

        # Serialize into a buffer sized exactly as the object reports.
        payload = bytearray(original.bytesize())
        original.serialize(payload)

        # The deserialized object must carry the same seed and the same
        # hash values, element by element, as the one that was serialized.
        restored = LeanMinHash.deserialize(payload)
        self.assertEqual(restored.seed, original.seed)
        self.assertEqual(len(restored.hashvalues), len(original.hashvalues))
        value_pairs = zip(original.hashvalues, restored.hashvalues)
        self.assertTrue(
            all(restored_value == original_value
                for original_value, restored_value in value_pairs))
예제 #3
0
    def test_deserialize_byteorder(self):
        """Round-trip a LeanMinHash for every byte-order code in "@=<>!"."""
        for order_code in ("@", "=", "<", ">", "!"):
            source = MinHash(10, 1, hashobj=FakeHash)
            source.update(123)
            original = LeanMinHash(source)

            # Buffer size and serialization both use the same byte-order code.
            payload = bytearray(original.bytesize(order_code))
            original.serialize(payload, order_code)

            # Deserialize with the matching code; the result must have the
            # same seed and element-wise identical hash values.
            restored = LeanMinHash.deserialize(payload, order_code)
            restored.hashobj = FakeHash
            self.assertEqual(restored.seed, original.seed)
            self.assertEqual(
                len(restored.hashvalues), len(original.hashvalues))
            value_pairs = zip(original.hashvalues, restored.hashvalues)
            self.assertTrue(
                all(restored_value == original_value
                    for original_value, restored_value in value_pairs))