def pack_payload_pandas(partition: pd.DataFrame, group_key: List[str]) -> pd.DataFrame:
    """Pack the rows of every group in ``partition`` into one serialized payload.

    The frame is grouped by ``group_key`` and each group's sub-frame is passed
    through ``distributed.protocol.serialize_bytes``. The result holds the
    group key columns plus one ``_PAYLOAD_COL`` column with the serialized
    bytes, one row per observed group.

    Parameters
    ----------
    partition
        The dataframe to pack.
    group_key
        Names of the columns to group by.

    Returns
    -------
    pd.DataFrame
        The packed frame. If ``distributed`` cannot be imported a warning is
        logged and ``partition`` is returned unchanged.
    """
    try:
        # Technically distributed is an optional dependency
        from distributed.protocol import serialize_bytes
    except ImportError:
        _logger.warning(
            "Shuffle payload columns cannot be compressed since distributed is not installed."
        )
        return partition

    if partition.empty:
        # Take an explicit copy before adding the payload column: assigning to
        # a frame derived from ``partition`` by indexing would otherwise
        # trigger pandas' SettingWithCopyWarning (or raise when
        # ``mode.chained_assignment`` is set to "raise").
        res = partition[group_key].copy()
        res[_PAYLOAD_COL] = b""
    else:
        res = partition.groupby(
            group_key,
            sort=False,
            observed=True,
            # Keep the as_index s.t. the group values are not dropped. With this
            # the behaviour seems to be consistent along pandas versions
            as_index=True,
        ).apply(lambda x: pd.Series({_PAYLOAD_COL: serialize_bytes(x)}))

        # Turn the group values from the index back into regular columns.
        res = res.reset_index()
    return res
def test_serialize_bytes(kwargs):
    """Round-trip small and very large objects through (de)serialize_bytes.

    ``kwargs`` is forwarded verbatim to ``serialize_bytes``.
    """
    half_blob = int(2**25) * b"ab"
    samples = (
        1,
        "abc",
        np.arange(5),
        b"ab" * int(40e6),
        int(2**26) * b"ab",
        # A tuple of two large byte strings
        (half_blob, int(2**25) * b"ab"),
    )
    for original in samples:
        payload = serialize_bytes(original, **kwargs)
        assert isinstance(payload, bytes)
        restored = deserialize_bytes(payload)
        # Compare through ``str`` so that numpy arrays can be checked as well
        assert str(original) == str(restored)
def test_serialize_bytes():
    """Check that serialize_bytes yields bytes that deserialize_bytes can restore."""
    candidates = (1, 'abc', np.arange(5))
    for value in candidates:
        blob = serialize_bytes(value)
        assert isinstance(blob, bytes)
        # The string forms are compared, which also works for numpy arrays
        round_tripped = deserialize_bytes(blob)
        assert str(value) == str(round_tripped)
def test_serialize_bytes():
    """Round-trip a few objects, including one large byte string, through bytes."""
    large_blob = b"ab" * int(40e6)
    inputs = [1, "abc", np.arange(5), large_blob]
    for obj in inputs:
        encoded = serialize_bytes(obj)
        assert isinstance(encoded, bytes)
        decoded = deserialize_bytes(encoded)
        # String comparison keeps the check uniform across ints, str,
        # arrays and bytes
        assert str(obj) == str(decoded)
def _serialize_if_device(obj):
    """Return ``obj`` run through ``serialize_bytes`` when it is a device object.

    Any other object is handed back untouched. Serialization is requested
    with ``on_error="raise"``.
    """
    if not is_device_object(obj):
        return obj
    return serialize_bytes(obj, on_error="raise")