import pickle
from typing import Any, Optional

import pyarrow as pa
import pyarrow.plasma as plasma  # legacy module; removed in recent pyarrow

# `Config`, `Serialization`, `ObjectNotFoundError` and `PAGE_SIZE` are
# assumed to be defined elsewhere in this module.


def _deserialize_output_memory(obj_id: plasma.ObjectID,
                               client: plasma.PlasmaClient) -> Any:
    """Gets data from memory.

    Args:
        obj_id: The ID of the object to retrieve from the plasma store.
        client: A PlasmaClient to interface with the in-memory object
            store.

    Returns:
        The deserialized data from the store corresponding to the
        `obj_id`.

    Raises:
        ObjectNotFoundError: If the specified `obj_id` is not in the
            store.
        ValueError: If the serialization type in the metadata is not
            valid.
    """
    obj_ids = [obj_id]

    # TODO: `get_buffers` allows for batching, which we want to use in
    #       the future.
    buffers = client.get_buffers(obj_ids, with_meta=True, timeout_ms=1000)

    # Since we currently know that we are only retrieving one buffer,
    # we can immediately unpack its metadata and buffer.
    metadata, buffer = buffers[0]

    # Getting the buffer timed out. We conclude that the object has not
    # yet been written to the store and maybe never will be.
    if metadata is None and buffer is None:
        raise ObjectNotFoundError(
            f'Object with ObjectID "{obj_id}" does not exist in store.')

    metadata = metadata.decode("utf-8").split(Config.__METADATA_SEPARATOR__)
    _, _, serialization, _ = metadata

    if serialization == Serialization.ARROW_TABLE.name:
        # Read all batches as a table.
        stream = pa.ipc.open_stream(buffer)
        return stream.read_all()
    elif serialization == Serialization.ARROW_BATCH.name:
        # Return the first batch (the only one).
        stream = pa.ipc.open_stream(buffer)
        return [b for b in stream][0]
    elif serialization == Serialization.PICKLE.name:
        # The buffer can be loaded directly because it is a bytes-like
        # object:
        # https://docs.python.org/3/library/pickle.html#pickle.loads
        return pickle.loads(buffer)
    else:
        raise ValueError(
            "Object was serialized with an unsupported serialization.")


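# Illustrative usage sketch (not part of the helpers above): connect to a
# running plasma store and read one object back. The socket path
# "/tmp/plasma" and the fixed 20-byte ObjectID are assumptions made for
# the example only; in practice the ID comes from whoever wrote the
# object.
def _example_read_one(socket_path: str = "/tmp/plasma") -> Any:
    client = plasma.connect(socket_path)
    try:
        obj_id = plasma.ObjectID(b"a" * 20)  # hypothetical ID
        return _deserialize_output_memory(obj_id, client)
    finally:
        client.disconnect()

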
def get_actual_capacity(plasma_client: plasma.PlasmaClient) -> int:
    """
    Get the actual capacity of the plasma store.

    Parameters
    ----------
    plasma_client: PlasmaClient
        Plasma client.

    Returns
    -------
    size: int
        Actual storage size in bytes.
    """
    store_limit = plasma_client.store_capacity()

    left_size = store_limit
    alloc_fraction = 1
    while True:
        # Round the attempted allocation down to a whole number of pages.
        allocate_size = int(left_size * alloc_fraction / PAGE_SIZE) * PAGE_SIZE
        try:
            obj_id = plasma.ObjectID.from_random()
            buf = [plasma_client.create(obj_id, allocate_size)]
            plasma_client.seal(obj_id)
            del buf[:]
            break
        except plasma.PlasmaStoreFull:  # pragma: no cover
            # The allocation was too large; retry with a slightly
            # smaller fraction of the reported capacity.
            alloc_fraction *= 0.99
        finally:
            # Evict the probe allocation so the store is left empty
            # again.
            plasma_client.evict(allocate_size)
    return allocate_size


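# Illustrative sketch of how `get_actual_capacity` could be used: the
# usable capacity is typically somewhat smaller than the size the store
# was started with (e.g. via `plasma_store -m <bytes>`) because of
# allocator overhead. The socket path is an assumption for the example.
def _example_probe_capacity(socket_path: str = "/tmp/plasma") -> int:
    client = plasma.connect(socket_path)
    try:
        configured = client.store_capacity()
        usable = get_actual_capacity(client)
        assert usable <= configured
        return usable
    finally:
        client.disconnect()

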
def _get_output_memory(obj_id: plasma.ObjectID,
                       client: plasma.PlasmaClient) -> Any:
    """Gets data from memory.

    Args:
        obj_id: The ID of the object to retrieve from the plasma store.
        client: A PlasmaClient to interface with the in-memory object
            store.

    Returns:
        The deserialized data from the store corresponding to the
        `obj_id`.

    Raises:
        ObjectNotFoundError: If the specified `obj_id` is not in the
            store.
    """
    obj_ids = [obj_id]

    # TODO: `get_buffers` allows for batching, which we want to use in
    #       the future.
    buffers = client.get_buffers(obj_ids, with_meta=True, timeout_ms=1000)

    # Since we currently know that we are only retrieving one buffer,
    # we can immediately unpack its metadata and buffer.
    metadata, buffer = buffers[0]

    # Getting the buffer timed out. We conclude that the object has not
    # yet been written to the store and maybe never will be.
    if metadata is None and buffer is None:
        raise ObjectNotFoundError(
            f'Object with ObjectID "{obj_id}" does not exist in store.')

    buffer_bytes = buffer.to_pybytes()
    obj = pa.deserialize(buffer_bytes)

    # If the metadata states that the object was pickled, then we need
    # to additionally unpickle the obj.
    if metadata == bytes(f"{Config.IDENTIFIER_SERIALIZATION};arrowpickle",
                         "utf-8"):
        obj = pickle.loads(obj)

    return obj


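# Illustrative round-trip sketch for the pickle fallback handled by
# `_get_output_memory`. It relies on `_output_to_memory` (defined below)
# and on the legacy `pa.serialize`, which matches the `pa.deserialize`
# call above; the metadata value mirrors the exact bytes compared
# against in `_get_output_memory`. Everything else is an assumption for
# the example.
def _example_pickle_roundtrip(client: plasma.PlasmaClient, data: Any) -> Any:
    serialized = pa.serialize(pickle.dumps(data))
    metadata = bytes(f"{Config.IDENTIFIER_SERIALIZATION};arrowpickle",
                     "utf-8")
    obj_id = _output_to_memory(serialized, client, metadata=metadata)
    return _get_output_memory(obj_id, client)

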
def _output_to_memory(
    obj: pa.SerializedPyObject,
    client: plasma.PlasmaClient,
    obj_id: Optional[plasma.ObjectID] = None,
    metadata: Optional[bytes] = None,
    memcopy_threads: int = 6,
) -> plasma.ObjectID:
    """Outputs an object to memory.

    Args:
        obj: Object to output to memory.
        client: A PlasmaClient to interface with the in-memory object
            store.
        obj_id: The ID to assign to the `obj` inside the plasma store.
            If ``None``, then one is randomly generated.
        metadata: Metadata to add to the `obj` inside the store.
        memcopy_threads: The number of threads to use to write the
            `obj` into the object store for large objects.

    Returns:
        The ID of the object inside the store. Either the given
        `obj_id` or a randomly generated one.

    Raises:
        MemoryError: If the `obj` does not fit in memory.
    """
    # `metadata` defaults to ``None`` but is measured with ``len`` and
    # passed to ``create``; normalize it to empty bytes.
    metadata = metadata if metadata is not None else b""

    # Check whether the object to be passed in memory actually fits in
    # memory. We check explicitly instead of just trying to insert it,
    # because inserting into an already full plasma store will start
    # evicting objects to free up space. However, we want to maintain
    # control over which objects get evicted.
    total_size = obj.total_bytes + len(metadata)
    occupied_size = sum(info["data_size"] + info["metadata_size"]
                        for info in client.list().values())
    # Take a percentage of the maximum capacity such that the message
    # for object eviction always fits inside the store.
    store_capacity = (Config.MAX_RELATIVE_STORE_CAPACITY *
                      client.store_capacity())
    available_size = store_capacity - occupied_size

    if total_size > available_size:
        raise MemoryError("Object does not fit in memory.")

    # In case no `obj_id` is specified, one has to be generated because
    # an ID is required for an object to be inserted into the store.
    if obj_id is None:
        obj_id = plasma.ObjectID.from_random()

    # Write the object to the plasma store. If the `obj_id` already
    # exists, then it first has to be deleted. Essentially we are
    # overwriting the data (just like we do for disk).
    try:
        buffer = client.create(obj_id, obj.total_bytes, metadata=metadata)
    except plasma.PlasmaObjectExists:
        client.delete([obj_id])
        buffer = client.create(obj_id, obj.total_bytes, metadata=metadata)

    stream = pa.FixedSizeBufferWriter(buffer)
    stream.set_memcopy_threads(memcopy_threads)
    obj.write_to(stream)
    client.seal(obj_id)

    return obj_id
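

# Illustrative write sketch: serialize a plain Python object with the
# legacy `pa.serialize` and store it under a random ObjectID. The
# ";arrow" metadata suffix is an assumption for the example; only the
# ";arrowpickle" value is actually inspected by `_get_output_memory`.
def _example_write(client: plasma.PlasmaClient, data: Any) -> plasma.ObjectID:
    serialized = pa.serialize(data)
    metadata = bytes(f"{Config.IDENTIFIER_SERIALIZATION};arrow", "utf-8")
    return _output_to_memory(serialized, client, metadata=metadata)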