示例#1
0
def _deserialize_output_memory(obj_id: plasma.ObjectID,
                               client: plasma.PlasmaClient) -> Any:
    """Gets data from memory.

    Args:
        obj_id: The ID of the object to retrieve from the plasma store.
        client: A PlasmaClient to interface with the in-memory object
            store.

    Returns:
        The unserialized data from the store corresponding to the
        `obj_id`.

    Raises:
        ObjectNotFoundError: If the specified `obj_id` is not in the
            store.
        ValueError: If the serialization type in the metadata is not
            valid.
    """
    obj_ids = [obj_id]

    # TODO: the get_buffers allows for batch, which we want to use in
    #       the future.
    buffers = client.get_buffers(obj_ids, with_meta=True, timeout_ms=1000)

    # Since we currently know that we are only restrieving one buffer,
    # we can instantly get its metadata and buffer.
    metadata, buffer = buffers[0]

    # Getting the buffer timed out. We conclude that the object has not
    # yet been written to the store and maybe never will.
    if metadata is None and buffer is None:
        raise ObjectNotFoundError(
            f'Object with ObjectID "{obj_id}" does not exist in store.')

    metadata = metadata.decode("utf-8").split(Config.__METADATA_SEPARATOR__)
    _, _, serialization, _ = metadata
    if serialization == Serialization.ARROW_TABLE.name:
        # Read all batches as a table.
        stream = pa.ipc.open_stream(buffer)
        return stream.read_all()

    elif serialization == Serialization.ARROW_BATCH.name:
        # Return the first batch (the only one).
        stream = pa.ipc.open_stream(buffer)
        return [b for b in stream][0]

    elif serialization == Serialization.PICKLE.name:
        # Can load the buffer directly because its a bytes-like-object:
        # https://docs.python.org/3/library/pickle.html#pickle.loads
        return pickle.loads(buffer)

    else:
        raise ValueError(
            "Object was serialized with an unsupported serialization")
示例#2
0
文件: plasma.py 项目: edgar87/mars
def get_actual_capacity(plasma_client: plasma.PlasmaClient) -> int:
    """
    Get actual capacity of plasma store

    Parameters
    ----------
    plasma_client: PlasmaClient
        Plasma client.

    Returns
    -------
    size: int
        Actual storage size in bytes
    """
    store_limit = plasma_client.store_capacity()

    left_size = store_limit
    alloc_fraction = 1
    while True:
        allocate_size = int(left_size * alloc_fraction / PAGE_SIZE) * PAGE_SIZE
        try:
            obj_id = plasma.ObjectID.from_random()
            buf = [plasma_client.create(obj_id, allocate_size)]
            plasma_client.seal(obj_id)
            del buf[:]
            break
        except plasma.PlasmaStoreFull:  # pragma: no cover
            alloc_fraction *= 0.99
        finally:
            plasma_client.evict(allocate_size)
    return allocate_size
示例#3
0
def _get_output_memory(obj_id: plasma.ObjectID,
                       client: plasma.PlasmaClient) -> Any:
    """Gets data from memory.

    Args:
        obj_id: The ID of the object to retrieve from the plasma store.
        client: A PlasmaClient to interface with the in-memory object
            store.

    Returns:
        The unserialized data from the store corresponding to the
        `obj_id`.

    Raises:
        ObjectNotFoundError: If the specified `obj_id` is not in the
            store.
    """
    obj_ids = [obj_id]

    # TODO: the get_buffers allows for batch, which we want to use in
    #       the future.
    buffers = client.get_buffers(obj_ids, with_meta=True, timeout_ms=1000)

    # Since we currently know that we are only restrieving one buffer,
    # we can instantly get its metadata and buffer.
    metadata, buffer = buffers[0]

    # Getting the buffer timed out. We conclude that the object has not
    # yet been written to the store and maybe never will.
    if metadata is None and buffer is None:
        raise ObjectNotFoundError(
            f'Object with ObjectID "{obj_id}" does not exist in store.')

    buffers_bytes = buffer.to_pybytes()
    obj = pa.deserialize(buffers_bytes)

    # If the metadata stated that the object was pickled, then we need
    # to additionally unpickle the obj.
    if metadata == bytes(f"{Config.IDENTIFIER_SERIALIZATION};arrowpickle",
                         "utf-8"):
        obj = pickle.loads(obj)

    return obj
示例#4
0
def _output_to_memory(
    obj: pa.SerializedPyObject,
    client: plasma.PlasmaClient,
    obj_id: Optional[plasma.ObjectID] = None,
    metadata: Optional[bytes] = None,
    memcopy_threads: int = 6,
) -> plasma.ObjectID:
    """Outputs an object to memory.

    Args:
        obj: Object to output to memory.
        client: A PlasmaClient to interface with the in-memory object
            store.
        obj_id: The ID to assign to the `obj` inside the plasma store.
            If ``None`` then one is randomly generated.
        metadata: Metadata to add to the `obj` inside the store.
        memcopy_threads: The number of threads to use to write the
            `obj` into the object store for large objects.

    Returns:
        The ID of the object inside the store. Either the given `obj_id`
        or a randomly generated one.

    Raises:
        MemoryError: If the `obj` does not fit in memory.
    """
    # Check whether the object to be passed in memory actually fits in
    # memory. We check explicitely instead of trying to insert it,
    # because inserting an already full Plasma store will start evicting
    # objects to free up space. However, we want to maintain control
    # over what objects get evicted.
    total_size = obj.total_bytes + len(metadata)

    occupied_size = sum(obj["data_size"] + obj["metadata_size"]
                        for obj in client.list().values())
    # Take a percentage of the maximum capacity such that the message
    # for object eviction always fits inside the store.
    store_capacity = Config.MAX_RELATIVE_STORE_CAPACITY * client.store_capacity(
    )
    available_size = store_capacity - occupied_size

    if total_size > available_size:
        raise MemoryError("Object does not fit in memory")

    # In case no `obj_id` is specified, one has to be generated because
    # an ID is required for an object to be inserted in the store.
    if obj_id is None:
        obj_id = plasma.ObjectID.from_random()

    # Write the object to the plasma store. If the obj_id already
    # exists, then it first has to be deleted. Essentially we are
    # overwriting the data (just like we do for disk)
    try:
        buffer = client.create(obj_id, obj.total_bytes, metadata=metadata)
    except plasma.PlasmaObjectExists:
        client.delete([obj_id])
        buffer = client.create(obj_id, obj.total_bytes, metadata=metadata)

    stream = pa.FixedSizeBufferWriter(buffer)
    stream.set_memcopy_threads(memcopy_threads)

    obj.write_to(stream)
    client.seal(obj_id)

    return obj_id