Example #1
def rollout(envs, actor, *, batch_size=1, device=None):
    it = episode(envs, actor, device=device)
    try:
        buffer, queue, base, t = [], [], 0, 0
        beginning = numpy.zeros(len(envs), int)
        while True:
            # yield to the caller if we've prepared a batch
            if len(queue) >= batch_size:
                # TODO a better way to collate differently sized histories
                yield queue[:batch_size]

                queue = queue[batch_size:]

            # make the next step
            cur, hx, act, nxt, env = suply(torch.clone, next(it))
            buffer.append((cur, act, env))
            t += 1

            terminated = nxt.fin.nonzero(as_tuple=True)[-1]
            if len(terminated) < 1:
                continue

            # rebuild terminated trajectories
            for j in map(int, terminated):
                t0, t1 = beginning[j] - base, t - base
                nxt.fin[:, j] = False

                # extract the j-th trajectory and its terminal state
                buf_, terminal, hx_ = suply(getitem, (buffer[t0:t1], nxt, hx),
                                            index=(slice(None), [j]))
                state, act_, env_ = zip(*buf_)

                # react to the terminal state
                _, _, info_ = actor.step(*terminal, hx=hx_, virtual=True)

                # stack everything into [T x 1 x ...] tensors
                queue.append(
                    Fragment(
                        tuply(torch.stack, *state, terminal),  # state t=0..T
                        tuply(torch.stack, *act_, info_),  # actor t=0..T
                        tuply(torch.stack, *env_),  # env   t=1..T
                        actor.reset(hx_, at=0),  # hx
                    ))

            # mark the beginning of the new trajectory (the data itself
            #  will actually be copied from `cur` at the next iteration)
                beginning[j] = t

            # reduce the size of the buffer by slicing it at the start of
            #  the currently oldest trajectory
            base_ = beginning.min()
            if base_ > base:
                # we can reduce the size by `base_ - base`
                buffer, base = buffer[base_ - base:], base_

    finally:
        it.close()
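A standalone illustration of the termination test above (only torch is
assumed): `nxt.fin` is a `1 x n_envs` boolean tensor, and
`nonzero(as_tuple=True)[-1]` yields the flat indices of the terminated envs.

import torch

fin = torch.tensor([[False, True, False, True]])  # 1 x n_envs `fin` flags
print(fin.nonzero(as_tuple=True)[-1])  # tensor([1, 3]) -- terminated env ids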
Example #2
def npy_copy_(
    dst,
    src,
    *,
    at=None,
    _copy=numpy.copyto,
):
    """Copy numpy data between nested objects with IDENTICAL structure."""
    if at is not None:
        dst = suply(getitem, dst, index=at)

    suply(_copy, dst, src, casting='same_kind')
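For a single array, the nested copy above reduces to `numpy.copyto` with
`casting='same_kind'`, which permits same-kind conversions (e.g. float64 to
float32) but refuses cross-kind ones (e.g. float to int). A minimal
self-contained check:

import numpy

dst = numpy.zeros(3, dtype=numpy.float32)
numpy.copyto(dst, numpy.arange(3, dtype=numpy.float64), casting='same_kind')
print(dst)  # [0. 1. 2.] -- float64 data narrowed into the float32 `dst`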
Example #3
def pyt_copy_(
    dst,
    src,
    *,
    at=None,
    _copy=torch.Tensor.copy_,
):
    """Copy tensor data from the `src` nested object into the `dst` object with
    IDENTICAL structure at the specified index (int, tuple of ints, or slices).
    """
    if at is not None:
        dst = suply(getitem, dst, index=at)

    suply(_copy, dst, src)
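Likewise, for a single tensor `pyt_copy_(dst, src, at=...)` boils down to an
in-place `.copy_` into the selected sub-view. A minimal sketch:

import torch

dst, src = torch.zeros(4, 2), torch.ones(2, 2)
dst[1:3].copy_(src)  # what `pyt_copy_(dst, src, at=slice(1, 3))` performs
print(dst)  # rows 1 and 2 are ones, the rest are untouched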
Example #4
def rollout(envs, actor, n_steps=51, *, sticky=False, device=None):
    """"""
    # always pin the runtime context if the device is 'cuda'
    device = torch.device('cpu') if device is None else device
    pinned = device.type == 'cuda'

    # initialize a buffer for one rollout fragment (optionally pinned)
    buffer = prepare(envs[0],
                     actor,
                     n_steps,
                     len(envs),
                     pinned=pinned,
                     device=device)

    # the running context for the actor and the envs
    ctx, fragment = startup(envs, actor, buffer, pinned=pinned)
    try:
        while True:
            # collect the fragment
            collect(envs, actor, fragment, ctx, sticky=sticky, device=device)

            # move to the specified device
            batch = fragment.pyt
            if device.type == 'cuda':
                batch = suply(torch.Tensor.to,
                              fragment.pyt,
                              device=device,
                              non_blocking=True)

            yield batch

    finally:
        pass
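Why the runtime context is pinned whenever the target device is 'cuda': only
page-locked host memory allows genuinely asynchronous host-to-device copies
with `non_blocking=True`. A standalone sketch (guarded in case no GPU is
available):

import torch

if torch.cuda.is_available():
    host = torch.empty(256, 256, pin_memory=True)  # page-locked host buffer
    dev = host.to(torch.device('cuda'), non_blocking=True)  # async H2D copy
    torch.cuda.synchronize()  # make sure the copy finished before using `dev`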
Example #5
    def reset(self, hx, at):
        """Reset the specified pieces of actor's recurrent context `hx`.

        Parameters
        ----------
        hx : nested object with tensor data
            The recurrent state is a nested object with at least 2d data of
            shape `: x n_envs x ...`, i.e. having `n_envs` as their second dim.

        at : int, or slice
            The index or range of environments for which to reset the recurrent
            context.

        Returns
        -------
        hx : nested object with tensor data
            The new updated recurrent state, which has the slices related to
            the specified environment `at` reset to the initial hidden state.
            The structure of the original nested object is PRESERVED.

        Details
        -------
        Similar to `torch.nn.LSTM` and other recurrent layers [citation_needed]
        we assume the initial recurrent state to be a ZERO non-differentiable
        tensor.

        The seemingly awkward shapes of the tensors in `hx` stem from the
        shapes of hidden states in `torch.nn`-s recurrent layers. For example,
        the LSTM layer has `hx = (c, h)`, with both tensors shaped like

            (num_layers * num_directions) x n_batch x n_hidden,

        while the GRU layer's `hx` is just a single tensor, not a tuple.
        """
        assert hx is not None, 'Pass `hx=None` to `.step` for initialization.'

        # make sure to make a copy of the recurrent state
        hx_ = suply(torch.Tensor.clone, hx)
        suply(lambda x: x[:, at].zero_(), hx_)

        # XXX could the actor keep `hx` unchanged in `forward`? No, under the
        # current API, since `.fin` reflects that the input is related to a
        # freshly started trajectory, i.e. only $h_t$ and $x_t$ are defined,
        # while $a_{t-1}$, $r_t$, and $d_t$ are not. `fin` does not tell us
        # anything about whether $h_{t+1}$ should be reset after the actor's
        # update $(t, x_t, a_{t-1}, r_t, d_t, h_t) \to (a_t, h_{t+1})$ or not.
        return hx_
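A standalone sketch of what this reset amounts to for an LSTM-style recurrent
state with the shapes documented above (the sizes are made up for the
illustration):

import torch

num_layers, n_envs, n_hidden = 2, 4, 8
hx = (torch.randn(num_layers, n_envs, n_hidden),  # LSTM's `(h, c)` pair
      torch.randn(num_layers, n_envs, n_hidden))

j = 1  # the environment whose recurrent context is to be reset
hx_ = tuple(x.clone() for x in hx)  # clone first: never mutate the input
for x in hx_:
    x[:, j].zero_()  # the ZERO non-differentiable initial state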
Example #6
    def from_structured(cls, struct, *leading, ctx=multiprocessing):
        """Recursively allocate array in shared memory."""
        def _empty_like(npy):
            if isinstance(npy, numpy.ndarray):
                return cls.from_numpy(npy, *leading, ctx=ctx)
            raise TypeError(f'Unrecognized type `{type(npy)}`')

        return suply(_empty_like, struct)
Example #7
    def to_structured(cls, struct):
        """Recursively rebuild the shared structured data."""
        def _build(sh):
            if isinstance(sh, cls):
                return sh.numpy()
            raise TypeError(f'Unrecognized type `{type(sh)}`')

        return suply(_build, struct)
Example #8
def torchify(obj, *leading, copy=False, pinned=False, shared=False):
    """Convert the values in the nested container into torch tensors.

    Parameters
    ----------
    obj: any
        The nested container (dict, list, tuple, or namedtuple) of data.

    *leading: tuple of int
        The extra leading dimensions to prepend to the shape of the resulting
        tensors residing within the nested container.

    copy: bool, default=False
        Forces a copy even in the case when one is not necessary, which
        disables torch-numpy memory aliasing altogether, if the original data
        were a numpy array.

    pinned: bool, default=False
        The underlying storage of the newly created tensor resides in pinned
        memory (non-paged) for faster host-device transfers. If the original
        value was a numpy array, then no aliasing is possible, and a copy
        is made.

        Cannot be used with `shared=True`.

    shared: bool, default=False
        Allocates the underlying storage of new tensors using torch's
        interprocess memory sharing logic, which makes it so that all changes
        to the data are reflected between all processes. If the original value
        was a numpy array, then no aliasing is possible, and a copy is made.

        Cannot be used with `pinned=True`.

    Returns
    -------
    obj: any
        The nested container with torch tensors, optionally shared or pinned.

    Details
    -------
    `torch.as_tensor` creates new tensors from lists of numeric data, and
    lets tensors and arrays through (no copy). The resulting torch tensor and
    the original numpy array alias the same memory, hence changes to one are
    reflected in the other UNLESS a copy has been made due to pinned or shared
    flags.

    This function does not copy the original torch tensors or numpy arrays,
    UNLESS extra leading non-broadcasting dims were specified or moving to
    shared or pinned memory was requested. If a copy is made then memory
    aliasing between torch tensors and numpy arrays is IMPOSSIBLE.

    As of 1.8 torch doesn't support string data types.

    Warning
    -------
    Torch automatically detects if a tensor is being shared and hot-swaps in
    a correctly allocated shared storage. Hence even when torchified with
    `shared=False` the nested container's tensors will be correctly shared
    between processes. However, any numpy array aliases created prior to
    sharing will still reference the swapped-out, invalidated torch storage.
    So it is advisable to preemptively torchify the data within shared memory.
    """
    if pinned and shared:
        raise ValueError('`pinned` and `shared` flags are mutually exclusive.')

    assert all(isinstance(j, int) for j in leading)

    # not is_broadcast => leading has at least one non-unit dim => copy
    # pinned => copy, because it requires allocating in non-paged memory
    is_broadcast = all(j == 1 for j in leading)  # also True if not leading

    def _as_tensor(x):
        # alias numpy arrays, keep tensors intact, create new from scalars
        pyt = torch.as_tensor(x)

        # the data must be host-resident: device tensors cannot be aliased
        assert pyt.device == torch.device('cpu')

        # make no copy if the leading dims broadcast and paged memory is ok
        if not copy and not (pinned or shared) and is_broadcast:
            if leading:
                pyt = pyt.reshape(leading + pyt.shape)
            return pyt

        # allocate a new cpu tensor optionally in the pinned memory (non-paged)
        # XXX uses PinnedMemoryAllocator `/aten/src/ATen/utils.cpp#L44`
        # XXX also see `torch.Storage.pin_memory`
        out = torch.empty(leading + pyt.shape,
                          dtype=pyt.dtype,
                          pin_memory=(pinned and not shared))

        out.copy_(pyt)  # chaining is less readable (`out.copy_` returns `out`)
        if shared:
            # move the storage to torch's shared memory (preserving the data)
            # this allocates shared storage, copies data to it, replaces the old storage
            # XXX see `/torch/csrc/generic/StorageSharing.cpp#L76-L111`:shareFilename
            # as of 1.8 we cannot allocate new tensors in shared memory
            out.share_memory_()

        return out

    return suply(_as_tensor, obj)
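A self-contained check of the aliasing contract described above: with default
flags the resulting tensor shares memory with the source numpy array, while
non-unit leading dims (or the `copy`, `pinned`, `shared` flags) force a fresh
tensor:

import numpy
import torch

arr = numpy.zeros(3, dtype=numpy.float32)
pyt = torch.as_tensor(arr)  # what the no-copy branch reduces to
pyt[0] = 1.
print(arr[0])  # 1.0 -- the tensor and the array alias the same memory

out = torch.empty((2,) + arr.shape)  # a non-unit leading dim forces a copy
out.copy_(torch.as_tensor(arr))  # broadcast-copy: no aliasing anymore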
Example #9
def numpify(obj, *leading, copy=False, ctx=None):
    """Convert the values in the nested container into numpy arrays.

    Parameters
    ----------
    obj: any
        The nested container (dict, list, tuple, or namedtuple) of data.

    *leading: tuple of int
        The extra leading dimensions to prepend to the shape of the resulting
        arrays residing within the nested container.

    copy: bool, default=False
        Forces a copy even in the case when one is not necessary, which
        disables torch-numpy memory aliasing altogether, if the original data
        were a torch tensor.

    ctx: multiprocessing context, default=None
        If `ctx` is not None, then allocate the newly created array in the
        shared memory (managed by the ctx multiprocessing context) and copy
        the original data into it.

        torch has slightly better developed memory sharing functionality,
        hence it is recommended to use `torchify` for sharing.

    Returns
    -------
    obj: any
        The nested container with numpy arrays, optionally in shared memory.

    Details
    -------
    We use `numpy.asarray` to create a new array from lists of numeric or string
    data, leaving numpy arrays intact, and aliasing torch tensors as arrays. In
    the latter case the resulting numpy array and the original torch tensor
    reference the same underlying data storage, hence changes to one will be
    reflected in the other.

    This function does not copy the original torch tensors or numpy arrays,
    UNLESS extra leading non-broadcasting dims were specified or moving to
    shared memory was requested (for sharing between processes). If a copy is
    made then memory aliasing with corresponding torch tensors is DISABLED.
    """

    assert all(isinstance(j, int) for j in leading)
    # not is_broadcast => leading has at least one non-unit dim => copy
    # shared => copy, because it requires allocating in shared memory
    is_broadcast = all(j == 1 for j in leading)

    def _as_array(x):
        # alias torch tensors, keep arrays intact, create new from scalars
        npy = numpy.asarray(x)
        if not copy and ctx is None and is_broadcast:
            if leading:
                npy = npy.reshape(leading + npy.shape)
            return npy

        # allocate `nbytes` of shared memory, or use numpy's default allocator
        n_bytes, buffer = int(numpy.prod(leading)) * npy.nbytes, None
        if ctx is not None:
            buffer = ctx.RawArray(c_byte, n_bytes)

        # create an uninitialized array and copy the original data into it
        out = numpy.ndarray(leading + npy.shape,
                            dtype=npy.dtype,
                            buffer=buffer)
        numpy.copyto(dst=out, src=npy, casting='no')

        return out

    return suply(_as_array, obj)
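A standalone sketch of the shared-memory branch above: the backing buffer is
allocated through the multiprocessing context and then viewed as a numpy
array (the toy shapes are made up for the illustration):

from ctypes import c_byte
import multiprocessing
import numpy

npy, leading = numpy.arange(6, dtype=numpy.float32), (2,)
n_bytes = int(numpy.prod(leading)) * npy.nbytes
buffer = multiprocessing.get_context().RawArray(c_byte, n_bytes)

out = numpy.ndarray(leading + npy.shape, dtype=npy.dtype, buffer=buffer)
numpy.copyto(dst=out, src=npy, casting='no')  # broadcasts along `leading`
print(out.shape)  # (2, 6), backed by shared memory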
Example #10
def rollout(factory,
            actor,
            n_steps,
            n_envs,
            *,
            sticky=False,
            close=False,
            clone=True,
            device=None,
            start_method=None,
            affinity=None,
            entropy=None):
    r"""UPDATE THE DOC

    Details
    -------
    We use a double-buffering approach with an independent actor per buffer. This
    setup allows for truly parallel collection and learning. The worker (W) and
    the learner (L) move in lockstep: W is collecting rollout into one buffer,
    while L is training on the other buffer. This tandem movement is achieved
    via barrier synchronisation.

        +--------------------+           +--------------------+
        | worker process     |  sharing  | learner process    |     sync
        +--------------------+           +--------------------+
        |                    |           | wait for the first |
        | collect(a_1, b_1)  |           | buffer to be ready |
        |                    |           |  (at the barrier)  |
        +====================+           +====================+  <- barrier
        |                    ---- b_1 -->> train(l, b_1) ---+ |
        | collect(a_2, b_2)  |           |                  | |
        |                    <<-- a_1 ---- update(l, a_1) <-+ |
        +====================+           +====================+  <- barrier
        |                    ---- b_2 -->> train(l, b_2) ---+ |
        | collect(a_1, b_1)  |           |                  | |
        |                    <<-- a_2 ---- update(l, a_2) <-+ |
        +====================+           +====================+  <- barrier
        |                    ---- b_1 -->> train(l, b_1) ---+ |
        | collect(a_2, b_2)  |           |                  | |
        |                    <<-- a_1 ---- update(l, a_1) <-+ |
        +====================+           +====================+  <- barrier
        |    ...      ...    |           |    ...      ...    |

    The dual-actor setup eliminates the worker's post-collection idle time
    that exists in a single-actor setup, which stems from the learner's need
    to acquire a lock on that single actor's state-dict. State-dict updates
    via torch's shared memory storage are fast, though, so the idle period is
    relatively short. At the same time the dual setup takes twice as much
    memory, since we have to maintain TWO instances of the actor model.

    Single-actor setup (assuming non-negligible `train` time).

        +--------------------+           +--------------------+
        | worker process     |  sharing  | learner process    |     sync
        +--------------------+           +--------------------+
        | with a.lock:       |           | wait for the first |
        |   collect(a, b_1)  |           | buffer to be ready |
        |                    |           |  (at the barrier)  |
        +====================+           +====================+  <- barrier
        | with a.lock:       ---- b_1 -->> train(l, b_1) ---+ |
        |   collect(a, b_2)  |           |                  | |
        |                    |           | with a.lock:     | |
        | # idle time        <<--  a  ----   update(l, a) <-+ |
        +====================+           +====================+  <- barrier
        | with a.lock:       ---- b_2 -->> train(l, b_2) ---+ |
        |   collect(a, b_1)  |           |                  | |
        |                    |           | with a.lock:     | |
        | # idle time        <<--  a  ----   update(l, a) <-+ |
        +====================+           +====================+  <- barrier
        | with a.lock:       ---- b_1 -->> train(l, b_1) ---+ |
        |   collect(a, b_2)  |           |                  | |
        |                    |           | with a.lock:     | |
        | # idle time        <<--  a  ----   update(l, a) <-+ |
        +====================+           +====================+  <- barrier
        |    ...      ...    |           |    ...      ...    |

    """
    check_signature(factory, seed=None)

    # the device to put the batches onto
    device = torch.device('cpu') if device is None else device

    # the device, on which to run the worker subprocess
    if isinstance(affinity, (tuple, list)):
        affinity, *empty = affinity
        assert not empty
    affinity = torch.device('cpu') if affinity is None else affinity

    # get the correct multiprocessing context (torch-friendly)
    mp = get_context(start_method)

    # initialize a reference buffer and make its shared copies
    env = factory(seed=None)  # XXX seed=None here since used only once

    # create a host-resident copy of the module in shared memory, which
    #  serves as a vessel for updating the actors in workers
    shared = deepcopy(actor).cpu().share_memory()

    # a single one-element-batch forward pass through the copy
    batch = prepare(env, shared, n_steps, n_envs, pinned=False, device=None)

    # some environments don't like being closed or deleted, e.g. `nle`
    if close:
        env.close()

    # we use a double-buffered rollout, with both buffers in shared memory
    # (shared=True always makes a copy).
    # XXX torch tensors have much simpler pickling/unpickling when sharing
    double = torchify((batch, ) * 2, shared=True)

    # the sync barrier and actor update lock
    ctrl = Control(mp.Lock(), mp.Barrier(2), mp.SimpleQueue())

    # prepare the seed sequence for the worker
    ss = SeedSequence(entropy)

    # spawn the lone worker subprocess
    p_worker = mp.Process(
        target=p_double,
        daemon=False,
        args=(ss, ctrl, CloudpickleSpawner(factory), double, shared),
        kwargs=dict(clone=clone, sticky=sticky, close=close, device=affinity),
    )
    p_worker.start()

    # now move the batch onto the proper device
    if device.type == 'cuda':
        batch = suply(torch.Tensor.to, batch, device=device, non_blocking=True)

    # the code flow in the loop below and in the `p_double` is designed to
    # synchronize `flipflop` between the worker and the parent.
    flipflop, emergency = 0, False
    try:
        while p_worker.is_alive():
            # ensure consistent update of the shared module
            # XXX tau-moving average update?
            with ctrl.reflock:
                shared.load_state_dict(actor.state_dict(), strict=True)

            # wait for the current fragment to be ready (w.r.t `flipflop`)
            ctrl.barrier.wait()

            # yield the filled buffer and switch to the next one
            suply(torch.Tensor.copy_, batch, double[flipflop])

            yield batch
            flipflop = 1 - flipflop

    except BrokenBarrierError:
        # the worker broke the barrier to indicate an emergency shutdown
        emergency = True

    finally:
        # we break the barrier, if we wish to shut down the worker
        ctrl.barrier.abort()

        p_worker.join()

    if not emergency:
        return

    # handle emergency shutdown
    if not ctrl.error.empty():
        message = ctrl.error.get()

    elif p_worker.exitcode < 0:
        message = signal.Signals(-p_worker.exitcode).name

    else:
        message = f'worker terminated with exit code {p_worker.exitcode}'

    ctrl.error.close()

    raise RuntimeError(message)
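A minimal, self-contained model of the barrier-synchronised double-buffering
protocol from the diagrams above, with toy payloads instead of rollout
fragments (this is an illustration, not the actual `p_double` worker):

import torch
import torch.multiprocessing as mp
from threading import BrokenBarrierError


def worker(barrier, double):
    step = 0.
    try:
        while True:
            for buf in double:  # flip-flop between the two shared buffers
                step += 1.
                buf.fill_(step)  # "collect" a fragment into this buffer
                barrier.wait()  # lockstep hand-off to the consumer
    except BrokenBarrierError:
        pass  # the parent aborted the barrier: shut down


if __name__ == '__main__':
    mp.set_start_method('spawn')
    barrier = mp.Barrier(2)
    double = tuple(torch.zeros(4).share_memory_() for _ in range(2))

    p = mp.Process(target=worker, args=(barrier, double), daemon=False)
    p.start()
    for flipflop in (0, 1, 0, 1):
        barrier.wait()  # wait for the current buffer to be filled
        print(double[flipflop])  # "train" while the worker fills the other

    barrier.abort()  # break the barrier to shut the worker down
    p.join()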
Example #11
def evaluate(
    envs,
    actor,
    *,
    n_steps=None,
    render=False,
    device=None,
):
    """Evaluate the actor module in the environment.

    Parameters
    ----------
    envs : list of gym.Env
        The stateful evaluation environments to step through.

    actor : BaseActorModule
        The actor, which steps through the batch of environments.

    n_steps : int, default=None
        The maximum number of steps to take in each test environment.
        If `None`, then steps until all environments are done.

    render : bool, default=False
        Whether to render the visualization of the environment interaction.

        WARNING: can only be used when len(envs) == 1

    device : torch.device, default=None
        The device onto which to put the input $x_t$ `obs`, $a_{t-1}$ `act`,
        $r_t$ `rew`, $d_t$ `fin`, and $h_t$ `hx` for the actor when stepping
        through the test environments.

    Returns
    -------
    rewards : float numpy.array, shape = (T, len(envs))
        The rewards obtained at each step of the rollout in each test
        environment (sum over the time axis to get per-env returns).

    info : nested object with array data, shape = (1+T, len(envs), ...)
        The actor's afterstate auxiliary information.

    Details
    -------
    This function is very similar to `collect()`, except that it records
    only the rewards from the environment, instead of the full rollout data.
    """

    n_steps = n_steps or float('+inf')
    assert len(envs) == 1 or len(envs) > 1 and not render

    # always pin the runtime context if the device is 'cuda'
    device = torch.device('cpu') if device is None else device
    pinned, on_host = device.type == 'cuda', device.type == 'cpu'

    # prepare a running context for the specified number of envs
    ctx, info_env = context(*envs, pinned=pinned)
    # `ctx` is $x_*, a_{-1}, r_0, \top, h_0$, where `r_0` is undefined

    # fast access to context's aliases
    npy, pyt = ctx.npy, ctx.pyt

    # Allocate an on-device context and recurrent state, if not on 'host'
    pyt_ = pyt
    if not on_host:
        # XXX this also copies data in `pyt` into `pyt_`
        pyt_ = suply(torch.Tensor.to, pyt_, device=device)

    # render only in the case of a single-env evaluation
    fn_render = envs[0].render if len(envs) == 1 and render else lambda: True

    # collect the evaluation data: let the actor init `hx` for us
    rewards, info_actor, done, t, hx = [], [], False, 0, None
    while not done and t < n_steps and fn_render():
        # REACT: $(t, x_t, a_{t-1}, r_t, d_t, h_t) \to a_t$ and commit $a_t$
        act_, hx, info_ = actor.step(*pyt_, hx=hx, virtual=False)

        info_actor.append(suply(torch.Tensor.cpu, info_))

        pyt_copy_(pyt.act, act_)

        # STEP + EMIT: `.step` through a batch of envs
        for j, env in enumerate(envs):
            # cease interaction with terminated envs
            if npy.fin[j] and t > 0:
                npy.rew[j] = 0.
                continue

            # get $(s_t, a_t) \to (s_{t+1}, x_{t+1}, r_{t+1}, d_{t+1})$
            act_ = suply(getitem, npy.act, index=j)
            obs_, rew_, fin_, info_env = env.step(act_)
            npy.stepno[j] += 1
            if fin_:
                npy.stepno[j] = 0  # start a new trajectory

            # update the j-th env's $x_{t+1}, r_{t+1}, d_{t+1}$ in `ctx`
            suply(setitem, npy.obs, obs_, index=j)
            npy.rew[j], npy.fin[j] = rew_, fin_

        # move the updated `ctx` to its device-resident torch copy
        if pyt_ is not pyt:
            pyt_copy_(pyt_, pyt)

        # stop only if all environments have been terminated
        done = numpy.all(npy.fin)

        rewards.append(npy.rew.copy())
        t += 1

    # the virtual lookahead step
    _, _, info_ = actor.step(*pyt_, hx=hx, virtual=True)
    info_actor.append(suply(torch.Tensor.cpu, info_))

    # return the collected afterstate data in numpy arrays
    info = suply(torch.Tensor.numpy, tuply(torch.cat, *info_actor, dim=0))
    return numpy.stack(rewards, axis=0), info
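Under the convention above, the per-env episode returns are a plain sum of
the returned `rewards` array over the time axis. A tiny example:

import numpy

rewards = numpy.array([[1., 0.], [2., 1.], [0., 3.]])  # (T=3, n_envs=2)
print(rewards.sum(axis=0))  # [3. 4.] -- the total return of each env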
Example #12
def collect(
    envs,
    actor,
    fragment,
    context,
    *,
    sticky=False,
    device=None,
):
    r"""Collect the rollout trajectory fragment by marching the actor and
    the environments in lockstep (`actor` and `envs`, respectively), updating
    `context` and recording everything into `fragment`.

    Parameters
    ----------
    envs : list of gym.Env
        The stateful environments to step through.

    actor : Actor
        The actor, which steps through the batch of environments.

    fragment : aliased Fragment
        The buffer into which the trajectory fragment is recorded *in-place*.

    context : aliased Context
        The most recent state for the actor to react to. Updated *in-place*.

    sticky : bool, default=False
        Whether to stop interacting with a terminated environment until the end
        of the current trajectory fragment, i.e. until the next `collect()`.
        Once an env reached a terminal state, we put a record with its reset
        observation into the trajectory fragment and then stop interacting
        with it until the end of the fragment.

    device : torch.device, or None
        The device onto which to put the input $x_t$ `obs`, $a_{t-1}$ `act`,
        $r_t$ `rew`, $d_t$ `fin`, and $h_t$ `hx`.

    Details
    -------
    Let $s_t$ be the environment's true, possibly unobservable, state at time
    $t$, and $x_t$ and $r_t$ be the observation and the reward emitted by the
    env's recent transition

        $(s_{t-1}, a_{t-1}) \longrightarrow (s_t, x_t, r_t, d_t, E_t)$,

    with $d_t$ indicating if $s_t$ is terminal and $E_t$ is the environment's
    extra info. The next action $a_t$ is taken in response to the current
    reward $r_t$ and observation $x_t$, the last action $a_{t-1}$ and actor's
    current recurrent state $h_t$:
    $$
        \underbrace{
            (t, x_t, a_{t-1}, r_t, d_t, h_t)
        }_{z_t}  % actionable state
            \longrightarrow (a_t, h_{t+1}, A_t)
        \,, $$

    with $z_t$ being the actor's actionable state, and $A_t$ -- its afterstate
    info, computed on the history up to and including $t$ and related to
    the composite `actor-env` transition

        $z_t \longrightarrow a_t \longrightarrow s_{t+1}$.

    Let $(s_*, x_*)$ be the environment's true state and the observation just
    after a reset.

    Assume for simplicity that $x_t = s_t$, i.e. the environment is fully
    observed, the actor is non-recurrent and let $h > t$. The present value
    $G_\pi(s_t)$ of the reward flow, starting at $s_t$ and following the policy
    $\pi$ afterwards, is defined as `the t-th return'
    $$
        G_\pi(s_t)
            = \mathbb{E}_\tau
                \sum_{j=t}^{h-1} \gamma^{j-t} r^\dagger_{j+1}
                + \gamma^{h-t} G_\pi(s^\dagger_h)
        \,, $$

    where the trajectory $\tau$
    $$
        \tau = (
            a_t, r_{t+1}, s_{t+1},
            a_{t+1}, ..., r_{h-1}, s_{h-1},
            a_{h-1}, s_h, r_h, ...
        )
        \,\, $$

    with transition dynamics
    $$
        a_j \sim \pi_j(a \mid s_j)
        \,\text{ and }\,
        s_{j+1}, r_{j+1} \sim p(s, r \mid s_j, a_j)
        \,\, $$

    hasn't been terminated mid-way. Otherwise we consider `ceased` reward
    and `stopped` observation processes, i.e. $r^\dagger$ and $s^\dagger$ are
    `frozen` at the **stopping time** $T(s_t) \geq t$ with
    $$
        r^\dagger_j = r_j 1_{j \leq T(s_t)}
        \,,
        s^\dagger_j = s_{T(s_t) \wedge j}
        \,\text{ and }\,
        d_j = 1_{j \geq T(s_t)}
        \,. $$

    This is because the rewards following any terminal state are assumed to
    always be zero, since the episode has ended.

    This data is collected by stepping through the actor and the environment in
    lockstep and recording it into the `fragment` rollout buffer, while keeping
    `context` properly synchronised. The time advances after env's `.step`:
    the transitions (simplified)

        $(s_t, a_t) \to s_{t+1}$

    determine the timing in the subscripts. The `fragment.state`, aka `out`,
    with a terminal observation occurring at relative time $t+1$, contains
    the following data (the `stepno` step counter in `.state` is not shown
    for brevity):

    if `sticky` is False

      +---+-----+-----------------------------------------+-----------+
      |   |     |               .state                    | (env's    |
      |   |  #  +-----------------------------------------+  actual   |
      |   |     |  obs       act       rew       fin      |   state)  |
      +---+-----+-----------------------------------------+-----------+
      | f |     |                                         |           |
      | r |   0 |  x_k       a_{k-1}   r_k       d_k      |  s_k      |
      | a | ... |                                         |           |
      | g |   t |  x_t       a_{t-1}   r_t       \bot     |  s_t      |
      | m | t+1 |  x'_*      a_t       r_{t+1}   \top     |  s'_*  <<-- reset
      | e | t+2 |  x'_1      a'_0      r'_1      \bot     |  s'_1     |
      | n | ... |                                         |           |
      | t | N-1 |  x'_{j-1}  a'_{j-2}  r'_{j-1}  d'_{j-1} |  s'_{j-1} |
      |   |   N |  x'_j      a'_{j-1}  r'_j      d'_j     |  s'_j   ------+
      | p |     |                                         |           |   |
      +---+-----+-----------------------------------------+-----------+ clone
      |   |     |                                         |           |   |
      | p |   0 |  x'_j      a'_{j-1}  r'_j      d'_j     |  s'_j  <<-----+
      | + |   1 |  x'_{j+1}  a'_j      r'_{j+1}  d'_{j+1} |  s'_{j+1} |
      | 1 | ... |                                         |           |
      |   |     |                                         |           |
      +---+-----+-----------------------------------------+-----------+

      (the apostrophe indicates a new trajectory within the same fragment)

    Notice that the record `out[t+1]` still contains a VALID last action $a_t$
    and the true received terminal reward $r_{t+1}$. However this compact data
    recording does not store the true terminal observation $x_{t+1}$, because
    it has been overwritten by $x_*$.

    In short, if `out.state.fin[t]` is True, then the action `out.state.act[t]`
    taken from `out.state[t-1]` has led to a terminal state with reward
    `out.state.rew[t]`, and the observation in `out.state.obs[t]` is already
    the initial observation $x_*$ form a newly started trajectory. Otherwise,
    if `out.state.fin[t]` is False, then `out.state[t-1]` and `out.state[t]`
    are truly consecutive records from the same trajectory.

    This non-intuitive indexing notation allows computing the return for the
    action $a_t$ in `out.state.act[t+1]` using

        G[t] = out.state.rew[t+1] + gamma * (1 - out.state.fin[t+1]) * G[t+1]

    where `G[t]` $\approx G_\pi(s_t)$, and `G[N] = 0` if `out.state.fin[N]` is
    True, and otherwise `G[N]` is $v(s_N)$, i.e. the bootstrap value --
    an estimate of the expected future return $G_\pi(s_N)$ with the current
    approximation of the value function $v$ (under the current policy $\pi$).
    A standalone sketch of this recursion is given after this function.

    For example, in a complete information MDP, $v(s_N)$ is the approximate
    present value of future reward flow from trajectories starting at state
    $s_N$ and following $\pi$:
    $$
        v(s_N)
            = \mathbb{E}_\pi \bigl(
                r_{N+1} + \gamma r_{N+2} + ...
            \big\vert s_N \bigr)
        \,, $$

    where $r_t$ is the reward due to $(s_{t-1}, a_{t-1}) \to s_t$ transition
    with $a_{t-1}$ being a response to $s_{t-1}$.

    If `sticky` is True, then any interaction with the terminated environment
    is ceased until the start of the next fragment.

      +---+-----+-----------------------------------------+-----------+
      |   |     |               .state                    | (env's    |
      |   |  #  +-----------------------------------------+  actual   |
      |   |     |  obs       act       rew       fin      |   state)  |
      +---+-----+-----------------------------------------+-----------+
      | f |     |                                         |           |
      | r |   0 |  x_k       a_{k-1}   r_k       d_k      |  s_k      |
      | a |    ...                                       ...          |
      | g |   t |  x_t       a_{t-1}   r_t       \bot     |  s_t      |
      | m | t+1 |  x'_*      a_t       r_{t+1}   \top     |  s'_*  <<-- reset
      | e | t+2 |  x'_*      a_t       0         \top     |  s'_*     |
      | n |    ...                                       ...          |
      | t | N-1 |  x'_*      a_t       0         \top     |  s'_*     |
      |   |   N |  x'_*      a_t       0         \top     |  s'_*  -------+
      | p |     |                                         |           |   |
      +---+-----+-----------------------------------------+-----------+ clone
      |   |     |                                         |           |   |
      | p |   0 |  x'_0      a_{-1}    r_0       \top     |  s_0  <<------+
      | + |   1 |  x'_1      a_0       r_1       d_1      |  s_1      |
      | 1 |    ...                                       ...          |
      |   |     |                                         |           |
      +---+-----+-----------------------------------------+-----------+

    This option is more friendly towards CUDNN and torch's packed sequences,
    but it seems that manually stepping through time is more efficient.

    Below we depict the synchronization of the records in the rollout fragment.
    The horizontal arrows indicate the sub-steps of the composite `actor-env`
    transition.

      +---+-----+---------------------------------------+-----------+
      |   |  #  |  .state   .hx  ->  .actor -> .env     | actual    |
      +---+-----+---------------------------------------+-----------+
      | f |     |                                       |           |
      | r |   0 |  Z_k      h_k      A_k       E_{k+1}  |  s_{k+1}  |
      | a |    ...                                     ...          |
      | g |   t |  Z_t      h_t      A_t       E_{t+1}  |  s'_*   <<-- reset
      | m | t+1 |  Z'_0     h'_*     A'_0      E'_1     |  s'_1     |
      | e | t+2 |  Z'_1     h'_1     A'_1      E'_2     |  s'_2     |
      | n |    ...                                     ...          |
      | t | N-1 |  Z'_{j-1} h'_{j-1} A'_{j-1}  E'_j     |  s'_j     |
      |   |   N |  Z'_j     h'_j     A'_\times          |           |
      | p |     |   |        |        |                 |           |
      +---+-----+---|--------|--------X-----------------+-----------+
      |   |     |   V        V                          |           |
      | p |   0 |  Z'_j     h'_j     A'_j      E'_{j+1} |  s'_{j+1} |
      | + |   1 |  Z'_{j+1} h'_{j+1} A'_{j+1}  E'_{j+2} |  s'_{j+2} |
      | 1 |    ...                                     ...          |
      |   |     |                                       |           |
      +---+-----+---------------------------------------+-----------+

    (the evolution of `.hx` is not recorded, only its initial value $h_k$)

    To summarize (`Z = .state`)
      * `Z[t], hx` -->> `Z[t+1].act`, `hx`, `.actor[t]` (afterstate)
      * `Z[t], Z[t+1].act` -->> `.env[t]` and rest of `Z[t+1]`

    Note that the environment is not interacted with at the N-th step, which
    is indicated by `cloning` the environment's state in prior tables. In
    contrast the actor's would-be reaction $A'_\times$ to $Z'_j$ and $h'_j$ is
    requested and recorded into `.actor[N]` of the p-th fragment, but is NOT
    copied into the (p+1)-st fragment, and instead recomputed anew at its
    zero-th interaction,  $A'_j$. Similarly, we ignore the updated recurrent
    state $h'_{j+1}$ after the N-th step and postpone it until the next
    fragment.

    This wastefulness comes from the possibility of the actor's parameters
    being updated between consecutive trajectory fragments within the same
    batch of environments.
    """
    device = torch.device('cpu') if device is None else device
    # assert isinstance(device, torch.device)
    on_host = device.type == 'cpu'

    # determine what auxiliary data should be recorded
    fragment_has_original_obs = hasattr(fragment.npy, 'original_obs')

    # shorthands for fast access
    #  `out[t]` is buffer's $t, x_t, a_{t-1}, r_t$, and $d_t$
    #  `pyt/npy` is the context's $(t, x_t, a_{t-1}, r_t, d_t)$, `hx` is $h_t$
    out, hx = fragment.npy.state, context.pyt.hx
    npy, pyt = context.npy.state, context.pyt.state

    # `original_obs` is the $x_t$ before an automatic reset
    ctx_npy_original_obs = context.npy.original_obs
    fragment_npy_env = fragment.npy.env

    # write the initial recurrent state of the actor to the shared buffer
    pyt_copy_(fragment.pyt.hx, hx)

    # allocate on-device context and recurrent state, if device is not `host`
    pyt_ = pyt
    if not on_host:
        # XXX `suply` always creates a new nested object and copies data in
        #  `pyt` into `pyt_`
        pyt_, hx = suply(torch.Tensor.to, (pyt_, hx), device=device)

    # after each iteration we construct the state `t+1` from `t`:
    #    * `.state[t]` is $(t, x_t, a_{t-1}, r_t, d_t)$
    #    * `.actor[t], hx_` are actor's response to `.state[t]` and $h_t$
    #    * `.env[t]` is env's info from the $s_t, a_t \to s_{t+1}$ step
    #    * `context` is $(t, x_{t+1}, a_t, r_{t+1}, d_{t+1})$,
    #                the original $x_t$ (optional), and the recent env info
    #    * `hx` is $h_{t+1}$, built from `hx_` and actor's resets
    #  and write it to `out[t+1]`. This is OK for Q-learning methods and SARSA
    #  since `out[t]` and `out[t+1]` contain consecutive $x_t$, $a_{t-1}$,
    #  $r_t$ and $x_{t+1}$, whenever `out.fin[t+1]` ($d_{t+1}$) is `False`.
    #  These methods also ignore the Q-value at $x_{t+1}$, if $s_{t+1}$ is
    #  terminal, i.e. $d_{t+1}=\top$, and $x_{t+1}=x_*$.
    n_steps = len(out.fin) - 1
    for t in range(1 + n_steps):  # `fin` is (1 + T) x B
        # copy the state $(t, x_t, a_{t-1}, r_t, d_t)$ from `ctx` to `out[t]`
        suply(setitem, out, npy, index=t)
        # XXX apparently, torch copies host-resident data slower than numpy

        # REACT: $(a_t, h_{t+1})$ are actor's reaction to `.state[t]` and `hx`,
        #  i.e. $(t, x_t, a_{t-1}, r_t, d_t)$, and $h_t$, respectively.
        act_, hx_, info_actor = actor.step(*pyt_, hx=hx, virtual=t >= n_steps)
        # XXX The actor SHOULD respect time and batch dims of the inputs,
        #  except `hx`, but SHOULD NOT change or update anything in-place.

        # `.actor[t] <<-- info`. `fragment.pyt` likely has `is_shared()`,
        #  so it cannot be in the pinned memory.
        pyt_copy_(fragment.pyt.actor, info_actor, at=slice(t, t + 1))
        if t >= n_steps:
            # the T-th REACT interaction within the current trajectory fragment
            #  is used for lookahead only (bootstrap value estimate).
            break

        # the actor may return device-resident tensors, so we copy them here
        pyt_copy_(pyt.act, act_)  # commit $a_t$ into `ctx`

        # STEP + EMIT: `.step` through a batch of envs
        for j, env in enumerate(envs):
            # Only recorded interactions can get stuck: if `.fin = True` when
            #  `t = 0`, then this means the env has been reset elsewhere.
            if sticky and t > 0 and npy.fin[j]:
                # `npy = (s_*, ?, 0, True)` (`.obs` and `.fin` are stuck), but
                # `hx` at `j` may no longer be a genuine recurrent state $h_t$.
                npy.rew[j] = 0.
                continue

            # get $(s_t, a_t) \to (s_{t+1}, x_{t+1}, r_{t+1}, d_{t+1})$
            act_ = suply(getitem, npy.act, index=j)  # act might be structured
            obs_, rew_, fin_, info_env = env.step(act_)

            # We cannot pre-unbind the context's arrays, since, unlike torch,
            #  numpy does not slice scalars as views into particular items.
            npy.stepno[j] += 1

            # gym's api guarantees that `info_env` is a dict
            if info_env:
                suply(setitem, fragment_npy_env, info_env, index=(t, j))

            suply(setitem, ctx_npy_original_obs, obs_, index=j)

            # `fin_` indicates if `obs_` is terminal and a reset is needed
            if fin_:
                # XXX DO NOT alter the received reward from the terminal step!
                npy.stepno[j] = 0  # start a new trajectory

                # substitute the terminal observation $x_{t+1}$ with an initial
                #  $x_*$, reset the (unobserved) $s_{t+1}$ to $s_*$ and zero
                #  the actor's recurrent state $h_* \to h_{t+1}$ at env $j$
                obs_ = env.reset()  # s_{t+1} \to s_*, emit x_* from s_*
                hx_ = actor.reset(hx_, j)  # h_{t+1} \to h_* at the j-th env

            # update $x_{t+1}, r_{t+1}, d_{t+1}$ in the j-th env in `ctx`
            suply(setitem, npy.obs, obs_, index=j)
            npy.rew[j] = rew_
            npy.fin[j] = fin_

        # copy back into `hx` in case it is in the pinned memory
        pyt_copy_(hx, hx_)

        # update the device-resident copy of the `ctx` (`hx` is already OK)
        if pyt_ is not pyt:
            pyt_copy_(pyt_, pyt)

        if fragment_has_original_obs:
            suply(setitem, fragment.npy.original_obs, ctx_npy_original_obs,
                  index=t)

    # `hx` may have been spuriously updated on the stationary stuck inputs,
    #  so we ask the actor to reset it one last time for good measure.
    if sticky:
        # If `pyt_` is terminal, then its original `act` and `rew` might have
        #  been overwritten, and `hx` may have been spuriously advanced.
        #  Otherwise, the contents in `pyt_` are from the most recent state in
        #  the rollout, and `hx` is a genuine recurrent state.
        for j in range(len(envs)):
            if npy.fin[j]:
                pyt_copy_(hx, actor.reset(hx, j))

    # write back the most recent recurrent state for the next rollout
    # XXX here `hx` is deliberately not `hx_` to avoid updating the recurrent
    #  state due to the lookahead virtual REACT step
    pyt_copy_(context.pyt.hx, hx)

    return True
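A standalone sketch of the return recursion documented above, assuming plain
float numpy inputs shaped like `out.state.rew` and `out.state.fin` (this
helper is illustrative and not part of the library):

import numpy


def returns(rew, fin, *, gamma, bootstrap):
    """Compute G[t] ~ G_pi(s_t) from `(1 + T) x B` rewards and done flags."""
    T = len(rew) - 1
    G = numpy.zeros_like(rew)
    G[T] = numpy.where(fin[T], 0., bootstrap)  # v(s_N), unless terminal
    for t in range(T - 1, -1, -1):
        G[t] = rew[t + 1] + gamma * (1. - fin[t + 1]) * G[t + 1]
    return G[:T]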
Example #13
def context(
    *envs,
    pinned=False,
):
    r"""Allocate aliased running state for simple rollout collection.

    Parameters
    ----------
    *envs : gym.Env
        The batch of environments used to initialize the structure and shapes
        of observation and action buffers in the context. Determines the number
        of environments in the context (dim=1).

        WARNING: Each environment is reset AT LEAST once. One environment is
        stepped through EXACTLY once using one action sampled from its space.

    pinned: bool, default=False
        The underlying storage of the newly created tensors resides in pinned
        memory (non-paged) for faster host-device transfers.

    Returns
    -------
    ctx : aliased State
        The running environment-actor context which contains properly time
        synchronised input data for the actor $x_t$, $a_{t-1}$, $r_t$, and
        $d_t$, EXCEPT for the recurrent state $h_t$.

        See docs of `startup` for details of `ctx` aliasing.

    env : aliased nested object with tensor data, shape = (1, batch, ...)
        The extra data received from the batch of environments' `.step` upon
        actually TAKING the actions $a_t$ in them.

    Details
    -------
    This is a version of `startup()`, specialized for actor-less context
    initialization. It returns a simplified context, which contains only
    the State, e.g. the `obs-act-rew-fin` data, and omits everything else,
    namely, environment's auxiliary info, actor's extra info, and its recurrent
    state `hx`.
    """
    env = envs[0]

    # prepare the running context from the data of one env, which is reset.
    obs_ = env.reset()
    act_ = env.action_space.sample()
    _, rew_, fin_, info_ = env.step(act_)

    # the buffer for the aux env info data is `1 x n_envs x ...`
    info_ = aliased(torchify(info_, 1, len(envs), pinned=pinned, copy=True))

    # ensure correct data types for `rew_` (to float32) and `fin_` (to bool)
    state_ = State(numpy.int64(0), obs_, act_, numpy.float32(rew_), bool(fin_))

    # torchify and alias, then add unit-time dim to `.pyt` in-place
    state = aliased(torchify(state_, len(envs), pinned=pinned, copy=True))
    suply(torch.Tensor.unsqueeze_, state.pyt, dim=0)

    # Flag the state as having just been reset, meaning that the previous
    #  reward and action are invalid.
    state.npy.fin[:] = True
    state.npy.rew[:] = 0.  # zero `.rew`, leave `.act` undefined
    state.npy.stepno[:] = 0
    for j, env in enumerate(envs):
        suply(setitem, state.npy.obs, env.reset(), index=j)  # x_0 = s_*

    return state, info_
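Why the in-place `unsqueeze_` keeps the numpy-torch aliasing intact (a
standalone check): the unsqueezed tensor is a view over the same storage, so
the `n_envs x ...` numpy arrays and the `1 x n_envs x ...` tensors keep
referencing the same data:

import torch

pyt = torch.zeros(4)
npy = pyt.numpy()  # alias the same underlying storage
pyt.unsqueeze_(0)  # in-place: now 1 x 4, still backed by that storage
npy[2] = 5.
print(pyt)  # tensor([[0., 0., 5., 0.]])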
Example #14
def startup(
    envs,
    actor,
    buffer,
    *,
    pinned=False,
):
    """Alias the rollout buffer and allocate aliased running context.

    Parameters
    ----------
    envs : list of gym.Env
        The batch of stateful environments to be reset.

        WARNING: Each environment is reset once.

    actor : BaseActorModule
        The actor, the recurrent context of which is to be reset.

    buffer : Fragment
        The reference buffer used to gather the specs and the nested structure
        for the running environment-actor context.

    pinned: bool, default=False
        Determines if the underlying storage of newly created tensors for
        the running env-state context should reside in pinned memory for
        faster host-device transfers.

    Returns
    -------
    ctx : aliased Context
        The running environment-actor context which contains properly time
        synchronised input data for the actor:

            `.state` is $t, x_t, a_{t-1}, r_t, d_t$, and `.hx` is $h_t$.

        Although `.npy` arrays are `n_envs x ...`, while `.pyt` tensors are
        `1 x n_envs x ...`, the data in the context is aliased, i.e. `.npy`
        arrays and `.pyt` tensors reference the SAME underlying data storage,
        which allows changes in one be INSTANTLY reflected in the other.

    fragment : aliased Fragment
        The numpy-torch aliased trajectory fragment buffer. It is created in
    a zero-copy manner, so it also aliases the `buffer` input parameter.

    Details
    -------
    The created `context` has data in torch tensors that reside in shared or
    pinned memory. Only after having been created, the tensors are aliased by
    numpy arrays (see details in `rlplay.engine.utils.shared`) for zero-copy
    data interchange.
    """
    # `fragment` contains nested containers with tensor data possibly residing
    #  in shared memory. The `.actor` [1+T x B x ...] and `.env` [T x B x ...],
    #  the rollout `.state` [1 + T x B x ...]
    #      $(t, x_t, a_{t-1}, r_t, d_t)_{t=0}^T$,
    #  and the initial recurrent state `hx`.
    fragment = aliased(buffer)  # just a zero-copy pyt-npy alias

    # Fetch a single [B x ?] observation (a VIEW into fragment for now)
    npy, pyt = suply(
        getitem,
        (fragment.npy.state, fragment.pyt.state,),
        index=0,
    )
    hx = fragment.pyt.hx

    # Flag the state as having just been reset, meaning that the previous
    #  reward and action are invalid.
    npy.fin[:] = True
    npy.rew[:] = 0.  # zero `.rew`, leave `.act` undefined
    npy.stepno[:] = 0
    for j, env in enumerate(envs):
        suply(setitem, npy.obs, env.reset(), index=j)  # x_0 = s_*

        # reset the actor's initial recurrent state for env `j`
        hx = actor.reset(hx, j)  # h_0[j] = h_*

    # Create `context`, a dedicated container of [B x ?] aliased copies of
    # the data and the current recurrent state `hx` of the actor $h_t$, both
    # possibly residing in torch's pinned memory.
    context = aliased(Context(pyt, hx, pyt.obs), copy=True, pinned=pinned)

    # writable view of `context.pyt.state` with an extra temporal dim
    suply(torch.Tensor.unsqueeze_, context.pyt.state, dim=0)  # in-place!
    # XXX we do this so that the actor may rely on [T x B x ...] data on input

    # `pyt` is used for interacting with the actor, `npy` -- with the fragment
    #  and both are just different interfaces to the same underlying data.
    # anon. torch storage <<--editable unsqueezed view-->> `pyt` torch tensors
    #        ditto        <<--__array__ data aliasing -->> `npy` numpy arrays
    return context, fragment
Example #15
def prepare(
    env,
    actor,
    n_steps,
    n_envs,
    *,
    pinned=False,
    shared=False,
    device=None,
):
    """Build a nested object with tensor data for rollout trajectory fragments.

    Parameters
    ----------
    env : gym.Env
        The reference environment used to initialize the structure and shapes
        of observation and action buffer.

        WARNING: The env is reset once and one random action is performed in it.

    actor : Actor
        The actor used to initialize the buffers for the recurrent context and
        auxiliary info for a batch of environments.

        WARNING: `actor.step(..., hx=None)` is called once.

    n_steps : int
        The length of the rollout trajectory fragment to be stored in the
        constructed buffer (dim=0).

    n_envs : int
        The number of environments in a single buffer (dim=1).

    pinned: bool, default=False
        The underlying storage of the newly created tensors resides in pinned
        memory (non-paged) for faster host-device transfers.

        Cannot be used with `shared=True`.

    shared: bool, default=False
        Allocate the underlying storage of new tensors using torch's
        interprocess memory sharing logic, which makes it so that all changes
        to the data are reflected between all processes.

        Cannot be used with `pinned=True`.

    device : torch.device, or None
        The device to use when getting the example output from the provided
        actor.

    Returns
    -------
    fragment : nested object with tensor data, shape = (n_steps, n_envs, ...)
        The buffer into which the trajectory fragment will be recorded.

    Details
    -------
    We build buffers with torch in pinned or shared memory, then mirror them
    to numpy.
    """
    # reset, and take a random action in the environment
    obs_ = env.reset()
    act_ = env.action_space.sample()

    # `info_env` is a nested container of numeric scalars or numpy arrays
    # representing auxiliary environment information associated with the
    # transition.
    obs_, rew_, fin_, info_env = env.step(act_)

    # ensure correct data types for `rew_` (to float32) and `fin_` (to bool),
    # while leaving `obs_` and `act_` intact as they are nested containers of
    # numpy arrays or scalars with environment's proper dtypes.
    rew_, fin_, stepno_ = numpy.float32(rew_), bool(fin_), numpy.int64(0)
    # XXX `numpy.float32` inadvertently allows vector rewards

    # the buffer for the aux env info data is `n_steps x n_envs x ...`
    info_env = torchify(info_env, n_steps, n_envs,
                        shared=shared, pinned=pinned)

    # allocate `(1 + n_steps) x n_envs x ...` torch tensor buffers for
    #  the observations, actions, rewards and termination flags
    state = torchify(State(stepno_, obs_, act_, rew_, fin_),
                     1 + n_steps, n_envs, shared=shared, pinned=pinned)

    # make a single pass through the actor with one `1 x n_envs x ...` batch
    pyt = suply(lambda x: x[:1].to(device), state)
    unused_act, hx, info_actor = actor.step(*pyt, hx=None, virtual=True)
    # XXX `act_` is expected to have identical structure to `unused_act`
    # XXX `info_actor` must respect the temporal and batch dims

    # the actor fully specifies its context `hx`, so we torchify it as is
    hx = torchify(suply(torch.Tensor.cpu, hx), shared=shared, pinned=pinned)

    # get one time slice from the actor's info `n_envs x ...` and expand into
    #  an `(1 + n_steps) x n_envs x ...` structured buffer
    info_actor = torchify(suply(lambda x: x[0].cpu(), info_actor),
                          1 + n_steps, shared=shared, pinned=pinned)

    # bundle the buffers into a trajectory fragment
    return Fragment(state=state, actor=info_actor, env=info_env, hx=hx)
Example #16
def p_evaluate(ss,
               ctrl,
               factory,
               shared,
               n_envs,
               n_steps,
               *,
               clone=True,
               close=False,
               device=None):
    ctrl.omega.tx.close()
    ctrl.omega.rx.close()

    # always pin the runtime context if the device is 'cuda'
    device = torch.device('cpu') if device is None else device
    pinned, on_host = device.type == 'cuda', device.type == 'cpu'

    # disable multithreaded computations in the worker processes
    torch.set_num_threads(1)

    # use the shared reference actor, unless we must clone or move off-host
    actor = shared
    if not on_host or clone:
        # make an identical local copy
        actor = deepcopy(shared).to(device)

    # spawn a batch of environments
    env_seeds = ss.spawn(n_envs)  # spawn child seeds

    # prepare local envs and the associated local env-state runtime context
    envs = [factory(seed=seed) for seed in env_seeds]

    # prepare an aliased running context for the specified number of envs
    ctx, info_env = context(*envs, pinned=pinned)
    # `ctx` is $x_*, a_{-1}, r_0, \top, h_0$, where `r_0` is undefined

    # fast access to context's aliases
    npy, pyt = ctx.npy, ctx.pyt

    # Allocate on-device context and recurrent state, if device is not None
    pyt_ = pyt
    if not on_host:
        # XXX this also copies data in `pyt` into `pyt_`
        pyt_ = suply(torch.Tensor.to, pyt_, device=device)

    try:
        # unlike `core.evaluate`, this loop only collects the rewards, since
        #  actor's info may quickly exceed the size limit of `pipe.send`, and
        #  tensors in shared memory are not resizable.
        while True:
            # collect the evaluation data: let the actor init `hx` for us
            rewards, done, t, hx = [], False, 0, None
            while not done and t < n_steps:
                # REACT: $(x_t, a_{t-1}, r_t, d_t, h_t) \to a_t$ and commit $a_t$
                act_, hx, info_ = actor.step(*pyt_, hx=hx, virtual=False)
                pyt_copy_(pyt.act, act_)

                # STEP + EMIT: `.step` through a batch of envs
                for j, env in enumerate(envs):
                    # cease interaction with terminated envs
                    if npy.fin[j] and t > 0:
                        npy.rew[j] = 0.
                        continue

                    # get $(s_t, a_t) \to (s_{t+1}, x_{t+1}, r_{t+1}, d_{t+1})$
                    act_ = suply(getitem, npy.act, index=j)
                    obs_, rew_, fin_, info_env = env.step(act_)
                    npy.stepno[j] += 1
                    if fin_:
                        npy.stepno[j] = 0  # start a new trajectory
                        # s_{t+1} \to s_*, emit x_* from s_*
                        obs_ = env.reset()
                        # XXX unlike `core.collect`, do not reset `hx[j]` here

                    # update the j-th env's $x_{t+1}, r_{t+1}, d_{t+1}$ in `ctx`
                    suply(setitem, npy.obs, obs_, index=j)
                    npy.rew[j], npy.fin[j] = rew_, fin_

                # move the updated `ctx` to its device-resident torch copy
                if pyt_ is not pyt:
                    pyt_copy_(pyt_, pyt)

                # stop only when all environments have terminated
                done = numpy.all(npy.fin)

                rewards.append(npy.rew.copy())
                t += 1

            try:
                # block until the request and then immediately send the result
                ctrl.alpha.rx.recv()
                ctrl.alpha.tx.send(sum(rewards))

            # if the request pipe (its write endpoint) is closed, then
            #  this means that the parent process wants us to shut down.
            except EOFError:
                break

            # update parameters from the shared reference actor
            if actor is not shared:
                actor.load_state_dict(shared.state_dict(), strict=True)

    except Exception:
        from traceback import format_exc
        ctrl.error.put(format_exc())
        sys.exit(1)

    finally:
        # closing our ends of the pipes lets the parent know that this
        #  worker has stopped
        ctrl.alpha.tx.close()
        ctrl.alpha.rx.close()

        # close the environments, if requested
        if close:
            for env in envs:
                env.close()
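
For reference, a hypothetical parent-side counterpart to the `ctrl.alpha`
handshake above; the names `request_tx` and `result_rx` are assumptions,
not part of the code shown:

def request_rewards(request_tx, result_rx):
    """Ask a running `p_evaluate` worker for one evaluation result.

    `request_tx` and `result_rx` are assumed to be the parent-held pipe
    ends paired with the worker's `ctrl.alpha.rx` and `ctrl.alpha.tx`.
    """
    request_tx.send(None)    # any object serves as the request token
    return result_rx.recv()  # the rewards summed over the evaluation run

# closing the request pipe raises EOFError in the worker's `.recv()`,
#  which it interprets as a shutdown signal:
# request_tx.close()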
Example No. 17
def episode(envs, actor, *, device=None):
    """Episodic data generator.

    Parameters
    ----------
    envs : list of gym.Env
        The stateful evaluation environments to step through.

    actor : BaseActorModule
        The actor, which steps through the batch of environments.

    device : torch.device, default=None
        The device onto which to put the input $x_t$ `obs`, $a_{t-1}$ `act`,
        $r_t$ `rew`, $d_t$ `fin`, and $h_t$ `hx` for the actor when stepping
        through the test environments.

    Yields
    ------
    state : State, shape = (1, n_envs, ...)
        The obs-act-rew-fin state with tensor data and the same semantics as
        described in `State`.

    hx : nested object with tensor data
        The present recurrent state of the actor $h_t$, which conditions
        its response along with the `state` input.

    actor : nested object with tensor data, shape = (1, n_envs, ...)
        The extra afterstate information returned by the actor's `.step`.

    next : State, shape = (1, n_envs, ...)
        The next obs-act-rew-fin state with the original true observation
        $x_{t+1}$ before any reset.

    env : nested object with tensor data, shape = (1, n_envs, ...)
        The extra data received from the batch of environments' `.step` upon
        actually TAKING the actions $a_t$ in them.
    """
    # always pin the runtime context if the device is 'cuda'
    device = torch.device('cpu') if device is None else device
    pinned, on_host = device.type == 'cuda', device.type == 'cpu'

    # prepare a running context for the specified number of envs
    ctx, info_env = context(*envs, pinned=pinned)
    # `ctx` is $x_*, a_{-1}, r_0, \top, h_0$, where `r_0` is undefined

    # fast access to context's aliases
    npy, pyt, info_env_pyt = ctx.npy, ctx.pyt, info_env.pyt

    # Allocate on-device copies of the context and the env-info, if not
    #  on the host
    pyt_, info_env_pyt_ = pyt, info_env_pyt
    if not on_host:
        pyt_, info_env_pyt_ = suply(torch.Tensor.to, (pyt_, info_env_pyt),
                                    device=device)

    # let the actor init `hx` for us: `torch.nn.LSTM` performs two steps: inits
    #  `hx` if it is `None` and then updates it. We undo the second step here.
    _, hx, _ = actor.step(*pyt_, hx=None, virtual=True)
    hx = actor.reset(hx, at=slice(None))

    # collect the evaluation data
    current_ = suply(torch.clone, pyt_)
    while True:
        # REACT: $(t, x_t, a_{t-1}, r_t, d_t, h_t) \to a_t$ and commit $a_t$
        act_, hx_, info_actor = actor.step(*current_, hx=hx, virtual=False)

        # STEP + EMIT: `.step` through a batch of envs
        pyt_copy_(pyt.act, act_)
        for j, env in enumerate(envs):
            # get $(s_t, a_t) \to (s_{t+1}, x_{t+1}, r_{t+1}, d_{t+1})$
            act_ = suply(getitem, npy.act, index=j)
            obs_, rew_, fin_, info_ = env.step(act_)

            npy.stepno[j] += 1
            suply(setitem, info_env.npy, info_, index=j)

            # update the j-th env's $x_{t+1}, r_{t+1}, d_{t+1}$ in `ctx`
            suply(setitem, npy.obs, obs_, index=j)
            npy.rew[j], npy.fin[j] = rew_, fin_

        # update device-resident copies of `ctx` and `info_env`
        if not on_host:
            pyt_copy_((pyt_, info_env_pyt_), (pyt, info_env_pyt))

        # yield: state[t], h_t, actor[t], state[t+1], env[t+1]
        yield current_, hx, info_actor, pyt_, info_env_pyt_

        # reset terminated envs (see `context(...)`)
        for j, env in enumerate(envs):
            if npy.fin[j]:
                hx_ = actor.reset(hx_, j)  # h_{t+1} \to h_* at the j-th env

                # s_{t+1} \to s_*, emit x_* from s_*, reset the rest
                suply(setitem, npy.obs, env.reset(), index=j)
                npy.stepno[j], npy.rew[j] = 0, 0.

        # move the final `ctx` to its device-resident copy
        if pyt_ is not pyt:
            pyt_copy_(pyt_, pyt)

        # overwrite the current state for the next iteration
        pyt_copy_(current_, pyt_)
        hx = hx_
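
A minimal driving loop for this generator might look as follows, assuming
`envs` and `actor` are set up as in the docstring and that the torch
context exposes a `.rew` alias like its numpy twin does:

import torch

it = episode(envs, actor, device=torch.device('cpu'))
try:
    rewards = []
    for _ in range(100):
        state, hx, info_actor, nxt, info_env = next(it)
        # `nxt` is the post-step context with the pre-reset observation
        rewards.append(nxt.rew.clone())
finally:
    it.close()  # stop the generator cleanly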
Example No. 18
def rollout(
    factory,
    actor,
    n_steps,
    # number of actors and environments each interacts with
    n_actors=8,
    n_per_actor=2,
    # the size of the rollout buffer pool (must have spare buffers)
    n_buffers=16,
    n_per_batch=4,
    *, sticky=False, pinned=False, close=False, clone=True, device=None,
    start_method=None, timeout=10, affinity=None, entropy=None
):
    check_signature(factory, seed=None)

    # the device to put the batches onto
    device = torch.device('cpu') if device is None else device

    # the devices on which to run the worker subprocesses
    if not isinstance(affinity, (tuple, list)):
        affinity = (affinity,) * n_actors
    assert len(affinity) == n_actors

    # get the correct multiprocessing context (torch-friendly)
    mp = get_context(start_method)

    # initialize a reference buffer and make its shared copies
    env = factory(seed=None)  # XXX seed=None here since used only once

    # create a host-resident copy of the module in shared memory, which
    #  serves as a vessel for updating the actors in workers
    shared = deepcopy(actor).cpu().share_memory()

    # build a reference buffer via a single virtual forward pass through
    #  the copy
    ref = prepare(env, shared, n_steps, n_per_actor, pinned=False, device=None)

    # some environments don't like being closed or deleted, e.g. `nle`
    if close:
        env.close()

    # pre-allocate a buffer for the collated batch
    #  (non-paged physical memory for faster host-device transfers)
    # XXX `torch.cat` and `.stack` always allocate a new tensor
    stacked = suply(torch.Tensor.to,
                    tuply(torch.stack, *(ref,) * n_per_batch, dim=1),
                    device=device)

    # create per-fragment slice views along dim=1 and a flattened
    #  `T x (n_per_batch * n_envs) x ...` view into the stacked data
    batch = suply(torch.flatten, stacked, start_dim=1, end_dim=2)
    batch_slice = tuple(suply(getitem, stacked, index=(slice(None), j))
                        for j in range(n_per_batch))

    # create buffers for trajectory fragments in the shared memory
    # (shared=True always makes a copy).
    # XXX torch tensors have much simpler pickling/unpickling when sharing
    buffers = torchify((ref,) * n_buffers, shared=True)

    del ref

    # setup buffer index queues, and create a state-dict update lock, which
    #  makes `actor ->> shared` updates atomic
    # XXX random partial updates might inject beneficial stochasticity
    ctrl = Control(mp.Lock(), mp.Queue(), mp.Queue())
    for index, _ in enumerate(buffers):
        ctrl.empty.put(index)

    # prepare a seed sequence for each worker
    ss = SeedSequence(entropy=entropy)
    worker_ss = ss.spawn(n_actors)

    # spawn worker subprocesses (nprocs is world size)
    p_workers = start_processes(
        p_stepper, start_method=mp._name, daemon=False, nprocs=n_actors,
        # collectors' device may be other than the main device
        join=False, args=(
            n_actors, worker_ss, ctrl, CloudpickleSpawner(factory), buffers,
            shared, clone, sticky, close, affinity
        ))

    # fetch ready trajectory fragments and collate them into a batch
    try:
        while True:
            for j in range(n_per_batch):
                ix = ctrl.ready.get(timeout=timeout)

                # XXX the non-blocking copy only takes effect with pinned memory
                suply(torch.Tensor.copy_, batch_slice[j],
                      buffers[ix], non_blocking=True)

                ctrl.empty.put(ix)

            yield batch

            # ensure consistent update of the shared module
            # XXX tau-moving average update?
            with ctrl.reflock:
                shared.load_state_dict(actor.state_dict(), strict=True)

    except QueueEmpty:
        pass

    finally:
        # shut down the workers: we don't care about the `ready` indices;
        #  workers cannot detect a closed `empty` queue, so send each one
        #  a `None` sentinel instead
        for _ in range(n_actors):
            ctrl.empty.put(None)

        # `start_processes` loops on `.join` until it returns True or raises
        while not p_workers.join():
            pass

        ctrl.empty.close()
        ctrl.ready.close()
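
A hypothetical consumer loop for this generator: each yielded `batch`
aliases the same preallocated storage, so it should be consumed before
the next iteration; `compute_loss` and `optim` below are placeholders,
not part of the API shown above:

import torch

feed = rollout(factory, actor, 51, n_actors=8, n_per_actor=2,
               device=torch.device('cpu'))
for batch in feed:
    loss = compute_loss(actor, batch)  # placeholder loss function
    optim.zero_grad()
    loss.backward()
    optim.step()
    # on resumption the generator re-syncs the shared actor from
    #  `actor` under `ctrl.reflock`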