Example No. 1
    def executeStrategy(self,
                        pi,
                        nSteps,
                        initialState,
                        callback=None,
                        id=0,
                        verbose=False):
        # create arrays to store the trajectories and initialize first state
        states = np.zeros(nSteps + 1, dtype=int)
        states[0] = initialState
        actions = np.zeros(nSteps, dtype=int)
        rewards = np.zeros(nSteps)
        if callback:
            cb_results = np.empty(nSteps + 1, dtype=object)
            cb_results[0] = callback.callback(self, pi)
        else:
            # no callback: keep the return signature intact
            cb_results = None

        # generate the trajectory
        for t in range(nSteps):
            # print output
            if verbose:
                print(f"trajectory: {id}   time step: {t}/{nSteps}")

            # extract current state and (randomly) select action according to the specified policy
            s1 = states[t]
            a = pi.selectActions(s1)

            # evaluate reward and sample next state
            r = self.earnRewards(s1, a)
            # Check type of transition model
            if isinstance(self.T, Simulator):
                s2 = self.T.simulate(curStates=s1, curActions=a)
            else:
                s2 = sampleDiscrete(self.T[:, s1, a])

            # update strategy
            pi.update(s1, a, r, s2)

            # run callback function
            if callback:
                cb_results[t + 1] = callback.callback(self, pi)

            # store values in arrays (reuse the next state s2 sampled above so that the
            # stored trajectory matches the transition the policy was updated with)
            states[t + 1] = s2
            actions[t] = a
            rewards[t] = r

        return states, actions, rewards, cb_results

    def selectActions(self, states):
        """
        Selects actions at the specified states.
        :param states: array of arbitrary length specifying the query states
        :return: array of the same size as the query array containing the selected actions
        """
        states = np.array(states)

        if self.isStochastic:
            # stochastic policy
            if states.ndim == 0:
                return sampleDiscrete(self.decisionTable[states, :], axis=0)
            return sampleDiscrete(self.decisionTable[states, :], axis=1)

        else:
            # deterministic policy
            return self.decisionTable[states]
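
The helper sampleDiscrete is used throughout these examples but never shown. Below is a minimal sketch of what such a helper might look like, assuming (inferred from the call sites, not the original implementation) that it draws one index from a 1-D probability vector (or several independent draws via size) and, for a 2-D table, one index per distribution laid out along the given axis.

# Minimal sketch of the sampleDiscrete helper assumed by these examples.
import numpy as np

def sampleDiscrete(probabilities, axis=0, size=None):
    probabilities = np.asarray(probabilities, dtype=float)
    if probabilities.ndim == 1:
        # single distribution: one draw (size=None) or an array of independent draws
        return np.random.choice(len(probabilities), size=size, p=probabilities)
    # 2-D table: each slice along 'axis' is a distribution; draw one index per slice
    if axis == 0:
        probabilities = probabilities.T
    return np.array([np.random.choice(len(p), p=p) for p in probabilities])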
Example No. 3
    def gibbs_step(goals):
        # get Gibbs sample from the PG conditional distribution
        goal_distributions = fitGoalDists(goals)

        # combine with "prior" (i.e. the action likelihoods) in the log domain and convert back to linear domain
        weights = softmax(np.log(goal_distributions) + L, axis=1)

        # return a sample from the resulting conditional distribution
        return sampleDiscrete(weights, axis=1)
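
gibbs_step above is a nested helper: fitGoalDists, the log-likelihood matrix L, and softmax (presumably scipy.special.softmax) come from the enclosing scope and are not shown. A hedged sketch, with hypothetical names, of how such a step is usually iterated into a full Gibbs sampler:

# Illustrative only: repeatedly resample the goal assignments with a step function
# such as gibbs_step; burn-in and thinning are omitted for brevity.
def run_gibbs(step, initial_goals, n_sweeps):
    goals = initial_goals
    samples = []
    for _ in range(n_sweeps):
        # one sweep: draw new goal assignments from their conditional distribution
        goals = step(goals)
        samples.append(goals)
    return samples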
Example No. 4
def selectActions(decisionTable, states):
    """
    Selects actions at the specified states for a given (deterministic or stochastic) policy.

    :param decisionTable: Tabular representation of the policy. Can be either
        - [S] array containing actions for all states of the MDP
        - [S x A] array, where each row represents a discrete distribution over actions
    :param states: array of arbitrary length specifying the query states
    :return: array of the same size as the query array containing the selected actions
    """
    # convert input
    decisionTable = np.asarray(decisionTable)
    states = np.atleast_1d(states)

    if decisionTable.ndim == 1:
        # deterministic policy
        return decisionTable[states]
    else:
        # stochastic policy
        return sampleDiscrete(decisionTable[states, :], axis=1)
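
A small usage sketch for the function above; the numbers are toy values and sampleDiscrete is assumed to behave as sketched after Example No. 1.

# Illustrative usage with toy policy tables (not part of the original example).
deterministicPi = np.array([1, 0, 2])          # one action per state
stochasticPi = np.array([[0.9, 0.1, 0.0],      # each row: distribution over actions
                         [0.2, 0.5, 0.3],
                         [0.0, 0.0, 1.0]])
queryStates = [0, 2, 1]
print(selectActions(deterministicPi, queryStates))   # -> [1 2 0]
print(selectActions(stochasticPi, queryStates))      # one random action per query state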
Example No. 5
    def sampleTrajectories(self,
                           nTrajs,
                           nSteps,
                           pi=None,
                           initialStates=None,
                           initialDistribution=None,
                           dropLastState=False,
                           computeRewards=False,
                           parallel=True,
                           callback=None,
                           verbose=False):
        """
        Samples a specified number of trajectories of a given length from the MDP.

        :param pi: Parameter containing the policy. Can be either
            - [S] array containing actions for all states of the MDP
            - [S x A] array, where each row represents a discrete distribution over actions
            - policy object
        :param nTrajs: Integer specifying the number of trajectories to be generated
        :param nSteps: Integer specifying the number of state transitions per trajectory (see return values)
        :param initialStates: Parameter to specify fixed starting states for the trajectories.
            - 'None': the starting states are sampled from the specified initial distribution
            - integer: all trajectories start from the specified state
            - array of length nTrajs: each trajectory starts from its own specified state
        :param initialDistribution: Initial distribution to generate the starting states
            - 'None' and initialStates is also 'None': a uniform distribution over states is assumed
            - 'None' but initialStates is given: the specified initial states are used
            - [S] array: treated as distribution over states
        :param dropLastState: Boolean indicating if last state of each trajectory should be dropped
        :param computeRewards: Boolean indicating if rewards should be computed along the trajectories (only
            relevant for stationary policies; for policy objects, the rewards are computed anyway as part of
            the policy update)
        :param parallel: Boolean to enable parallel generation of trajectories for policy objects in separate
            threads. (Stationary policies are processed in parallel by default, via vectorization.)
        :param callback: optional callback object whose callback method is executed after each policy update
        :param verbose: boolean to print progress
        :return: dictionary with keys 'states', 'actions' (optionally: 'rewards', 'callbacks')
            states: [nTrajs, L] array containing the generated state sequences.
                L = nSteps + 1 if dropLastState == False
                L = nSteps if dropLastState == True
            actions: [nTrajs, nSteps] array containing the generated action sequences.
            rewards: [nTrajs, nSteps] array containing the collected rewards
            callbacks: [nTrajs, L] array containing the callback results (L: see states)
        """
        # by default, use policy stored in MDP
        if pi is None:
            pi = self.pi

        # assert that either the initial states or the initial distribution is specified, but not both
        if (initialStates is not None) and (initialDistribution is not None):
            raise ValueError(
                "either 'initialStates' or 'initialDistribution' must be 'None'"
            )

        # if initial states are not specified, use the specified initial distribution or a uniform distribution
        if initialStates is None:
            if initialDistribution is None:
                initialDistribution = np.full(self.nStates, 1 / self.nStates)
            initialStates = sampleDiscrete(initialDistribution, size=nTrajs)

        # check type of policy
        if isinstance(pi, Policy):
            if self.R is None:
                raise ValueError(
                    'a strategy policy requires a valid reward model')
            baseStrategy = pi
            isStrategy = True
        else:
            isStrategy = False

        # create arrays to store the trajectories and initialize first states
        states = np.zeros((nTrajs, nSteps + 1), dtype=int)
        states[:, 0] = initialStates
        actions = np.zeros((nTrajs, nSteps), dtype=int)
        if computeRewards:
            rewards = np.zeros((nTrajs, nSteps))
        if callback:
            cb_results = np.empty((nTrajs, nSteps + 1), dtype=object)

        # ----- trajectory generation -----#

        # stationary policy
        if not isStrategy:
            for t in range(nSteps):
                # print output
                if verbose:
                    print(f"time step: {t}/{nSteps}")

                # extract current state and (randomly) select action according to the specified policy
                s = states[:, t]
                a = self.selectActions(s, pi)

                # collect rewards
                if computeRewards:
                    rewards[:, t] = self.earnRewards(s, a)

                # sample next states and store new time slice in arrays
                # Check type of transition model
                if isinstance(self.T, Simulator):
                    states[:, t + 1] = self.T.simulate(curStates=s,
                                                       curActions=a)
                else:
                    states[:, t + 1] = sampleDiscrete(self.T[:, s, a])
                actions[:, t] = a

        # strategy policy
        else:
            # parallel execution via pool
            if parallel:

                def f(initState, id):
                    np.random.seed(id)
                    return self.executeStrategy(pi, nSteps, initState,
                                                callback, id, verbose)

                result = parallelMC(f, [[i] for i in states[:, 0]])
                states, actions, rewards, cb_results = map(
                    np.vstack, zip(*result))
            # serial execution
            else:
                for traj in range(nTrajs):
                    pi = deepcopy(baseStrategy)
                    states[traj, :], actions[traj, :], trajRewards, trajCallbacks = \
                        self.executeStrategy(pi, nSteps, states[traj, 0], callback, traj, verbose)
                    if computeRewards:  # rewards array exists only if requested
                        rewards[traj, :] = trajRewards
                    if callback:  # likewise for the callback results
                        cb_results[traj, :] = trajCallbacks

        # ----- end of trajectory generation -----#

        # if desired, drop last states
        if dropLastState:
            states = states[:, :-1]
            if callback:
                cb_results = cb_results[:, :-1]

        # construct output dictionary
        result = {'states': states, 'actions': actions}
        if computeRewards:
            result['rewards'] = rewards
        if callback:
            result['callbacks'] = cb_results

        return result
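
A hedged usage sketch for sampleTrajectories: 'mdp' stands for an already constructed instance of the surrounding class (its constructor is not part of this example), and nActions is an assumed attribute.

# Illustrative call only; 'mdp' and 'mdp.nActions' are assumptions, not shown above.
uniformPi = np.full((mdp.nStates, mdp.nActions), 1 / mdp.nActions)  # stationary stochastic policy
result = mdp.sampleTrajectories(nTrajs=10, nSteps=50, pi=uniformPi,
                                computeRewards=True, dropLastState=False)
print(result['states'].shape)   # (10, 51): nSteps + 1 states per trajectory
print(result['actions'].shape)  # (10, 50)
print(result['rewards'].shape)  # (10, 50)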