示例#1
0
    def solve(self, env, policy, minEnergy=-1000):
        """
        Roll out ``policy`` in ``env``, starting from the environment's start state.

        In every visited state the action stored in ``policy`` is executed. When
        the policy has no entry for the state (or maps it to ``None``) a random
        action out of ``env.getActions(state)`` is executed instead.

        @param env: Environment offering ``getStartingState()``,
            ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
            tuple ``(nextState, reward, isTerminalState)``.
        @param policy: Mapping state -> action (only ``.get`` is used).
        @param minEnergy: Abort threshold. The rollout stops as soon as the
            accumulated reward drops below this value, which keeps a bad policy
            from looping forever. Defaults to -1000.
        @return: Tuple ``(actions, energy)``: the list of executed actions and
            the sum of the rewards received along the way.
        """
        actions, energy = [], 0

        # Start from the starting state of the environment.
        state = env.getStartingState()
        isTerminalState = False
        while not isTerminalState:
            # Prefer the action recommended by the policy for this state.
            act = policy.get(state)
            if act is None:
                # Unknown state: fall back to a random legal action.
                act = random.choice(env.getActions(state))

            # Execute the selected action in the current state.
            state, reward, isTerminalState = env.do(state, act)

            actions.append(act)
            energy += reward

            # Give up once the accumulated reward is hopelessly low.
            if energy < minEnergy:
                break
        return actions, energy
示例#2
0
    def solve(self, env, policy, minEnergy=-1000):
        """
        Execute ``policy`` in ``env`` until a terminal state or the energy floor.

        For each visited state the policy's action is taken; if the policy has
        no action for the state (``policy.get`` yields ``None``) a random action
        from ``env.getActions(state)`` is taken instead.

        @param env: Environment offering ``getStartingState()``,
            ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
            tuple ``(nextState, reward, isTerminalState)``.
        @param policy: Mapping state -> action (only ``.get`` is used).
        @param minEnergy: Abort threshold. The loop is left as soon as the
            summed reward falls below this value. Defaults to -1000.
        @return: Tuple ``(actions, energy)``: the executed actions in order and
            the sum of all rewards received.
        """
        actions, energy = [], 0

        # Begin in the starting state of the environment.
        state = env.getStartingState()
        isTerminalState = False
        while not isTerminalState:
            # The policy holds the preferred action for the given state.
            act = policy.get(state)
            if act is None:
                # No recommendation available: pick any legal action.
                act = random.choice(env.getActions(state))

            # Execute the selected action in the current state.
            state, reward, isTerminalState = env.do(state, act)

            actions.append(act)
            energy += reward

            # Stop early when the accumulated reward is below the floor.
            if energy < minEnergy:
                break
        return actions, energy
示例#3
0
def adp_optimistic_rewards(env,
                           transs=None,
                           utils=None,
                           freqs=None,
                           policy=None,
                           rewards=None,
                           **kwargs):
    """
    Run one trial of active ADP (adaptive dynamic programming) in ``env``.

    The agent follows ``policy``, records every observed transition and reward
    in the supplied tables and calls ``_policy_iteration`` after each step.
    All tables are updated in place, so passing the same dictionaries to
    several calls carries the experience over between trials. A table that is
    omitted (``None``) starts out as a fresh empty dictionary for this call.

    @param env: Environment offering ``getStartingState()``,
        ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
        tuple ``(newState, reward, isTerminal)``.
    @param transs: Transition table (N_s'_sa): ``transs[state][action][newState]``
        counts how often the outcome was observed.
    @param utils: Utilities table, handed to ``_policy_iteration``.
    @param freqs: Visit counter; incremented for every state that is entered.
    @param policy: Mapping state -> action, handed to ``_policy_iteration`` and
        read to select the next action.
    @param rewards: Mapping state -> reward last received on entering it.
    @param R_plus: (keyword) Optimistic estimate of the best possible reward
        obtainable in any state, forwarded to ``_policy_iteration``. Default 5.
    @param N_e: (keyword) Limit of how many times the optimistic reward is given
        before the true utility, forwarded to ``_policy_iteration``. Default 12.
    @param alpha: (keyword) Step size function; ``alpha(itr)`` is passed to
        ``_policy_iteration`` as ``th``. Default ``_alpha``.
    @param maxItr: (keyword) Maximum number of steps per trial. Default 10.
    @return: Tuple ``(itr, rewardSum, lastReward)``: number of executed steps,
        sum of the received rewards and whether the last reward was >= 0.
    """
    # Fresh tables per call; a ``{}`` default would be shared by all calls.
    transs = {} if transs is None else transs
    utils = {} if utils is None else utils
    freqs = {} if freqs is None else freqs
    policy = {} if policy is None else policy
    rewards = {} if rewards is None else rewards

    R_plus = kwargs.get('R_plus', 5)
    N_e = kwargs.get('N_e', 12)
    alpha = kwargs.get('alpha', _alpha)
    maxItr = kwargs.get('maxItr', 10)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0
    lastReward = False

    # Get possible actions with respect to current state.
    actions = env.getActions(state)
    _policy_iteration(transs,
                      utils,
                      policy,
                      rewards,
                      R_plus=R_plus,
                      N_e=N_e,
                      th=alpha(itr))
    # None means "no recommendation"; the random fallback is drawn lazily at
    # the top of the loop so that an empty action list is never sampled here.
    bestAction = policy.get(state)

    while not isTerminal:
        if bestAction is None:
            # The policy has nothing for this state yet: act randomly.
            bestAction = random.choice(actions)

        # Execute the chosen action and record the observed reward.
        newState, reward, isTerminal = env.do(state, bestAction)
        rewards[newState] = reward
        rewardSum += reward
        lastReward = reward >= 0

        # Count the visit; the entry starts at zero for a new state.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: state -> action -> outcome -> count.
        transs.setdefault(state, {}).setdefault(bestAction,
                                                {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        # Register the actions available in the new state before re-planning.
        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})
        _policy_iteration(transs,
                          utils,
                          policy,
                          rewards,
                          R_plus=R_plus,
                          N_e=N_e,
                          th=alpha(itr))

        # A missing entry stays None and is resolved at the top of the loop;
        # this avoids sampling from a terminal state's (possibly empty) actions.
        bestAction = policy.get(newState)
        state = newState

        itr += 1
        if itr >= maxItr:
            break
    return itr, rewardSum, lastReward
示例#4
0
def adp_random_exploration(env,
                           transs=None,
                           utils=None,
                           freqs=None,
                           policy=None,
                           rewards=None,
                           **kwargs):
    """
    Run one trial of active ADP (adaptive dynamic programming) learning with
    random exploration in the environment ``env``.

    The supplied tables hold the agent's experience. They may be empty if the
    agent has no experience with the environment, or already filled with
    values from previous trials; they are updated in place. A table that is
    omitted (``None``) starts out as a fresh empty dictionary for this call.

    In every step a random action is taken with probability
    ``max(minRnd, 1 / (tFac * (t + 1)))``; otherwise the action recommended
    by ``policy`` is executed.

    For reference look in page 834.

    @param env: Environment offering ``getStartingState()``,
        ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
        tuple ``(newState, reward, isTerminal)``.
    @param transs: Transition table (N_s'_sa): ``transs[state][action][newState]``
        counts how often the outcome was observed.
    @param utils: Utilities table, handed to ``_policy_iteration``.
    @param freqs: Visit counter; incremented for every state that is entered.
    @param policy: Mapping state -> action, handed to ``_policy_iteration`` and
        read to select the next action.
    @param rewards: Mapping state -> reward last received on entering it.
    @param tStep: (keyword) Step by which ``t`` is incremented. Default 0.01.
    @param tFac: (keyword) Factor applied to ``t + 1`` in the exploration
        probability. Default 1.
    @param minRnd: (keyword) Lower bound of the exploration probability.
        Default 0.0.
    @param remember: (keyword) If true, ``t`` starts at ``currItrs / 5``
        instead of 0. Default False.
    @param currItrs: (keyword) Used only together with ``remember``. Default 0.
    @param alpha: (keyword) Step size function; ``alpha(itr)`` is passed to
        ``_policy_iteration`` as ``th``. Default ``_alpha``.
    @param maxItr: (keyword) Maximum number of steps per trial. Default 50.
    @return: Tuple ``(itr, rewardSum, lastReward)``: number of executed steps,
        sum of the received rewards and whether the last reward was >= 0.
    """
    # Fresh tables per call; a ``{}`` default would be shared by all calls.
    transs = {} if transs is None else transs
    utils = {} if utils is None else utils
    freqs = {} if freqs is None else freqs
    policy = {} if policy is None else policy
    rewards = {} if rewards is None else rewards

    tStep = kwargs.get('tStep', 0.01)
    alpha = kwargs.get('alpha', _alpha)
    maxItr = kwargs.get('maxItr', 50)
    tFac = kwargs.get('tFac', 1.)
    t = kwargs.get('currItrs', 0) / 5 if kwargs.get('remember', False) else 0
    minRnd = kwargs.get('minRnd', 0.0)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0

    lastReward = False

    # Get possible actions with respect to current state.
    actions = env.getActions(state)
    _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))
    # None means "no recommendation"; the random fallback is drawn lazily at
    # the top of the loop so that an empty action list is never sampled here.
    bestAction = policy.get(state)

    while not isTerminal:
        # Exploration event with probability max(minRnd, 1 / (tFac * (t + 1))).
        explore = random.random() < max(minRnd, 1. / (tFac * (t + 1)))
        if explore or bestAction is None:
            # Exploration event, or the policy has nothing for this state.
            bestAction = random.choice(actions)

        # Execute either the policy's action or the random exploration.
        newState, reward, isTerminal = env.do(state, bestAction)

        lastReward = reward >= 0

        rewards[newState] = reward
        rewardSum += reward

        # Count the visit; the entry starts at zero for a new state.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: state -> action -> outcome -> count.
        transs.setdefault(state, {}).setdefault(bestAction,
                                                {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        # Register the actions available in the new state before re-planning.
        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})
        _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))

        # With policy iteration the optimal policy for the learned model is
        # already available, so the agent simply executes what it recommends.
        # A missing entry stays None and is resolved at the top of the loop;
        # this avoids sampling from a terminal state's (possibly empty) actions.
        bestAction = policy.get(newState)

        state = newState

        # A GLIE scheme must also eventually become greedy, so that the agent's
        # actions become optimal with respect to the learned (and hence the
        # true) model. That is why the parameter t needs to be incremented.
        t, itr = t + tStep, itr + 1
        if itr >= maxItr:
            break
    return itr, rewardSum, lastReward
示例#5
0
def adp_random_exploration(env, transs=None, utils=None, freqs=None, policy=None, rewards=None, **kwargs):
    """
    Perform a single trial of active ADP (adaptive dynamic programming)
    learning with random exploration in the environment ``env``.

    The tables passed in represent the agent's experience: they can be empty
    when the agent knows nothing about the environment, or pre-filled by
    earlier trials, and they are modified in place. Any table left out
    (``None``) is created as a new empty dictionary for this call.

    At each step the agent acts randomly with probability
    ``max(minRnd, 1 / (tFac * (t + 1)))`` and follows ``policy`` otherwise.

    For reference look in page 834.

    @param env: Environment offering ``getStartingState()``,
        ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
        tuple ``(newState, reward, isTerminal)``.
    @param transs: Transition table (N_s'_sa): ``transs[state][action][newState]``
        counts how often the outcome was observed.
    @param utils: Utilities table, handed to ``_policy_iteration``.
    @param freqs: Visit counter; incremented for every state that is entered.
    @param policy: Mapping state -> action, handed to ``_policy_iteration`` and
        read to select the next action.
    @param rewards: Mapping state -> reward last received on entering it.
    @param tStep: (keyword) Step by which ``t`` is incremented. Default 0.01.
    @param tFac: (keyword) Factor applied to ``t + 1`` in the exploration
        probability. Default 1.0.
    @param minRnd: (keyword) Lower bound of the exploration probability.
        Default 0.0.
    @param remember: (keyword) If true, ``t`` starts at ``currItrs / 5``
        instead of 0. Default False.
    @param currItrs: (keyword) Used only together with ``remember``. Default 0.
    @param alpha: (keyword) Step size function; ``alpha(itr)`` is passed to
        ``_policy_iteration`` as ``th``. Default ``_alpha``.
    @param maxItr: (keyword) Maximum number of steps per trial. Default 50.
    @return: Tuple ``(itr, rewardSum, lastReward)``: number of executed steps,
        sum of the received rewards and whether the last reward was >= 0.
    """
    # Create missing tables here: a ``{}`` default is shared between calls.
    transs = {} if transs is None else transs
    utils = {} if utils is None else utils
    freqs = {} if freqs is None else freqs
    policy = {} if policy is None else policy
    rewards = {} if rewards is None else rewards

    tStep = kwargs.get("tStep", 0.01)
    alpha = kwargs.get("alpha", _alpha)
    maxItr = kwargs.get("maxItr", 50)
    tFac = kwargs.get("tFac", 1.0)
    t = kwargs.get("currItrs", 0) / 5 if kwargs.get("remember", False) else 0
    minRnd = kwargs.get("minRnd", 0.0)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0

    lastReward = False

    # Get possible actions with respect to current state.
    actions = env.getActions(state)
    _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))
    # Leave the action as None when the policy has no entry; the random
    # fallback is drawn at the top of the loop, only when it is really needed.
    bestAction = policy.get(state)

    while not isTerminal:
        # Exploration event with probability max(minRnd, 1 / (tFac * (t + 1))).
        explore = random.random() < max(minRnd, 1.0 / (tFac * (t + 1)))
        if explore or bestAction is None:
            # Exploration event, or the policy has nothing for this state.
            bestAction = random.choice(actions)

        # Execute either the policy's action or the random exploration.
        newState, reward, isTerminal = env.do(state, bestAction)

        lastReward = reward >= 0

        rewards[newState] = reward
        rewardSum += reward

        # Count the visit; the entry starts at zero for a new state.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: state -> action -> outcome -> count.
        transs.setdefault(state, {}).setdefault(bestAction, {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        # Register the actions available in the new state before re-planning.
        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})
        _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))

        # With policy iteration the optimal policy for the learned model is
        # already available, so the agent simply executes what it recommends.
        # A missing entry stays None and is resolved at the top of the loop;
        # this avoids sampling from a terminal state's (possibly empty) actions.
        bestAction = policy.get(newState)

        state = newState

        # A GLIE scheme must also eventually become greedy, so that the agent's
        # actions become optimal with respect to the learned (and hence the
        # true) model. That is why the parameter t needs to be incremented.
        t, itr = t + tStep, itr + 1
        if itr >= maxItr:
            break
    return itr, rewardSum, lastReward
示例#6
0
def adp_optimistic_rewards(env, transs=None, utils=None, freqs=None, policy=None, rewards=None, **kwargs):
    """
    Perform a single trial of active ADP (adaptive dynamic programming).

    The agent executes the actions recommended by ``policy``, stores each
    observed transition and reward in the given tables and re-runs
    ``_policy_iteration`` after every step. The tables are modified in place,
    so reusing the same dictionaries keeps the experience between trials. Any
    table left out (``None``) is created as a new empty dictionary for this
    call.

    @param env: Environment offering ``getStartingState()``,
        ``getActions(state)`` and ``do(state, action)``; ``do`` returns the
        tuple ``(newState, reward, isTerminal)``.
    @param transs: Transition table (N_s'_sa): ``transs[state][action][newState]``
        counts how often the outcome was observed.
    @param utils: Utilities table, handed to ``_policy_iteration``.
    @param freqs: Visit counter; incremented for every state that is entered.
    @param policy: Mapping state -> action, handed to ``_policy_iteration`` and
        read to select the next action.
    @param rewards: Mapping state -> reward last received on entering it.
    @param R_plus: (keyword) Optimistic estimate of the best possible reward
        obtainable in any state, forwarded to ``_policy_iteration``. Default 5.
    @param N_e: (keyword) Limit of how many times the optimistic reward is given
        before the true utility, forwarded to ``_policy_iteration``. Default 12.
    @param alpha: (keyword) Step size function; ``alpha(itr)`` is passed to
        ``_policy_iteration`` as ``th``. Default ``_alpha``.
    @param maxItr: (keyword) Maximum number of steps per trial. Default 10.
    @return: Tuple ``(itr, rewardSum, lastReward)``: number of executed steps,
        sum of the received rewards and whether the last reward was >= 0.
    """
    # Create missing tables here: a ``{}`` default is shared between calls.
    transs = {} if transs is None else transs
    utils = {} if utils is None else utils
    freqs = {} if freqs is None else freqs
    policy = {} if policy is None else policy
    rewards = {} if rewards is None else rewards

    R_plus = kwargs.get("R_plus", 5)
    N_e = kwargs.get("N_e", 12)
    alpha = kwargs.get("alpha", _alpha)
    maxItr = kwargs.get("maxItr", 10)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0
    lastReward = False

    # Get possible actions with respect to current state.
    actions = env.getActions(state)
    _policy_iteration(transs, utils, policy, rewards, R_plus=R_plus, N_e=N_e, th=alpha(itr))
    # Leave the action as None when the policy has no entry; the random
    # fallback is drawn at the top of the loop, only when it is really needed.
    bestAction = policy.get(state)

    while not isTerminal:
        if bestAction is None:
            # The policy has nothing for this state yet: act randomly.
            bestAction = random.choice(actions)

        # Execute the chosen action and record the observed reward.
        newState, reward, isTerminal = env.do(state, bestAction)
        rewards[newState] = reward
        rewardSum += reward
        lastReward = reward >= 0

        # Count the visit; the entry starts at zero for a new state.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: state -> action -> outcome -> count.
        transs.setdefault(state, {}).setdefault(bestAction, {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        # Register the actions available in the new state before re-planning.
        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})
        _policy_iteration(transs, utils, policy, rewards, R_plus=R_plus, N_e=N_e, th=alpha(itr))

        # A missing entry stays None and is resolved at the top of the loop;
        # this avoids sampling from a terminal state's (possibly empty) actions.
        bestAction = policy.get(newState)
        state = newState

        itr += 1
        if itr >= maxItr:
            break
    return itr, rewardSum, lastReward