Example #1
def main():
    g = Grid(4, 4)

    terminals = [
        {"x": 3, "y": 0, "reward": 1},
        {"x": 1, "y": 3, "reward": 1},
        {"x": 2, "y": 3, "reward": -10},
        {"x": 3, "y": 3, "reward": 10},
    ]
    blocks = [{"x": 1, "y": 1}]

    g.init_world(terminals, blocks)

    np.random.seed(62)

    mdp.value_iteration(g, -0.02, 0.8, 0.8)

    mdp.policy_iteration(g, -0.02, 0.8, 0.8)

    mdp.q_function(g, "s6", -0.02, 0.8, 0.1, 0.1, 0.8, 1000000)
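
Example #1 only drives project-specific Grid and mdp helpers, so the solvers themselves are not shown. For orientation, here is a minimal sketch of the value-iteration backup those calls are named after, written against a generic interface (states, actions(s), transitions(s, a) yielding (probability, next_state) pairs, and a per-state reward dict are assumed names, not part of the example):

def value_iteration_sketch(states, actions, transitions, reward, gamma=0.8, epsilon=1e-6):
    """Apply the Bellman backup V(s) = R(s) + gamma * max_a sum_s' P(s'|s,a) V(s') until it settles."""
    V = {s: 0.0 for s in states}
    while True:
        delta = 0.0
        for s in states:
            backup = reward[s] + gamma * max(
                sum(p * V[s2] for p, s2 in transitions(s, a))
                for a in actions(s)
            )
            delta = max(delta, abs(backup - V[s]))
            V[s] = backup
        if delta < epsilon:
            return V
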
Example #2
    def _policy_iteration_slow(self):
        old_policy = dict(self.mdp.policy)
        for i in range(100):
            policy_iteration(old_policy, self.mdp, num_iter=1)
            self.gridworldwindow.update_grid(self.mdp.values, self.mdp.policy)
            self.gridworldwindow.window.update()
            time.sleep(0.25)
            self.gridworldwindow.window.update()

            new_policy = dict(self.mdp.policy)
            if policy_converged(new_policy, old_policy):
                break
            
            old_policy = new_policy
        self.gridworldwindow.show_dialog('Policy Iteration has converged in {} steps!'.format(i+1))
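
policy_converged is not shown in this example; in grid-world repos of this kind it typically just checks that no state changed its action between iterations. A minimal sketch of such a helper (an assumption, not the example's actual code):

def policy_converged(new_policy, old_policy):
    """True when every state keeps the same action from one iteration to the next."""
    return new_policy == old_policy
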
Example #3
    def _policy_iteration_1_step(self):
        policy, values = policy_iteration(self.mdp.policy,
                                          self.mdp,
                                          num_iter=1)
        self.gridworld.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policy(policy)
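
Examples #2 and #3 call policy_iteration(policy, mdp, num_iter=1) repeatedly and watch for convergence between steps. One such step is policy evaluation followed by greedy improvement; below is a minimal sketch using the same generic interface as the value-iteration sketch above (all parameter names are assumptions, not the examples' API):

def policy_iteration_step(policy, states, actions, transitions, reward, gamma=0.9, eval_sweeps=20):
    """One step: evaluate the current policy, then improve it greedily against those values."""
    # Policy evaluation: a few sweeps of the Bellman expectation backup.
    V = {s: 0.0 for s in states}
    for _ in range(eval_sweeps):
        for s in states:
            V[s] = reward[s] + gamma * sum(p * V[s2] for p, s2 in transitions(s, policy[s]))
    # Policy improvement: in every state pick the action with the best one-step lookahead.
    new_policy = {
        s: max(actions(s), key=lambda a: sum(p * V[s2] for p, s2 in transitions(s, a)))
        for s in states
    }
    return new_policy, V
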
Example #4
    def solve(self,
              episodes=200,
              iterations=200,
              reset=True,
              seed=False,
              gamma=0.95):
        mdp = EnvMDP(self.env, gamma=gamma)
        self.policy = policy_iteration(mdp)
        self.U = value_iteration(mdp, epsilon=1e-12)
Example #5
def get_reachable(m):
    """Collect the states reachable from m.init when following the policy found by policy iteration."""
    reachable_states = [m.init]
    new_rs = []
    policy = mdp.policy_iteration(m)
    while True:
        for state in reachable_states:
            if state in policy:
                for (result_state, prob) in m.T(state, policy[state]).items():
                    new_rs = new_rs + [result_state, state]
                #new_rs = new_rs + [policy[state]]
        new_rs = list(set(new_rs))
        if set(new_rs) == set(reachable_states):
            print("breaking: " + str(new_rs))
            #return new_rs
            break
        else:
            reachable_states = new_rs
    return True  # Let's make sure the whole loop works first
Example #6
    def _policy_iteration_100_steps(self):
        policy_iteration(self.mdp, num_iter=100)
        self.gridworld.update_grid(self.mdp.values, self.mdp.policy)
Example #7
    def actions(self, state):
        """Set of actions that can be performed in this state.  By default, a
        fixed list of actions, except for terminal states. Override this
        method if you need to specialize by state."""
        if "dead" in (state.player.classify_hp(), state.opponent.classify_hp()):
            return [None]
        else:
            return state.player.moveset

    def T(self, state, action):
        if action is None:
            return [(0.0, state)]
        else:
            p = 1.0 / len(state.opponent.moveset)
            your_action = action(self.moveset_values[action])(state)
            v = [
                (p, counter_attack(self.moveset_values[counter_attack])(your_action))
                for counter_attack in state.opponent.moveset
            ]
            return v


initial_state = State(
    Entity(5, 1, 1, [attack_opponent, weaken_defense, boost_attack]),
    Entity(5, 1, 1, [attack_opponent, weaken_attack, boost_defense]),
)
initial_moveset_values = {
    boost_attack: 1,
    boost_defense: 1,
    weaken_attack: 1,
    weaken_defense: 1,
    attack_opponent: 1,
}
m = BattleSimulation(initial_moveset_values, initial_state)
solution = mdp.policy_iteration(m)
pprint.PrettyPrinter(indent=4).pprint(solution)
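
Example #7's T(state, action) returns a list of (probability, next_state) pairs, the convention expected by AIMA-style solvers such as mdp.policy_iteration. Such solvers rank actions by their expected utility over that distribution, roughly as in this sketch (the helper names and the utility table U are assumptions):

def expected_utility(a, s, U, m):
    """Expected utility of doing action a in state s, given a utility table U."""
    return sum(p * U[s1] for (p, s1) in m.T(s, a))

def greedy_action(s, U, m):
    """The action whose expected utility is highest in state s."""
    return max(m.actions(s), key=lambda a: expected_utility(a, s, U, m))
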
Example #8
            for i_action in range(num_actions):
                pi_dict[i_state][i_action] = default_action

    [S, A, P, R, gamma, pi] = mdp.create_MDP(state_space, action_space, P_dict,
                                             R_dict, gamma, pi_dict)

    # Normalize each P[i_action, i_state, :] into a probability distribution;
    # the small constant guards against dividing by zero for unreachable rows.
    for i_state in range(num_states):
        for i_action in range(num_actions):
            P[i_action, i_state, :] = deepcopy(
                P[i_action, i_state, :] /
                (sum(P[i_action, i_state, :]) + 1e-10))

    best_action, vk = mdp.policy_iteration(S, A, P, R, gamma, pi)
    print(best_action)

    avg_best_pol = np.zeros(t.shape[0]) - 1

    for i_t in range(t.shape[0]):

        sum_t = 0
        t_val = t[i_t]
        counter_t = 0
        for i_state in range(num_states):

            if near(state_space[i_state][0], t_val):

                sum_t = sum_t + action_space[int(best_action[i_state])]
                counter_t = counter_t + 1
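
The shapes returned by mdp.create_MDP are not shown in Example #8, but a (S, A, P, R, gamma, pi) solver of this form usually evaluates a policy in matrix form. A minimal sketch, assuming P has shape (num_actions, num_states, num_states), R is a per-state reward vector, and pi maps state index to action index (all assumptions, not this library's actual internals):

import numpy as np

def evaluate_policy_matrix(P, R, gamma, pi):
    """Exact policy evaluation: solve (I - gamma * P_pi) V = R for V."""
    num_states = P.shape[1]
    # Transition matrix obtained by following pi in every state.
    P_pi = np.stack([P[pi[s], s, :] for s in range(num_states)])
    return np.linalg.solve(np.eye(num_states) - gamma * P_pi, R)
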
Example #9
    return q_table, rewards


if __name__ == '__main__':
    mdp = PresentationEnvironment()

    # Hyperparameters
    episodes = 400
    steps_per_episode = 10
    alpha = 0.95            # learning rate
    epsilon = 1.            # exploration-exploitation rate
    gamma = 0.3             # discount rate

    q_table, rewards_over_time = q_learning(mdp, episodes, steps_per_episode, alpha, epsilon, gamma)

    optimal_policy = policy_iteration(mdp)
    q_policy = [np.argmax(q_table[s, :]) for s in range(mdp.number_of_states)]

    optimal_path, optimal_rewards = simulate(mdp, optimal_policy)
    q_path, q_rewards = simulate(mdp, q_policy)

    # Plot Rewards over episodes
    fig = plt.figure()

    ax = fig.add_subplot(111)
    ax.plot(rewards_over_time, '-b', label='Q-learning')
    ax.plot([optimal_rewards] * len(rewards_over_time), '--r', label='Optimal solution')
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Cumulative Rewards')

    plt.figtext(0.96, 0.02, 'alpha: {}; gamma: {}'.format(alpha, gamma), ha="right", fontsize=8)
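
The body of q_learning is cut off above; only its final return q_table, rewards survives at the top of Example #9. A minimal tabular, epsilon-greedy sketch with the same call signature, assuming the environment exposes number_of_states, number_of_actions, reset(), and step(state, action) returning (next_state, reward, done) (only number_of_states actually appears in the example; the rest are assumptions):

import numpy as np

def q_learning(mdp, episodes, steps_per_episode, alpha, epsilon, gamma):
    """Tabular Q-learning with an epsilon-greedy behaviour policy."""
    q_table = np.zeros((mdp.number_of_states, mdp.number_of_actions))
    rewards = []
    for _ in range(episodes):
        state = mdp.reset()
        total = 0.0
        for _ in range(steps_per_episode):
            # Explore with probability epsilon, otherwise act greedily.
            if np.random.rand() < epsilon:
                action = np.random.randint(mdp.number_of_actions)
            else:
                action = int(np.argmax(q_table[state, :]))
            next_state, reward, done = mdp.step(state, action)
            # One-step temporal-difference update toward the bootstrapped target.
            target = reward + gamma * np.max(q_table[next_state, :])
            q_table[state, action] += alpha * (target - q_table[state, action])
            total += reward
            state = next_state
            if done:
                break
        rewards.append(total)
    return q_table, rewards
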
Example #10
    table = [header] + table
  table = [[(numfmt % x if isnumber(x) else x) for x in row]
           for row in table]
  maxlen = lambda seq: max(map(len, seq))
  sizes = list(map(maxlen, zip(*[map(str, row) for row in table])))
  for row in table:
    print(sep.join(getattr(str(x), j)(size)
                   for (j, size, x) in zip(justs, sizes, row)))

prize = 1
trap = -1
neg = -0.4

mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg,  neg, neg]],
                terminals=[(1, 2), (2, 2)],
                error=.8)

print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
Example #11
    def _policy_iteration_100_steps(self):
        policy, values = policy_iteration(self.mdp.policy, self.mdp, num_iter=100)
        self.gridworldwindow.update_grid(values, policy)
        self.mdp.update_values(values)
        self.mdp.update_policies(policy)