Example #1
class TestQLearningAgent(unittest.TestCase):
    '''
    Tests the Q-learning agent's action methods and backup function.
    '''

    def setUp(self):
        self.agent = QLearningAgent(legal_actions=(0, 1),
                                    gamma=0.9, alpha=0.25, epsilon=0.9)

        self.agent.Q[0, 1] = 1.0
        self.agent.Q[0, 2] = 0.5
        self.agent.Q[1, 1] = -2.0
        self.agent.Q[1, 2] = -1.0

    def test_get_greedy_action(self):
        self.assertEqual(self.agent._get_greedy_action(0), 1)
        self.assertEqual(self.agent._get_greedy_action(1), 2)

    def test_get_random_action(self):
        self.assertIn(self.agent._get_random_action(),
            self.agent.legal_actions)

    def test_td_error(self):
        self.assertTrue(abs(self.agent._td_error(0, 0, 1, 1.9) - 1.0) < 1e-10)
        self.assertTrue(abs(self.agent._td_error(0, 0, 1, -0.5) + 1.4) < 1e-10)
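
The expected values above pin down the backup: they only come out to 1.0 and -1.4 if the arguments are (state, action, next_state, reward) and the error is r + gamma * max_a' Q(s', a') - Q(s, a), with unseen (state, action) pairs treated as 0. A minimal sketch of such a method (a hypothetical reconstruction for illustration, not the repository's implementation):

class QLearningAgent:
    """Tabular Q-learning agent; Q maps (state, action) tuples to values."""

    def __init__(self, legal_actions, gamma, alpha, epsilon):
        self.legal_actions = legal_actions
        self.gamma, self.alpha, self.epsilon = gamma, alpha, epsilon
        self.Q = {}  # unseen (state, action) pairs are treated as 0.0

    def _td_error(self, state, action, next_state, reward):
        # one-step Q-learning error: r + gamma * max_a' Q(s', a') - Q(s, a)
        next_values = [q for (s, _), q in self.Q.items() if s == next_state]
        return (reward
                + self.gamma * max(next_values, default=0.0)
                - self.Q.get((state, action), 0.0))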
Example #2
def run_sarsa_vs_qlearning():
    winners = []
    board_length = 8
    action_space = (board_length, board_length, board_length, board_length)

    agent_one = QLearningAgent((board_length, board_length), action_space,
                               "qlearning", "up", 0.0, 250000000, 10000000)
    agent_two = SARSAAgent((board_length, board_length), action_space, "sarsa",
                           "down", 0.0, 25000000, 10000000)
    iterations = 10000
    for i in range(iterations):
        board = Board(board_length=8)
        game = Game(agent_one=agent_one, agent_two=agent_two, board=board)
        game.play(verbose=False)
        winners += [game.winner]
        agent_one.epsilon *= 0.9999
        agent_two.epsilon *= 0.9999
        if (i % 5000 == 0 and i > 0) or iterations - 1 == i:
            victories_player_two = 0
            victories_player_one = 0
            for winner in winners:
                if winner == "qlearning":
                    victories_player_one += 1
                if winner == "sarsa":
                    victories_player_two += 1

            logging.info("Player One: {}".format(str(victories_player_one)))
            logging.info("Player Two: {}".format(str(victories_player_two)))
            logging.info("Mean Rewards Agent One: {}".format(
                agent_one.moving_average_rewards[-1]))
            logging.info("Mean Rewards Agent Two: {}".format(
                agent_two.moving_average_rewards[-1]))
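
For reference (not part of the source): the multiplicative decay above leaves each agent with roughly 37% of its starting epsilon by the end of the 10,000 games.

# quick arithmetic check of the epsilon decay schedule used above
print(0.9999 ** 10000)  # ~0.3679, i.e. roughly e**-1 of the initial epsilon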
Example #4
    def __init__(self, num_agents, **kwargs):
        super().__init__(**kwargs)
        self.agents = [QLearningAgent(**kwargs) for _ in range(num_agents)]
        self.name = 'MultiAgent'

        # share a single Q-table across all agents: every agent holds a
        # reference to the same object, so any update is visible to all
        qTable = self.agents[0].qValues
        for agent in self.agents[1:]:
            agent.qValues = qTable
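
The constructor above relies purely on object identity to share the table; a tiny standalone illustration of the same pattern using plain dicts (hypothetical names, not the repo's classes):

class Learner:
    def __init__(self):
        self.qValues = {}

learners = [Learner() for _ in range(3)]
shared = learners[0].qValues
for other in learners[1:]:
    other.qValues = shared

learners[2].qValues[("s0", "a1")] = 1.0
print(learners[0].qValues)  # {('s0', 'a1'): 1.0} -- one update is visible to every agent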
Example #5
def run(algo="sarsa",
        num_episodes=500,
        min_seed=0,
        seed_range=20,
        kings_move_allowed=False,
        stochastic_wind=False,
        num_steps_lim=100000,
        info=""):
    time_steps = []
    episode_lengths = []
    episode_rewards = []

    print(
        "======================================================================================="
    )
    print("Running {algo} agent on windy gridworld {info} [seeds = {s}-{e}]".
          format(algo=algo, info=info, s=min_seed,
                 e=min_seed + seed_range - 1))
    print(
        "======================================================================================="
    )
    for seed in range(min_seed, min_seed + seed_range):
        env = WindyGridworldEnv(kings_move_allowed,
                                seed=seed,
                                stochastic_wind=stochastic_wind)
        if algo == "sarsa":
            agent = SarsaAgent(num_states=env.nS,
                               actions=range(env.nA),
                               seed=seed)
        elif algo == "q-learning":
            agent = QLearningAgent(num_states=env.nS,
                                   actions=range(env.nA),
                                   seed=seed)
        elif algo == 'expected-sarsa':
            agent = ExpectedSarsaAgent(num_states=env.nS,
                                       actions=range(env.nA),
                                       seed=seed)
        else:
            raise ValueError("Unknown algo: {}".format(algo))
        experiment = Experiment(env, agent)
        expt_time_steps, expt_episode_lengths, expt_episode_rewards = experiment.run(
            num_episodes, num_steps_lim, algo=algo)
        time_steps.append(expt_time_steps)
        episode_lengths.append(expt_episode_lengths)
        episode_rewards.append(expt_episode_rewards)

    time_steps = np.mean(np.array(time_steps), axis=0)
    episode_lengths = np.mean(np.array(episode_lengths), axis=0)
    episode_rewards = np.mean(np.array(episode_rewards), axis=0)

    print("Average Last Episode Length (across seeds): {}".format(
        episode_lengths[-1]))
    print("Average Last Episode Reward (across seeds): {}".format(
        episode_rewards[-1]))
    print("Average Episode Length (over last 20 episodes, across seeds): {}".
          format(np.mean(episode_lengths[-20])))
    print("Average Episode Reward (over last 20 episodes, across seeds): {}".
          format(np.mean(episode_rewards[-20])))

    return time_steps, episode_lengths, episode_rewards
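
A minimal driver for the function above (hypothetical, not from the source; it only assumes `run` is importable as defined here):

if __name__ == "__main__":
    results = {}
    for algo in ("sarsa", "q-learning", "expected-sarsa"):
        # each entry holds (time_steps, episode_lengths, episode_rewards)
        results[algo] = run(algo=algo,
                            num_episodes=500,
                            min_seed=0,
                            seed_range=20,
                            info="(standard moves, deterministic wind)")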
Example #6
    def run(self):
        print('\tValue Iteration')
        vi_agent = ValueIterationAgent(self.env, self.gamma)
        print('\t\tAverage reward: ' + str(np.mean(vi_agent.scores)))
        print('\t\tConvergence step: ' + str(vi_agent.convergence))
        print('\t\tPolicy: ' + str(vi_agent.policy))

        print('\tPolicy Iteration')
        self.env.reset()
        pi_agent = PolicyIterationAgent(self.env, self.gamma)
        print('\t\tAverage reward: ' + str(np.mean(pi_agent.scores)))
        print('\t\tConvergence step: ' + str(pi_agent.convergence))
        print('\t\tPolicy: ' + str(pi_agent.policy))

        print('\tQ Learning')
        self.env.reset()
        ql_agent = QLearningAgent(self.env)
        q, stats = ql_agent.q_learning(self.env, 500)
        plotting.plot_episode_stats(stats, experiment_name=self.name)
Example #7
def fig_6_4():
    mdp = CliffGridworld()

    sarsa_sum_rewards = []
    qlearning_sum_rewards = []
    rewards_history = deque(maxlen=10)

    n_episodes = 500

    # run sarsa agent
    agent = SarsaAgent(mdp=mdp)
    for i in range(n_episodes):
        states, actions, rewards = agent.run_episode()
        rewards_history.append(rewards)
        sarsa_sum_rewards.append(np.mean(rewards_history))
    print_optimal_policy(mdp, agent)
    print_values(mdp, agent)

    rewards_history.clear()

    # run q learning
    agent = QLearningAgent(mdp=mdp)
    for i in range(n_episodes):
        states, actions, rewards = agent.run_episode()
        rewards_history.append(rewards)
        qlearning_sum_rewards.append(np.mean(rewards_history))

    print_optimal_policy(mdp, agent)
    print_values(mdp, agent)

    # plot results
    plt.plot(np.arange(n_episodes), sarsa_sum_rewards, label='Sarsa')
    plt.plot(np.arange(n_episodes), qlearning_sum_rewards, label='Q-learning')
    plt.ylim(-100, 0)
    plt.xlim(0, 500)
    plt.xlabel('Episodes')
    plt.ylabel('Sum of rewards during episode')
    plt.legend()

    plt.savefig('figures/ch06_fig_6_4.png')
    plt.close()
Example #8
def fig_6_7():
    mdp = MDP()
    epsilon = 0.1
    alpha = 0.1
    agents = [QLearningAgent(mdp=mdp, epsilon=epsilon, alpha=alpha),
              QQAgent(mdp=mdp, epsilon=epsilon, alpha=alpha)]

    n_runs = 10000
    n_episodes = 300

    actions_taken = np.zeros((len(agents), n_runs, n_episodes))
    rewards_received = np.zeros_like(actions_taken)

    for r in tqdm(range(n_runs)):
        # reset the agents
        for a in agents:
            a.reset()

        for e in range(n_episodes):
            for i, a in enumerate(agents):
                _, actions, rewards = a.run_episode()
                actions_taken[i, r, e] = actions[0]  # only plotting % left action from A (which is the first action in the sequence)
                rewards_received[i, r, e] = rewards

    # actions are coded -1 (left) and +1 (right), so this maps the mean
    # onto the fraction of episodes whose first action from A was "left"
    actions_fraction = np.mean(actions_taken - 1, axis=1) / -2

    plt.plot(np.arange(n_episodes), actions_fraction[0], label='Q-learning')
    plt.plot(np.arange(n_episodes), actions_fraction[1], label='Double Q-learning')
    optimal = epsilon/len(mdp.get_possible_actions(mdp.start_state))
    plt.gca().axhline(optimal, linestyle='dashed', lw=0.5, c='black',
                      label=r'{:.0%} optimal at $\epsilon$'.format(optimal))
    plt.xlim(0,n_episodes)
    plt.ylim(0,1)
    plt.xlabel('Episodes')
    plt.ylabel('% left actions from A')
    plt.legend()


    plt.savefig('figures/ch06_fig_6_7.png')
    plt.close()
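
A quick standalone check of the action-fraction mapping used above (assuming the actions are coded -1 for "left" and +1 for "right"):

import numpy as np

# one run with four episodes whose first actions were: left, right, left, left
a = np.array([[-1, +1, -1, -1]])
print(np.mean(a - 1, axis=1) / -2)  # [0.75] -> 3 of the 4 first actions were "left"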
Example #9
def fig_6_6():
    # initialize
    mdp = CliffGridworld()
    agents = [
        SarsaAgent(mdp=mdp),
        ExpectedSarsaAgent(mdp=mdp),
        QLearningAgent(mdp=mdp)
    ]
    alphas = np.linspace(0.1, 1, 10)
    plt.figure(figsize=(10, 8))

    def run_experiment(mdp, agents, n_runs, n_episodes, alphas=alphas):
        avg_sum_rewards = np.zeros((len(agents), len(alphas)))
        for i, alpha in enumerate(alphas):
            # set new alpha for each agent
            for a in agents:
                a.alpha = alpha

            for r in tqdm(range(n_runs)):
                # reset q_values on each run
                for a in agents:
                    a.reset()
                # reset averages over episodes at each run
                avg_over_episodes = np.zeros(len(agents))

                for e in range(n_episodes):
                    for j, a in enumerate(agents):
                        _, _, rewards = a.run_episode()
                        # update avg over episodes online
                        avg_over_episodes[j] += 1 / (e + 1) * (
                            rewards - avg_over_episodes[j])

                # update avg over runs online
                avg_sum_rewards[:, i] += 1 / (r + 1) * (avg_over_episodes -
                                                        avg_sum_rewards[:, i])
        return avg_sum_rewards

    # run interim
    print('Running interim')
    avg_sum_rewards = run_experiment(mdp, agents, n_runs=500, n_episodes=100)

    # plot results
    plt.plot(alphas,
             avg_sum_rewards[0],
             label='Sarsa (interim)',
             linestyle='dotted',
             c='blue',
             lw=0.5,
             marker='v',
             markerfacecolor='none')
    plt.plot(alphas,
             avg_sum_rewards[1],
             label='Expected Sarsa (interim)',
             linestyle='dotted',
             c='red',
             lw=0.5,
             marker='x')
    plt.plot(alphas,
             avg_sum_rewards[2],
             label='Q-learning (interim)',
             linestyle='dotted',
             c='black',
             lw=0.5,
             marker='s',
             markerfacecolor='none')

    # run asymptotic
    print('Running asymptotic')
    avg_sum_rewards = run_experiment(mdp, agents, n_runs=10, n_episodes=1000)

    # plot results
    plt.plot(alphas,
             avg_sum_rewards[0],
             label='Sarsa (asymptotic)',
             c='blue',
             lw=0.5,
             marker='v',
             markerfacecolor='none')
    plt.plot(alphas,
             avg_sum_rewards[1],
             label='Expected Sarsa (asymptotic)',
             c='red',
             lw=0.5,
             marker='x')
    plt.plot(alphas,
             avg_sum_rewards[2],
             label='Q-learning (asymptotic)',
             c='black',
             lw=0.5,
             marker='s',
             markerfacecolor='none')

    # format plot
    plt.xlabel(r'$\alpha$')
    plt.xlim(0.1, 1)
    plt.xticks(np.linspace(0.1, 1, 10))
    plt.ylim(-150, 0)
    plt.ylabel('Sum of rewards per episode')
    plt.legend()

    plt.savefig('figures/ch06_fig_6_6.png')
    plt.close()
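
The `+= 1 / (n + 1) * (x - avg)` updates inside `run_experiment` are the standard incremental-mean identity; a quick standalone check with illustrative values (not from the source):

import numpy as np

xs = np.array([3.0, -1.0, 4.0, 1.5])
avg = 0.0
for e, x in enumerate(xs):
    avg += 1 / (e + 1) * (x - avg)   # same online update as in run_experiment
print(avg, xs.mean())                # both print 1.875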
Example #10
        'explored_states_avg': float(explored_states_cum) / float(iterations_count),
        'reward_cum_avg': float(reward_count) / float(iterations_count),
        'actions_avg': float(actions_count) / float(iterations_count),
    }

    stats.append(row)

for q_init_value in [0.0]:
    for alpha_rate in [0.5]:
        for epsilon_rate in [0.0]:
            for gamma_rate in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]:
                print "Simulating for q_init_value: {}, alpha_rate: {}, epsilon_rate: {}, gamma_rate: {}". \
                    format(q_init_value, alpha_rate, epsilon_rate, gamma_rate)
                e = Environment()
                a = QLearningAgent(
                    e,
                    alpha_rate=alpha_rate, epsilon_rate=epsilon_rate, gamma_rate=gamma_rate, q_init_value=q_init_value
                )
                e.set_primary_agent(a, enforce_deadline=True)
                s = Simulator(e, update_delay=0.0000001, display=False)
                s.run(n_trials=n_trials)

                stats_by_iteration = a.stats_by_simulation_get()
                aggregated_stats_build_row(q_init_value, alpha_rate, epsilon_rate, gamma_rate, stats_by_iteration)

df = pd.DataFrame(
    data=stats, columns=['q_init_value', 'alpha_rate', 'epsilon_rate', 'gamma_rate', 'success_perc',
                         'traffic_violations_avg', 'explored_states_avg', 'reward_cum_avg', 'actions_avg']
)

df.to_csv('qlearn_agent_tuned_grid_for_gamma.csv')
Example #11
import argparse

from dungeon_game import Dungeon
from agents import RandomAgent, AccountantAgent, QLearningAgent, DeepQLearningAgent

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--agent', type=str, default='RANDOM')
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--discount', type=float, default=0.95)
    parser.add_argument('--iterations', type=int, default=5000)
    parser.add_argument('--plot', action='store_true')
    FLAGS, unparsed = parser.parse_known_args()

    randomAgent = RandomAgent()
    accountantAgent = AccountantAgent()
    qLearningAgent = QLearningAgent(iterations=FLAGS.iterations)
    deepQLearningAgent = DeepQLearningAgent(iterations=FLAGS.iterations)

    agent_list = [randomAgent, accountantAgent, qLearningAgent, deepQLearningAgent]
    rewards = [list() for _ in range(len(agent_list))]
    dungeon_list = [Dungeon() for _ in range(len(agent_list))]

    for agent_number, agent in enumerate(agent_list):
        for step in range(FLAGS.iterations):
            old_state = dungeon_list[agent_number].state
            action = agent.get_next_action(old_state)
            new_state, reward = dungeon_list[agent_number].perform_action(action)
            agent.update(old_state, new_state, action, reward)

            rewards[agent_number].append(reward)
Example #12
                pickle.dump(p, open(file_name, "wb"))
    except KeyboardInterrupt:
        print('Interrupted')
        for pick in pickle_index:
            player = players[pick]
            player.last_state = None
            file_name = directory + "\\" + "QLAGENT_GAMES_" + str(
                num_games) + get_time_stamp() + ".p"
            pickle.dump(player, open(file_name, "wb"))
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)


def get_time_stamp():
    time_stamp = dt.datetime.now()
    day = str(time_stamp.day)
    hour = str(time_stamp.hour)
    minute = str(time_stamp.minute)
    second = str(time_stamp.second)
    full_ft = '-'.join([day, hour, minute, second])
    return full_ft


if __name__ == "__main__":
    ql_agent = QLearningAgent("Learning Agent Demo")
    random_agent = RandomAgent("Random Agent")
    players = [ql_agent, random_agent]
    run_x_games_and_pickle(players, 10000, pickle_index=[ql_agent.index])
Example #13
    world[round(WORLD_DIM / 2) - 1, WORLD_DIM - 1] = '♚'  # in case it has disappeared
    for i in range(WORLD_DIM):
        for j in range(WORLD_DIM):
            if world[i, j] in ['☠', '█', '♚']:
                continue
            action = np.argmax(qtable[i, j])
            if action == 0:
                world[i, j] = '⬆'
            elif action == 1:
                world[i, j] = '⬇'
            elif action == 2:
                world[i, j] = '⬅'
            elif action == 3:
                world[i, j] = '➡'

    renderer = Renderer(world, cell_size=100)
    return renderer.render(episode)


if __name__ == '__main__':
    from agents import QLearningAgent, WORLD_DIM
    from CreateBilboWorld import *
    import os
    import numpy as np

    bilbo = QLearningAgent(PLAYER_CHAR)
    mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
    qtable = np.load("./models/qtable_" + str(WORLD_DIM) + ".npy")
    img = render_world(mondo.world, WORLD_DIM, qtable)
    img.show()
Example #14
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=True)

qlearn_agent_tuned = QLearningAgent(environment,
                                    alpha_rate=0.5,
                                    epsilon_rate=0.0,
                                    gamma_rate=0.5,
                                    q_init_value=0.0,
                                    debug_traces=True)

environment.set_primary_agent(qlearn_agent_tuned, enforce_deadline=True)

simulator = Simulator(environment,
                      update_delay=0.00001,
                      display=False,
                      debug_traces=True)
simulator.run(n_trials=100)

df = qlearn_agent_tuned.stats_by_simulation_get_as_df()
df.to_csv('stats_tuned_qlearn_agent.csv')
Example #15
import sys

if "../" not in sys.path:
    sys.path.append("../")

from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting
from agents import QLearningAgent
import numpy as np

env_shape = (4, 12)
start_position = (3, 0)
end_positions = [(3, 11)]
cliff = tuple((3, i + 1) for i in range(10))

env = CliffWalkingEnv(env_shape, start_position, end_positions, cliff)
n_actions = env.action_space.n
agent = QLearningAgent(alpha=0.5,
                       epsilon=0.1,
                       discount=0.99,
                       n_actions=n_actions)

agent.train(env,
            n_episodes=1000,
            t_max=10**3,
            verbose=True,
            verbose_per_episode=500)

plotting.draw_policy(env, agent)
plotting.plot_episode_stats(agent)
Example #16
from agents import RandomAgent, QLearningAgent
from spades import run_x_games_and_pickle

if __name__ == "__main__":
    players_4 = [
        QLearningAgent("Learning Agent"),
        RandomAgent("Random1"),
        RandomAgent("Random2"),
        RandomAgent("Random3")
    ]
    run_x_games_and_pickle(players_4, 30000, pickle_index=["Learning Agent"])
Example #17
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=True)

qlearn_agent_tuned = QLearningAgent(
    environment,
    alpha_rate=0.5,
    epsilon_rate=0.0,
    gamma_rate=0.5,
    q_init_value=0.0,
    debug_traces=True
)

environment.set_primary_agent(qlearn_agent_tuned, enforce_deadline=True)

simulator = Simulator(environment, update_delay=0.5, display=True, debug_traces=True)
simulator.run(n_trials=5)

simulator = Simulator(environment, update_delay=0.0000001, display=False, debug_traces=True)
simulator.run(n_trials=90)

simulator = Simulator(environment, update_delay=1.00, display=True, debug_traces=True)
simulator.run(n_trials=5)
#
# df = qlearn_agent_tuned.stats_by_simulation_get_as_df()
# df.to_csv('stats_tuned_qlearn_agent.csv')
Example #18
import numpy as np
import gym

import plotting
from agents import ValueIterationAgent, PolicyIterationAgent, QLearningAgent

gamma = .1

# Value Iteration
#######################################################
env = gym.make('FrozenLake-v0')
agent = ValueIterationAgent(env, gamma)
print('Average reward: ' + str(np.mean(agent.scores)))

# Policy Iteration
#######################################################
env = gym.make('FrozenLake-v0')
agent = PolicyIterationAgent(env, gamma)
print('Average reward: ' + str(np.mean(agent.scores)))

# Q Learning
#######################################################
env = gym.make('FrozenLake-v0')
agent = QLearningAgent(env)
Q, stats = agent.q_learning(env, 500)

plotting.plot_episode_stats(stats)
Example #19
possible_moves = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
inverse_possible_moves = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}

file_name = "./models/qtable_" + str(WORLD_DIM)


# load the saved Q-table if one exists, otherwise start from zeros,
# one value per (y, x, move)
if os.path.isfile(file_name + ".npy"):
    q_table = np.load(file_name + ".npy")
else:
    q_table = np.zeros((WORLD_DIM, WORLD_DIM, len(possible_moves)))

fig = plt.figure(figsize=(20, 20))

bilbo = QLearningAgent(PLAYER_CHAR)
mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
game_ended = False
epoch = 0
anim = []
rewards = 0

env = mondo.create_env(d)
anim.append((plt.pcolormesh(env, cmap='CMRmap'), ))

while not game_ended and epoch < MAX_EPOCH:
    epoch += 1
    action = bilbo.get_action(0, q_table, possible_moves)
    bilbo.move(inverse_possible_moves[action])()
    game_ended = bilbo.game_ended()
    reward = bilbo.reward()
Example #20
    def fullTest(self):
        self.game.addAgent(ExpectimaxAgent(BlackjackPlayer.id))
        self.result_table[1] = 0
        self.game.addAgent(QLearningAgent(BlackjackPlayer.id))
        self.result_table[2] = 0
        self.doTests()
Example #21
from simulator import Simulator
from environment import Environment
from agents import QLearningAgent

environment = Environment(debug_traces=False)

qlearn_agent = QLearningAgent(environment)

environment.set_primary_agent(qlearn_agent, enforce_deadline=True)

simulator = Simulator(environment, update_delay=0.00001, display=False)
simulator.run(n_trials=100)

df = qlearn_agent.stats_by_simulation_get_as_df()
df.to_csv('main_q-agent_first_stats.csv')
Example #22
    def __init__(self, agent=QLearningAgent(id=1)):
        super().__init__()
        self.learning_agent = agent
        self.addPlayers()
Example #23
    def setUp(self) -> None:
        self.test_players = [QLearningAgent(1), RandomAgent(2)]
        self.game = spades.Spades(self.test_players)
Example #24
           else np.load(file_name + ".npy")

fig = plt.figure(figsize=(20, 20))

epochs = []
rewards = []
epsilons = []
tot_wins = []
tot_loss = []
policy = []
win = 0
loss = 0

for ep in range(TOT_EPISODES):
    # recreate the environment
    bilbo = QLearningAgent(PLAYER_CHAR)
    mondo = World(WORLD_DIM, bilbo=bilbo, obstacle=True)
    np.random.seed()
    game_ended = False
    epoch = 0
    tot_reward = 0
    #if ep % 10 == 0:
        #a = plt.imshow(render_world(mondo.world,WORLD_DIM,q_table,ep), animated=True)
        #policy.append((a,))
    while not game_ended and epoch < MAX_EPOCH:
        # the nearer it gets to the dragon, the more random the movement
        epoch += 1
        epsilon_fear = bilbo.fear(epsilon)
        action = bilbo.get_action(epsilon, q_table, possible_moves)
        current_state = bilbo.get_current_state()
        # treasure_gone = bilbo.treasure_gone()
Example #25
    def testQLearn(self):
        self.game.addAgent(QLearningAgent(BlackjackPlayer.id))
        self.result_table[1] = 0
        self.doTests()