    def __init__(self):
        # Read the agent hyperparameters from the ROS parameter server
        self.state_size = rospy.get_param('/cartpole_v0/state_size')
        self.learning_rate = rospy.get_param('/cartpole_v0/learning_rate')
        action_size = rospy.get_param('/cartpole_v0/n_actions')
        gamma = rospy.get_param('/cartpole_v0/gamma')
        epsilon = rospy.get_param('/cartpole_v0/epsilon')
        epsilon_decay = rospy.get_param('/cartpole_v0/epsilon_decay')
        epsilon_min = rospy.get_param('/cartpole_v0/epsilon_min')
        batch_size = rospy.get_param('/cartpole_v0/batch_size')

        # Initialise the base Q-learning agent with the loaded hyperparameters
        QLearningAgent.__init__(self,
                                state_size=self.state_size,
                                action_size=action_size,
                                gamma=gamma,
                                epsilon=epsilon,
                                epsilon_decay=epsilon_decay,
                                epsilon_min=epsilon_min,
                                batch_size=batch_size)
Example #2
def _main_learning_curve(plan_ids, n_iter):
    for plan_id in plan_ids:
        env = get_env(plan_id, default_reward=-0.001)
        mdp = env.getMDP()[1]
        non_terminal_states = mdp.keys()
        # Infer the action space from an arbitrary state of the MDP
        state, transitions = next(iter(mdp.items()))
        action_space = list(mdp[state].keys())
        
        # Policy iteration provides the optimal-policy baseline ("Optimal" curve)
        evaluator = PolicyIterationAgent(mdp.keys(), env.action_space, mdp)
        evaluator.compute_best_policy()
        
        agents = [
            ("Optimal", "black", evaluator),
            ("Random", "gray", RandomAgent(env.action_space)),
            ("QLearning", "red", QLearningAgent(action_space)),
            ("Sarsa", "blue", SarsaAgent(action_space)),
            ("Dyna-Q", "green", DynaQAgent(action_space, non_terminal_states, lr_R=0.5, lr_P=0.5, k=5))
        ]
        
        
        # One learning curve per agent, all evaluated against the same optimal policy
        res = [(name, color, get_learning_curve(env, agent, name=name, evaluator=evaluator, n_iter=n_iter))
               for name, color, agent in agents]

        plt.suptitle("map " + str(plan_id), fontsize=16)
        plt.title("Policy loss")
        for name, color, (policy_values, cum_reward, action_count) in res:
            if name == "Optimal":
                plt.plot(policy_values, "--", label=name, color=color)
            else:
                plt.plot(policy_values, label=name, color=color)
        plt.legend()
        
        plt.figure()
        plt.suptitle("map " + str(plan_id), fontsize=16)
        plt.title("Cumulated reward")
        for name, color, (policy_values, cum_reward, action_count) in res:
            if name == "Optimal":
                plt.plot(cum_reward, "--", label=name, color=color)
            else:
                plt.plot(cum_reward, label=name, color=color)
        plt.legend()
        plt.show()
Example #3
def _main_demo(plan_id=0):
    env = get_env(plan_id)
    mdp = env.getMDP()[1]
    state, transitions = next(iter(mdp.items()))
    action_space = list(mdp[state].keys())
    
    agent = QLearningAgent(action_space)
    
    # env.render()  # displays the game grid
    env.render(mode="human")  # render to the console

    # Write a log file over several scenarios
    episode_count = 1000
    FPS = 1e-6  # ~pause between two renders
    
    all_rsums = []
    
    for i in range(episode_count):
        obs = env.state2str(env.reset())
        agent.reset(obs)
        env.verbose = (i % 10 == 0 and i > 0)  # display one episode out of ten
        if env.verbose:
            env.render(FPS)
        j = 0
        rsum = 0
        while True:
            action = agent.act()
            obs, reward, done, _ = env.step(action)
            obs = env.state2str(obs)
            agent.get_result(obs, reward, done)
            rsum += reward
            j += 1
            if env.verbose:
                env.render(FPS)
            if done:
                print("Episode : " + str(i) + " rsum=" + str(rsum) + ", " + str(j) + " actions")
                all_rsums.append(rsum)
                break
    
    print("done")
    print("Average rsum : {} +/- {}".format(np.mean(all_rsums), np.std(all_rsums)))
    env.close()
Example #4
def simulate(side, instance, slip, obfuscate, randomseed, maxLength, gamma,
             num_episodes):
    env = Environment(side, instance, slip, obfuscate, randomseed, maxLength)
    agent = QLearningAgent(env, gamma, lr=0.8)
    episode_rewards = np.zeros(num_episodes)
    for i in range(num_episodes):
        event = 'continue'
        episode_reward = 0
        while event == 'continue':
            action = agent.getAction()  # Take action
            state, reward, event = env.step(action)
            agent.observe(state, reward, event)
            episode_reward += reward
        episode_rewards[i] = episode_reward
    # print(episode_rewards[-100:])
    avg = np.mean(episode_rewards[-100:])  # average reward over the last 100 episodes
    pi = agent.getPi()
    print("Slip: " + str(slip) + " Avg: " + str(avg))
    env.printPolicy(pi)
    # print(episode_rewards[-1000:])
    # print("Mean episode reward: {}".format(np.mean(episode_rewards[-1000:])))
    return round(avg, 4)
Example #5
            break

    return total_reward


if __name__ == '__main__':
    env = gym.make("CartPole-v0").env
    env.reset()
    n_actions = env.action_space.n

    print(env.observation_space.high)
    print(env.observation_space.low)
    print('CartPole state: %s' % (env.reset()))

    # Q-learning agent; every action is legal in every state
    agent = QLearningAgent(alpha=0.3,
                           epsilon=0.5,
                           discount=1.0,
                           get_legal_actions=lambda s: range(n_actions))

    rewards = []
    for i in range(2000):
        rewards.append(play_and_train(env, agent))
        agent.epsilon *= 0.999  # decay exploration over time

        if i % 10 == 0:
            print('Iteration {}, Average reward {:.2f}, Epsilon {:.3f}'.format(
                i, np.mean(rewards), agent.epsilon))

    print('Reward of Test agent = %.3f' %
          play_and_train(env, agent, visualize=True))
Example #6
from game import Game
from agent import Agent
from maxAgent import MaxAgent
from randomAgent import RandomAgent
from qlearning import QLearningAgent

qAgent = QLearningAgent(10000)
qAgent.train()

print('trained on 10000 games')
agent_two = MaxAgent()
wins = 0

# 5,000 games with the trained Q-learning agent moving as player 1
for i in range(5000):
    game = Game()
    while not game.over():
        if game.turn_player() == 1:
            game.move(qAgent.move(game))
        else:
            game.move(agent_two.move(game))
    if game.score()[0] > game.score()[1]:
        wins = wins + 1

# 5,000 games with the trained Q-learning agent moving as player 2
for i in range(5000):
    game = Game()
    while not game.over():
        if game.turn_player() == 2:
            game.move(qAgent.move(game))
        else:
            game.move(agent_two.move(game))
    if game.score()[1] > game.score()[0]:
        wins = wins + 1
Example #7
    def render(self, mode='human', close=False):
        print(self.s)


def getLegalActions(s):
    # An action is legal in state s when its precondition holds
    legalActions = []
    for action in actions:
        if action.pre(s):
            legalActions.append(action)
    return legalActions


from qlearning import QLearningAgent
agent = QLearningAgent(alpha=0.5,
                       epsilon=0.5,
                       discount=0.99,
                       get_legal_actions=getLegalActions)


def play_and_train(env, agent, t_max=10**4):
    """
    This function should 
    - run a full game, actions given by agent's e-greedy policy
    - train agent using agent.update(...) whenever it is possible
    - return total reward
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        # get agent to pick action given state s.
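        # A minimal sketch of the rest of the loop, assuming the agent exposes
        # get_action(s) and update(s, a, r, next_s) as the docstring above
        # suggests, and a Gym-style env.step returning (next_s, r, done, info):
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)

        # train the agent on the observed transition
        agent.update(s, a, r, next_s)

        s = next_s
        total_reward += r
        if done:
            break

    return total_reward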
Example #8
MESSAGE_BASE = 85                 # Distance from bottom to message area
SNOWMAN_BASE = 140                # Distance from bottom to Snowman base
MAX_INCORRECT_GUESSES = 8         # Number of incorrect guesses allowed
INCORRECT_COLOR = "#FF9999"       # Color used for incorrect guesses
CORRECT_COLOR = "#009900"         # Color used to mark correct guesses

# Fonts

WORD_FONT = "bold 36px 'Monaco','Monospaced'"
LETTER_FONT = "bold 24px 'Monaco','Monospaced'"
MESSAGE_FONT = "30px 'Helvetica Neue','Arial','Sans-Serif'"
gw = GWindow(GWINDOW_WIDTH, GWINDOW_HEIGHT)


env = BlackjackEnvironment()
agent = QLearningAgent(env)

def createWindow():
    def createCardTotalLabels():
        alphabet = ['02','03','04','05','06','07','08','09','10',
                    '11','12','13','14','15','16','17','18','19',
                    '20','21']
        alphabetLabels = [GLabel(letter) for letter in alphabet]
        for label in alphabetLabels:
            label.setFont(LETTER_FONT)    
        return alphabetLabels


    def createUpCardLabels():
Example #9
import matplotlib.pyplot as plt
import seaborn as sns

from environment import Environment
from qlearning import QLearningAgent
from double_qlearning import DoubleQLearningAgent

sns.set()

env = Environment()
agent = QLearningAgent(env)
agent2 = DoubleQLearningAgent(env)

left_actions_ratio_a1 = agent.update_policy()
left_actions_ratio_a2 = agent2.update_policy()

fig, ax = plt.subplots()
ax.plot(range(len(left_actions_ratio_a1)),
        left_actions_ratio_a1,
        color="red",
        label="Q-Learning")
ax.plot(range(len(left_actions_ratio_a2)),
        left_actions_ratio_a2,
        color="green",
        label="Double Q-Learning")
# Reference line: the optimal rate of "left" actions from state A
ax.plot(range(len(left_actions_ratio_a1)), [5] * len(left_actions_ratio_a1),
        '--',
        color='black',
        label='optimal')
ax.set_xlabel("Number of episodes")
ax.set_ylabel("% of left actions from A")
ax.legend()
plt.show()
Example #10
                        default=False,
                        action='store_true')
    parser.add_argument('-v',
                        '--verbose',
                        dest="verbose",
                        action='store_true',
                        default=False)
    parser.add_argument('-s',
                        '--silent',
                        dest="silent",
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    agent = QLearningAgent(TaxiEnv(env),
                           epsilon=args.epsilon,
                           alpha=args.alpha,
                           gamma=args.gamma)

    # Episodes 1..train_episodes are used for training, the remaining ones for validation
    total_episodes = args.train_episodes + args.val_episodes + 1
    episode_start_val = args.train_episodes + 1
    train_timesteps_list = []
    val_timesteps_list = []

    cmd = None
    for i_episode in range(1, total_episodes):
        state = env.reset()
        done = 0
        t = 0
        reward = 0

        if i_episode >= episode_start_val: