Example #1
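All of the snippets in this listing come from the same project and assume NumPy plus the project's RandomWalk environment and agent classes are already importable. A minimal import sketch, with module names assumed from the file names shown in the listing (the real layout may differ):

import numpy as np

# Assumed module layout -- adjust to the project's actual file names.
from RandomWalk import RandomWalk
from SARSA import Sarsa
from ExpSARSA import ExpSARSA
from TreeBackup import TreeBackup
from Qsigma import QSigma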
def run_agent_Q_RMS_param(num_runs, num_episodes, discount, step_size, step=1, agent_type="Sarsa"):
    """ Run the n-step Sarsa agent and return the avg Q-value RMS over episodes and runs """
    mdp = RandomWalk(19, -1)
    s = mdp.init()

    # ground-truth Q values under the equiprobable policy (non-terminal states only)
    gt_Q = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    # Arrays for RMS error over all states
    rms_err = np.asarray([0.0] * num_episodes)
    sum_rms_err = 0.0

    # create n-step agent
    print("Starting agent {}-step {}".format(step, agent_type))
    if agent_type.lower() == "sarsa":
        agent = Sarsa(mdp, s, step)
    elif agent_type.lower() == "expsarsa":
        agent = ExpSARSA(mdp, s, step)
    elif agent_type.lower() == "treebackup":
        agent = TreeBackup(mdp, s, step)
    elif agent_type.lower() == "qsigma":
        agent = QSigma(mdp, 0.5, s, step)
    else:
        raise ValueError("Unknown agent type: {}".format(agent_type))

    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err[i] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1]) - gt_Q)))
        sum_rms_err += np.sum(rms_err)
        # Reset Q after a run
        agent.reset_Q()

    # averaged over num_runs and num_episodes
    return sum_rms_err / (num_runs * num_episodes)
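A minimal usage sketch for the function above; the parameter values are chosen for illustration only and are not taken from the project:

# Compare the average Q-value RMS error of the four n-step methods.
for kind in ("sarsa", "expsarsa", "treebackup", "qsigma"):
    avg_err = run_agent_Q_RMS_param(10, 100, discount=1.0, step_size=0.4,
                                    step=4, agent_type=kind)
    print(kind, avg_err)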
Example #2
def run_agent_RMS_value(num_runs, num_episodes, discount, step_size, step=1):
    """ Run SARSA agent for num_episodes to get the state values """
    mdp = RandomWalk(19, -1)
    s = mdp.init()

    # ground truth for value
    gt_v = np.asarray(mdp.value_equiprobable(discount)[1:-1])
    # initial value
    init_v = np.asarray([0.5] * mdp.num_states())[1:-1]

    # Arrays for RMS error over all states
    rms_err = np.asarray([0.0] * (num_episodes + 1))
    sum_rms_err = np.asarray([0.0] * (num_episodes + 1))
    rms_err[0] = np.sqrt(np.mean(np.square(init_v - gt_v)))

    # create n-step SARSA agent
    agent = Sarsa(mdp, s, step)

    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q_to_value()[1:-1]) - gt_v)))
        sum_rms_err += rms_err
        # Reset Q after a run
        agent.reset_Q()

    # averaged over num_runs
    return sum_rms_err / num_runs
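The returned array is a learning curve (initial error at index 0, then one entry per episode). A plotting sketch, assuming matplotlib is available; the plotting code is not part of the project:

import matplotlib.pyplot as plt

curve = run_agent_RMS_value(num_runs=100, num_episodes=50, discount=1.0,
                            step_size=0.4, step=4)
plt.plot(range(len(curve)), curve)
plt.xlabel("Episode")
plt.ylabel("RMS error of state values")
plt.show()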
Example #3
File: SARSA.py  Project: DZ9/qsigma
def example_randomwalk():
    # create an MDP
    env = RandomWalk(19)

    # create 1-step SARSA agent
    agent = Sarsa(env, env.init(), 1)
    agent2 = Sarsa(env, env.init(), 1)

    # agent follows the equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 100
    for iter in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()

    agent2.set_policy_eps_greedy(0.5)
    for iter in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()

    print('Equiprobable Q_SARSA[s][a]', agent.Q)
    print('Eps greedy   Q_SARSA[s][a]', agent2.Q)
Example #4
def run_agent_value(num_episodes, discount, step_size, step=1):
    """ Run SARSA agent for num_episodes to get the state values"""
    mdp = RandomWalk(19)
    s = mdp.init()

    # create n-step SARSA agent
    agent = Sarsa(mdp, s, step)
    for i in range(num_episodes):
        agent.episode(discount, step_size)
        agent.init()

    return agent.Q_to_value()
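For instance, the estimated state values can be compared against the dynamic-programming solution; an illustrative call, not taken from the project:

v_est = run_agent_value(num_episodes=500, discount=1.0, step_size=0.1, step=2)
v_dp = RandomWalk(19).value_equiprobable(1.0)
print("n-step Sarsa estimate:", v_est)
print("DP solution:          ", v_dp)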
Example #5
def example_randomwalk():
    """ An example on random walk MDP """
    # create an MDP
    env = RandomWalk(19, -1)

    # create n-step TreeBackup agent
    agent = TreeBackup(env, env.init(), 3)
    agent2 = TreeBackup(env, env.init(), 3)

    # agent follows the equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 1000
    for iter in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()

    agent2.set_policy_eps_greedy(0.1)
    for iter in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()

    print('Q_DP[s][a]   ', env.Q_equiprobable(0.9))
    print('Q_eps_greedy[s][a]   ', env.Q_eps_greedy(0.1, 0.9))
    print('Equiprobable Q_TreeBackup[s][a]', agent.Q)
    print('Eps greedy   Q_TreeBackup[s][a]', agent2.Q)
Example #6
File: Qsigma.py  Project: DZ9/qsigma
def example_randomwalk():
  """ An example on random walk MDP """
  # create an MDP
  env = RandomWalk(19, -1)

  # create n-step QSigma agent
  agent = QSigma(env, 0.5, env.init(), 3) #Psigma=0.5, init_state=env.init(), steps=3
  agent2 = QSigma(env, 0.5, env.init(), 3)

  # agent follows the equiprobable random policy with discount = 0.9 and step size = 0.1
  num_episode = 1000
  for iter in range(num_episode):
    agent.episode(0.9, 0.1)
    agent.init()

  agent2.set_policy_eps_greedy(0.1)
  for iter in range(num_episode):
    agent2.episode(0.9, 0.1)
    agent2.init()

  print('Q_DP[s][a]   ', env.Q_equiprobable(0.9))
  print('Q_eps_greedy[s][a]   ', env.Q_eps_greedy(0.1, 0.9))
  print('Equiprobable Q_Q(sigma)[s][a]', agent.Q)
  print('Eps greedy   Q_Q(sigma)[s][a]', agent2.Q)
Example #7
def decay_agent(n=1, alpha=0.5, episodes=100, ep_start=30, decay=0.7):
    """ Run an agent for specified n-step Qsigma method with sigma decay"""
    mdp = RandomWalk(19, -1)
    s = mdp.init()
    num_runs = 250
    num_episodes = episodes
    discount = 1.0
    step_size = alpha
    steps = n

    # ground-truth Q values under the equiprobable policy
    Q_opt = mdp.Q_equiprobable(1.0)
    rms_err = 0.0

    # create n-step Qsigma agent
    agent = QSigma(mdp, 1.0, s, steps)
    agent.set_policy_equiprobable()

    for run in range(num_runs):
        sqerr = 0.0
        agent._Psigma = 1.0
        for i in range(num_episodes):
            if i > ep_start:
                agent._Psigma *= decay
            agent.episode(discount, step_size)
            agent.init()
        # incremental (running) mean of the squared Q errors over all state-action pairs
        count = 0
        for s in range(mdp.num_states()):
            for a in range(mdp.num_actions(s)):
                count += 1
                sqerr += (1 / count) * (
                    (agent.Q[s][a] - Q_opt[s][a])**2 - sqerr)
        rms_err += sqerr**0.5
        # Reset Q after a run
        agent.reset_Q()

    rms_err /= num_runs

    return rms_err
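A sketch of sweeping the decay rate with the helper above; the grid of values is illustrative only:

for decay in (0.5, 0.7, 0.9):
    err = decay_agent(n=3, alpha=0.4, episodes=100, ep_start=30, decay=decay)
    print("decay =", decay, "-> average RMS error =", err)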
Example #8
def run_agent_RMS_Q(num_runs, num_episodes, discount, step_size, step=1):
    """ Run SARSA agent for num_episodes to get the Q values """
    mdp = RandomWalk(19)
    s = mdp.init()

    # ground truth for Q
    gt_Q = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    gt_Q_left = gt_Q[:, 0]
    gt_Q_right = gt_Q[:, 1]

    # initial Q derived from a constant value estimate of 0.5 (terminal states at 0)
    v = np.asarray([0.5] * mdp.num_states())
    v[0], v[-1] = 0.0, 0.0
    init_Q = np.asarray(mdp.value_to_Q(v, discount)[1:-1])
    init_Q_left = init_Q[:, 0]
    init_Q_right = init_Q[:, 1]

    # Arrays for RMS error over all states
    rms_err_left = np.asarray([0.0] * (num_episodes + 1))  # Q[left]
    rms_err_right = np.asarray([0.0] * (num_episodes + 1))  # Q[right]
    sum_rms_err_left = np.asarray([0.0] * (num_episodes + 1))
    sum_rms_err_right = np.asarray([0.0] * (num_episodes + 1))
    rms_err_left[0] = np.sqrt(np.mean(np.square(init_Q_left - gt_Q_left)))
    rms_err_right[0] = np.sqrt(np.mean(np.square(init_Q_right - gt_Q_right)))

    # create n-step SARSA agent
    agent = Sarsa(mdp, s, step)
    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err_left[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 0] - gt_Q_left)))
            rms_err_right[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 1] - gt_Q_right)))
        sum_rms_err_left += rms_err_left
        sum_rms_err_right += rms_err_right
        # Reset Q after a run
        agent.reset_Q()
    # averaged over num_runs
    return sum_rms_err_left / num_runs, sum_rms_err_right / num_runs
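The two returned curves (left-action and right-action RMS error) can be inspected side by side; a hedged usage sketch with illustrative parameters:

err_left, err_right = run_agent_RMS_Q(num_runs=50, num_episodes=50,
                                      discount=1.0, step_size=0.4, step=4)
# Index 0 holds the error of the initial Q estimate; index i+1 the error after episode i.
print("final RMS error, Q[left]: ", err_left[-1])
print("final RMS error, Q[right]:", err_right[-1])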