Example #1
def run_agent_Q_RMS_param(num_runs, num_episodes, discount, step_size, step=1, agent_type="Sarsa"):
    """ Run the n-step Sarsa agent and return the avg Q-value RMS over episodes and runs """
    mdp = RandomWalk(19, -1)
    s = mdp.init()

    # ground-truth Q-values under the equiprobable policy (terminal states excluded)
    gt_v = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    # per-episode RMS error and its running sum over runs
    rms_err = np.zeros(num_episodes)
    sum_rms_err = 0.0

    # create n-step agent
    print("Starting agent {}-step {}".format(step, agent_type))
    if agent_type.lower() == "sarsa":
        agent = Sarsa(mdp, s, step)
    elif agent_type.lower() == "expsarsa":
        agent = ExpSARSA(mdp, s, step)
    elif agent_type.lower() == "treebackup":
        agent = TreeBackup(mdp, s, step)
    elif agent_type.lower() == "qsigma":
        agent = QSigma(mdp, 0.5, s, step)
    else:
        raise ValueError("Unknown agent type: {}".format(agent_type))

    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err[i] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1]) - gt_v)))
        sum_rms_err += np.sum(rms_err)
        # Reset Q after a run
        agent.reset_Q()

    # averaged over num_runs and num_episodes
    return sum_rms_err / (num_runs * num_episodes)
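
A minimal driver for the function above, comparing the four agent types at one parameter setting; this is a sketch only, assuming numpy and the project's RandomWalk and agent classes are already imported, with illustrative parameter values.

# hypothetical usage sketch, not part of the original file
for agent_type in ("Sarsa", "ExpSARSA", "TreeBackup", "QSigma"):
    avg_rms = run_agent_Q_RMS_param(num_runs=10, num_episodes=50, discount=1.0,
                                    step_size=0.4, step=3, agent_type=agent_type)
    print("{:>12}: average RMS error = {:.4f}".format(agent_type, avg_rms))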
Example #2
def decay_agent(n=1, alpha=0.5, episodes=100, ep_start=30, decay=0.7):
    """ Run an agent for specified n-step Qsigma method with sigma decay"""
    mdp = RandomWalk(19, -1)
    s = mdp.init()
    num_runs = 250
    num_episodes = episodes
    discount = 1.0
    step_size = alpha
    steps = n

    # ground-truth Q-values under the equiprobable policy, and RMS error accumulator
    Q_opt = mdp.Q_equiprobable(1.0)
    rms_err = 0.0

    # create n-step Qsigma agent
    agent = QSigma(mdp, 1.0, s, steps)
    agent.set_policy_equiprobable()

    for run in range(num_runs):
        sqerr = 0.0
        agent._Psigma = 1.0
        for i in range(num_episodes):
            if i > ep_start:
                agent._Psigma *= decay
            agent.episode(discount, step_size)
            agent.init()
        # incremental mean of squared errors over all (s, a) pairs
        count = 0
        for s in range(mdp.num_states()):
            for a in range(mdp.num_actions(s)):
                count += 1
                sqerr += (1 / count) * (
                    (agent.Q[s][a] - Q_opt[s][a])**2 - sqerr)
        rms_err += sqerr**0.5
        # Reset Q after a run
        agent.reset_Q()

    rms_err /= num_runs

    return rms_err
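
A possible way to exercise decay_agent over several decay rates; a sketch only, assuming the same imports as above, with an illustrative grid of values.

# hypothetical sweep over sigma decay rates for a fixed n and alpha
for decay in (0.5, 0.7, 0.9, 0.95):
    err = decay_agent(n=3, alpha=0.4, episodes=100, ep_start=30, decay=decay)
    print("decay = {:.2f}: average RMS error = {:.4f}".format(decay, err))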
Example #3
def run_agent_RMS_Q(num_runs, num_episodes, discount, step_size, step=1):
    """ Run SARSA agent for num_episodes to get the Q values """
    mdp = RandomWalk(19)
    s = mdp.init()

    # ground truth for Q
    gt_Q = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    gt_Q_left = gt_Q[:, 0]
    gt_Q_right = gt_Q[:, 1]

    # initial Q derived from a uniform value estimate of 0.5 (terminal values 0)
    v = np.asarray([0.5] * mdp.num_states())
    v[0], v[-1] = 0.0, 0.0
    init_Q = np.asarray(mdp.value_to_Q(v, discount)[1:-1])
    init_Q_left, init_Q_right = init_Q[:, 0], init_Q[:, 1]

    # per-episode RMS error for each action, plus accumulators over runs
    rms_err_left = np.zeros(num_episodes + 1)   # Q[left]
    rms_err_right = np.zeros(num_episodes + 1)  # Q[right]
    sum_rms_err_left = np.zeros(num_episodes + 1)
    sum_rms_err_right = np.zeros(num_episodes + 1)
    rms_err_left[0] = np.sqrt(np.mean(np.square(init_Q_left - gt_Q_left)))
    rms_err_right[0] = np.sqrt(np.mean(np.square(init_Q_right - gt_Q_right)))

    # create n-step SARSA agent
    agent = Sarsa(mdp, s, step)
    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err_left[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 0] - gt_Q_left)))
            rms_err_right[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 1] - gt_Q_right)))
        sum_rms_err_left += rms_err_left
        sum_rms_err_right += rms_err_right
        # Reset Q after a run
        agent.reset_Q()
    # averaged over num_runs
    return sum_rms_err_left / num_runs, sum_rms_err_right / num_runs
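
The two curves returned above lend themselves to a learning-curve plot; the following sketch assumes matplotlib is available, which the original file may not import.

# hypothetical plotting of the per-episode RMS curves
import matplotlib.pyplot as plt

rms_left, rms_right = run_agent_RMS_Q(num_runs=50, num_episodes=100,
                                      discount=1.0, step_size=0.4, step=3)
plt.plot(rms_left, label="RMS error of Q[left]")
plt.plot(rms_right, label="RMS error of Q[right]")
plt.xlabel("episode")
plt.ylabel("RMS error vs. equiprobable-policy Q")
plt.legend()
plt.show()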
Example #4
def example_randomwalk():
    """ An example on random walk MDP """
    # create an MDP
    env = RandomWalk(19, -1)

    # create n-step TreeBackup agent
    agent = TreeBackup(env, env.init(), 3)
    agent2 = TreeBackup(env, env.init(), 3)

    # act using equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 1000
    for _ in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()

    agent2.set_policy_eps_greedy(0.1)
    for _ in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()

    print('Q_DP[s][a]   ', env.Q_equiprobable(0.9))
    print('Q_eps_greedy[s][a]   ', env.Q_eps_greedy(0.1, 0.9))
    print('Equiprobable Q_TreeBackup[s][a]', agent.Q)
    print('Eps greedy   Q_TreeBackup[s][a]', agent2.Q)
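
To turn the printed comparison into a single number, a small helper (hypothetical, not part of the project) could report the RMS gap between a learned Q table and its DP reference:

import numpy as np

def q_rms_gap(learned_Q, reference_Q):
    """ RMS difference between two Q tables over non-terminal states (hypothetical helper) """
    diff = np.asarray(learned_Q)[1:-1] - np.asarray(reference_Q)[1:-1]
    return float(np.sqrt(np.mean(np.square(diff))))

# e.g. after training in example_randomwalk():
# print("equiprobable gap:", q_rms_gap(agent.Q, env.Q_equiprobable(0.9)))
# print("eps-greedy gap:  ", q_rms_gap(agent2.Q, env.Q_eps_greedy(0.1, 0.9)))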
Example #5
File: Qsigma.py  Project: DZ9/qsigma
def example_randomwalk():
    """ An example on random walk MDP """
    # create an MDP
    env = RandomWalk(19, -1)

    # create n-step QSigma agent
    agent = QSigma(env, 0.5, env.init(), 3)  # Psigma=0.5, init_state=env.init(), steps=3
    agent2 = QSigma(env, 0.5, env.init(), 3)

    # act using equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 1000
    for _ in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()

    agent2.set_policy_eps_greedy(0.1)
    for _ in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()

    print('Q_DP[s][a]   ', env.Q_equiprobable(0.9))
    print('Q_eps_greedy[s][a]   ', env.Q_eps_greedy(0.1, 0.9))
    print('Equiprobable Q_Q(sigma)[s][a]', agent.Q)
    print('Eps greedy   Q_Q(sigma)[s][a]', agent2.Q)
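
Q(sigma) interpolates between n-step Sarsa (always sample, sigma = 1) and Tree Backup (always take expectations, sigma = 0), so sweeping the Psigma constructor argument shows the whole family; a sketch assuming the same constructor signature as above, with illustrative settings.

# hypothetical sweep over Psigma: 1.0 behaves like n-step Sarsa, 0.0 like Tree Backup
env = RandomWalk(19, -1)
for psigma in (0.0, 0.25, 0.5, 0.75, 1.0):
    a = QSigma(env, psigma, env.init(), 3)
    for _ in range(1000):
        a.episode(0.9, 0.1)
        a.init()
    print("Psigma = {:.2f}".format(psigma), a.Q)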
Example #6
File: exp_rw.py  Project: DZ9/qsigma
n_runs = 100
n_eps = 100

ns = [1, 3, 5]
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#sigmas = [0.0, 0.25, 0.5, 0.75, 1.0, -0.95]
sigmas = [-0.9]

#  def __init__(self, steps=1, init_sigma=1.0,step_size=0.1, beta=1.0):
gamma = 0.9
max_steps = 10000

# ground-truth Q-values for the RMS error (terminal entries zeroed, then dropped)
mdp = RandomWalk(19, -1)
Q_opt = mdp.Q_equiprobable(gamma)
Q_opt[0] = [0, 0]
Q_opt[20] = [0, 0]
Q_opt = np.array(Q_opt)[1:-1]

if resume:
    R_final = pickle.load(open('rwQsig_R_final.p', 'rb'))
else:
    # R_final[steps, alpha, sigma, run, ep]
    R_final = np.zeros((len(ns), len(alphas), len(sigmas), n_runs, n_eps))

for n, steps in enumerate(ns):
    for a, alpha in enumerate(alphas):
        for s, sigma in enumerate(sigmas):
            for run in range(1, n_runs + 1):