t = 1.0   # decay counter for epsilon
t2 = 1.0  # decay counter for the learning rate
deltas = []
# repeat until convergence
for it in range(20000):
    if it % 100 == 0:
        t += 0.01
        t2 += 0.01
    if it % 1000 == 0:
        print("it:", it)
    alpha = ALPHA / t2
    # instead of 'generating' an episode, we will PLAY
    # an episode within this loop
    s = (2, 0)  # start state
    grid.set_state(s)

    # get Q(s) so we can choose the first action
    Qs = getQs(model, s)

    # the first (s, r) tuple is the state we start in and 0
    # (since we don't get a reward) for simply starting the game
    # the last (s, r) tuple is the terminal state and the final reward
    # the value for the terminal state is by definition 0, so we don't
    # care about updating it
    a = max_dict(Qs)[0]
    a = random_action(a, eps=0.5/t)  # epsilon-greedy
    biggest_change = 0
    while not grid.game_over():
        r = grid.move(a)
        s2 = grid.current_state()

        # we need the next action as well since Q(s,a) depends on Q(s',a')
        # if s2 not in policy then it's a terminal state, all Q are 0
        old_theta = model.theta.copy()
        if grid.is_terminal(s2):
            model.theta += alpha * (r - model.predict(s, a)) * model.grad(s, a)
        else:
            # not terminal
            Qs2 = getQs(model, s2)
            a2 = max_dict(Qs2)[0]
            a2 = random_action(a2, eps=0.5/t)  # epsilon-greedy
            # we update Q(s,a) AS we experience the episode, using the
            # semi-gradient SARSA target r + gamma * Q(s', a')
            model.theta += alpha * (r + GAMMA * model.predict(s2, a2)
                                    - model.predict(s, a)) * model.grad(s, a)

            # next state becomes current state
            s = s2
            a = a2

        # track how much theta moved this step so we can monitor convergence
        biggest_change = max(biggest_change, np.abs(model.theta - old_theta).sum())
    deltas.append(biggest_change)
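The loop above leans on names defined earlier in the script: grid (the gridworld environment), the constants ALPHA and GAMMA, a linear model exposing theta, predict, and grad, and the helpers getQs, max_dict, and random_action. Those definitions are not shown in this excerpt; what follows is a minimal sketch of plausible implementations, assuming numpy is imported as np, an action set ALL_POSSIBLE_ACTIONS, a hypothetical 3x4 grid (GRID_ROWS, GRID_COLS), and a one-hot (state, action) feature encoding. The real script may define these differently.

import numpy as np

# assumed constants -- the full script defines its own versions
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
GRID_ROWS, GRID_COLS = 3, 4  # hypothetical grid dimensions

def max_dict(d):
    # return the (key, value) pair with the largest value
    max_key, max_val = None, float('-inf')
    for k, v in d.items():
        if v > max_val:
            max_key, max_val = k, v
    return max_key, max_val

def random_action(a, eps=0.1):
    # epsilon-greedy: keep `a` with probability 1 - eps, else act randomly
    if np.random.random() < (1 - eps):
        return a
    return np.random.choice(ALL_POSSIBLE_ACTIONS)

def getQs(model, s):
    # evaluate Q(s, a) for every action so we can pick the greedy one
    return {a: model.predict(s, a) for a in ALL_POSSIBLE_ACTIONS}

class Model:
    # linear approximator: Q(s, a) = theta . x(s, a)
    def __init__(self):
        n = GRID_ROWS * GRID_COLS * len(ALL_POSSIBLE_ACTIONS)
        self.theta = np.random.randn(n) / np.sqrt(n)

    def sa2x(self, s, a):
        # hypothetical one-hot encoding of the (state, action) pair
        x = np.zeros(len(self.theta))
        i = (s[0] * GRID_COLS + s[1]) * len(ALL_POSSIBLE_ACTIONS) \
            + ALL_POSSIBLE_ACTIONS.index(a)
        x[i] = 1.0
        return x

    def predict(self, s, a):
        return self.theta.dot(self.sa2x(s, a))

    def grad(self, s, a):
        # for a linear model, the gradient w.r.t. theta is the feature vector
        return self.sa2x(s, a)

Note that with a full one-hot encoding, linear function approximation is just the tabular method in disguise; approximation only becomes interesting with a coarser feature map, which is presumably what the full script uses.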