# Backward pass of off-policy Monte Carlo control with weighted importance
# sampling (Sutton & Barto, "Off-policy MC Control"). Walks the episode from
# its tail, updating Q as a weighted running average of returns and making
# the target policy Pi greedy w.r.t. Q.
#
# NOTE(review): assumes episode[t] holds the state/action at step t and
# episode[t+1] holds the reward that followed that action, with the final
# entry (index T) being the terminal step — confirm against the
# episode-generation code.
T = len(episode) - 1  # index of the terminal entry; actions exist for t in [0, T-1]
G = 0.0  # discounted return accumulated from the tail of the episode
W = 1.0  # importance-sampling weight (target policy over behavior policy)
for t in range(T - 1, -1, -1):
    # Key variables from this episode step.
    St = episode[t].state
    At = episode[t].action
    Rt_1 = episode[t + 1].reward  # R_{t+1}: reward received after (St, At)

    # Weighted-average Q update: C(s,a) += W; Q(s,a) += (W / C(s,a)) * (G - Q(s,a)).
    G = (LAMBDA * G) + Rt_1
    Q.increment_count(St, At, W)
    Qs_a = Q.get(St, At)
    new_Qs_a = Qs_a + (W / Q.get_count(St, At)) * (G - Qs_a)
    Q.set(St, At, new_Qs_a)

    # Make the target policy greedy with respect to the updated Q-values.
    Pi.update(St, Q.get_max_action(St))

    # If the behavior action disagrees with the (now greedy) target policy,
    # the importance weight for all earlier steps would be zero — stop early.
    if At != Pi.get_action(St):
        break
    # Target policy is greedy (probability 1 for At here), so the ratio
    # contribution is 1 / b(At | St) under the soft behavior policy.
    W /= soft_policy.action_probability(St, At)

# Plot the per-step training rewards, then persist the learned policy.
# plt.plot accepts a range directly; no list materialization needed.
plt.plot(range(TRAIN_STEPS), rewards)
plt.show()
save_policy(Pi)