		T = len(episode) - 1  # index of the terminal step of this episode
		
		G = 0.0  # discounted return, built up from the end of the episode
		W = 1.0  # cumulative importance-sampling ratio
		
		for t in range(T-1, -1, -1):  # sweep the episode backwards
			# Unpack this step: episode[t] holds (S_t, A_t), and the resulting
			# reward R_{t+1} is stored on the following step.
			St = episode[t].state
			At = episode[t].action
			Rt_1 = episode[t+1].reward

			# Accumulate the discounted return (LAMBDA is the discount factor)
			# and the cumulative importance-sampling weight C(St, At).
			G = (LAMBDA * G) + Rt_1
			Q.increment_count(St, At, W)

			# Weighted-importance-sampling update:
			# Q(St, At) <- Q(St, At) + (W / C(St, At)) * (G - Q(St, At))
			Qs_a = Q.get(St, At)
			new_Qs_a = Qs_a + (W / Q.get_count(St, At)) * (G - Qs_a)
			Q.set(St, At, new_Qs_a)
			
			# Make the target policy greedy with respect to the updated Q-values.
			Pi.update(St, Q.get_max_action(St))
			
			# Stop processing this episode once the behavior action deviates from
			# the greedy target policy: pi(At|St) would be 0, so every earlier
			# step's importance weight would also be 0.
			if At != Pi.get_action(St):
				break
			
			# At matched the deterministic target policy, so the importance ratio
			# pi(At|St) / b(At|St) reduces to 1 / b(At|St).
			W /= soft_policy.action_probability(St, At)
	
	plt.plot(range(TRAIN_STEPS), rewards)
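	plt.xlabel("Episode")  # assuming rewards holds one value per training episode
	plt.ylabel("Reward")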
	plt.show()
	
	save_policy(Pi)
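
# The Q, Pi, and soft_policy helpers aren't shown in this snippet. Below is a
# minimal sketch of the tabular store this code appears to assume; the class
# and parameter names here are illustrative, not the author's.

from collections import defaultdict

class TabularQSketch:
	def __init__(self, actions):
		self.actions = list(actions)
		self.values = defaultdict(float)  # (state, action) -> Q estimate
		self.counts = defaultdict(float)  # (state, action) -> cumulative weight C

	def get(self, state, action):
		return self.values[(state, action)]

	def set(self, state, action, value):
		self.values[(state, action)] = value

	def increment_count(self, state, action, weight):
		self.counts[(state, action)] += weight

	def get_count(self, state, action):
		return self.counts[(state, action)]

	def get_max_action(self, state):
		# Greedy action under the current estimates.
		return max(self.actions, key=lambda a: self.values[(state, a)])

# Likewise, soft_policy.action_probability must return b(a|s) > 0 for every
# action, or the division above can fail. An epsilon-greedy behavior policy
# built on the same Q table would compute it like this (hypothetical helper):
def epsilon_greedy_probability(q, state, action, epsilon, n_actions):
	# Every action gets epsilon / n_actions; the greedy action gets the rest.
	p = epsilon / n_actions
	if action == q.get_max_action(state):
		p += 1.0 - epsilon
	return p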