def run_value_iteration(
    transition_probabilities=np.ones((5, 2)) * 0.5,
    rewards=[1, 0, 0, 0, 10],
    horizon=10,
    gamma=0.9,
):
    v = np.zeros(5)
    env = MarsRover(transition_probabilities, rewards, horizon)
    done = False
    state = env.reset()
    i = 0
    while not done:
        i += 1
        print(f"This is step {i}")
        # Clamp the neighbour indices so they stay on the 5-state grid:
        # rewards[state + 1] would raise an IndexError at the right edge,
        # and rewards[state - 1] silently wraps to the last state at the left.
        left = max(state - 1, 0)
        right = min(state + 1, 4)
        # One-step lookahead: value of moving left (action 0) vs. right (action 1).
        r1 = rewards[left] + gamma * v[left]
        r2 = rewards[right] + gamma * v[right]
        action = np.argmax([r1, r2])
        if r1 == r2:
            # Break ties randomly so the rover does not get stuck.
            action = np.random.randint(2)
        new_state, reward, done = env.step(action)
        v, converged = update_value_function(v, state, new_state, reward)
        if converged:
            break
        state = new_state
    final_reward = evaluate_agent(v, env)
    print(
        f"Your agent achieved a final accumulated reward of {final_reward} "
        f"after {i} update steps."
    )
    return v, i, final_reward
def run_policy_iteration(
    transition_probabilities=np.ones((5, 2)),
    rewards=[1, 0, 0, 0, 10],
    horizon=10,
):
    env = MarsRover(transition_probabilities, rewards, horizon)
    qs = np.zeros((5, 2))
    # Start from a random policy: one action (0 = left, 1 = right) per state.
    pi = np.random.randint(0, 2, size=5)
    done = False
    state = env.reset()
    i = 0
    while not done:
        i += 1
        print(f"This is step {i}")
        action = pi[state]
        new_state, reward, done = env.step(action)
        qs, pi, converged = update_policy(qs, pi, state, new_state, action, reward)
        if converged:
            break
        state = new_state
    final_reward = evaluate_policy(pi, env)
    print(
        f"Your policy achieved a final accumulated reward of {final_reward} "
        f"after {i} update steps."
    )
    return pi, i, final_reward
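# A minimal usage sketch: run both solvers on the default rover and compare
# how many update steps each needs. This helper is illustrative only; the
# names used above (MarsRover, update_value_function, update_policy,
# evaluate_agent, evaluate_policy) are assumed to be defined elsewhere in
# this module.
def compare_solvers():
    _, vi_steps, vi_reward = run_value_iteration()
    _, pi_steps, pi_reward = run_policy_iteration()
    print(
        f"Value iteration: reward {vi_reward} in {vi_steps} steps; "
        f"policy iteration: reward {pi_reward} in {pi_steps} steps."
    )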
def evaluate_policy_dp(
    pi=np.random.randint(2, size=5),
    transition_probabilities=np.ones((5, 2)),
    rewards=[1, 0, 0, 0, 10],
    gamma=0.9,
    epsilon=1e-8,
):
    # gamma defaults to the value used in run_value_iteration; epsilon is an
    # assumed convergence threshold for the sweep below.
    env = MarsRover(transition_probabilities=transition_probabilities, rewards=rewards)
    v = np.zeros(5)
    i = 0
    while True:
        i += 1
        # One sweep of the Bellman expectation backup under pi. Assumed
        # dynamics (not checked against MarsRover): action 0 moves left,
        # action 1 right, clamped at the grid edges; the move succeeds with
        # probability transition_probabilities[s, a], otherwise it is reversed.
        v_new = np.zeros(5)
        for s in range(5):
            hit = max(s - 1, 0) if pi[s] == 0 else min(s + 1, 4)
            miss = min(s + 1, 4) if pi[s] == 0 else max(s - 1, 0)
            p = transition_probabilities[s, pi[s]]
            v_new[s] = p * (rewards[hit] + gamma * v[hit]) + (1 - p) * (
                rewards[miss] + gamma * v[miss]
            )
        if np.max(np.abs(v_new - v)) < epsilon:  # values have converged
            v = v_new
            break
        v = v_new
    print(f"Policy was evaluated in {i} steps with resulting v {v}")
    return v, i
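# Usage sketch, assuming this file is run directly: compare the two solvers,
# then evaluate the deterministic "always drive right" policy with the DP
# routine above.
if __name__ == "__main__":
    compare_solvers()
    evaluate_policy_dp(pi=np.ones(5, dtype=int))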