def play():
    # Greedy (pure exploitation) run: always follow the current best action,
    # applying the one-step Q-learning update along the way.
    global Q
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    total_rewards = 0
    while current_position != [10, 10]:
        total_time_steps += 1
        current_position_index = state_get_index(current_position)
        optimal_action = action_get_index(current_position_index, Q)
        next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action, p1, p2)
        next_position_index = state_get_index(next_position)
        if next_position == [10, 10]:
            total_rewards = 500
        else:
            total_rewards = -1
        # Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        # The discount applies only to the next-state value, not to the whole TD error.
        best_next_action = action_get_index(next_position_index, Q)
        Q[current_position_index][next_action] += ALPHA * (
            total_rewards
            + GAMMA * Q[next_position_index][best_next_action]
            - Q[current_position_index][next_action])
        current_position = next_position
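# action_get_index() is not shown in this section; from how it is called it
# appears to return the greedy (highest-valued) action index for a state. A
# hypothetical stand-in, assuming Q is a list of per-state action-value rows:
def action_get_index_sketch(position_index, Q):
    row = Q[position_index]
    return row.index(max(row))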
def play():
    # Q-learning with epsilon-greedy exploration: with probability EPSILON take a
    # random non-greedy action, otherwise follow the greedy one.
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    optimal_action = action_get_index(current_position_index, Q)
    total_rewards = 0
    while current_position != [10, 10]:
        total_time_steps += 1
        randomizer = random.random()
        if randomizer <= EPSILON:
            # Explore: redraw until the action differs from the greedy one.
            next_action = random.randint(0, 3)
            while next_action == optimal_action:
                next_action = random.randint(0, 3)
        else:
            next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action, p1, p2)
        next_position_index = state_get_index(next_position)
        if next_position == [10, 10]:
            total_rewards = 500
        else:
            total_rewards = -1
        # Off-policy target: discount only the greedy next-state value.
        best_next_action = action_get_index(next_position_index, Q)
        Q[current_position_index][next_action] += ALPHA * (
            total_rewards
            + GAMMA * Q[next_position_index][best_next_action]
            - Q[current_position_index][next_action])
        current_position = next_position
        current_position_index = next_position_index
        optimal_action = action_get_index(current_position_index, Q)
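# For reference, a self-contained toy check of the Q-learning target used above.
# All numbers here are made up; ALPHA, GAMMA and Q are local to the example.
def q_learning_update_example():
    ALPHA, GAMMA = 0.1, 0.9
    Q = [[0.0, 1.0, 0.5, 0.0],   # Q-row of state 0
         [2.0, 0.0, 0.0, 1.5]]   # Q-row of state 1
    state, action, reward, next_state = 0, 2, -1, 1
    # The discount applies only to the best next-state value.
    target = reward + GAMMA * max(Q[next_state])
    Q[state][action] += ALPHA * (target - Q[state][action])
    return Q[state][action]      # 0.5 + 0.1 * (-1 + 0.9 * 2.0 - 0.5) = 0.53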
def play():
    # Expected SARSA: the target uses the expected Q-value of the next state
    # under the epsilon-greedy behaviour policy instead of the greedy maximum.
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    optimal_action = action_get_index(current_position_index, Q)
    total_rewards = 0
    while current_position != [10, 10]:
        total_time_steps += 1
        randomizer = random.random()
        if randomizer <= EPSILON:
            next_action = random.randint(0, 3)
            while next_action == optimal_action:
                next_action = random.randint(0, 3)
        else:
            next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action, p1, p2)
        next_position_index = state_get_index(next_position)
        if next_position == [10, 10]:
            total_rewards = 500
        else:
            total_rewards = -1
        # Expected value of Q(s', .) under the behaviour policy: the greedy action
        # is taken with probability 1 - EPSILON, each remaining action with
        # probability EPSILON / (len(a) - 1), matching the selection rule above.
        greedy_action = action_get_index(next_position_index, Q)
        V = 0
        for x in range(len(a)):
            if x != greedy_action:
                V += (EPSILON / (len(a) - 1)) * Q[next_position_index][x]
            else:
                V += (1 - EPSILON) * Q[next_position_index][x]
        Q[current_position_index][next_action] += ALPHA * (
            total_rewards + GAMMA * V - Q[current_position_index][next_action])
        current_position = next_position
        current_position_index = next_position_index
        optimal_action = action_get_index(current_position_index, Q)
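# Sanity check for the expected-value target above: under an epsilon-greedy
# policy the action weights must sum to 1. Self-contained, with made-up numbers.
def expected_sarsa_target_example():
    EPSILON = 0.1
    q_row = [2.0, 0.0, 1.0, -1.0]     # Q(s', .) for some next state
    greedy = q_row.index(max(q_row))
    n = len(q_row)
    V = sum(((1 - EPSILON) if i == greedy else EPSILON / (n - 1)) * q
            for i, q in enumerate(q_row))
    return V                          # 0.9 * 2.0 + (0.1 / 3) * (0.0 + 1.0 - 1.0) = 1.8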
def play(current_position, action_according_to_policy, set_of_states, set_of_actions):
    # On-policy episode generation: follow (and update) the epsilon-soft policy POL,
    # record every visited state/action pair, and hand the completed episode to
    # update_values() once the goal state [10, 10] is reached.
    global total_time_steps
    # The episode trace is rebuilt from scratch, so any lists passed in are ignored.
    set_of_states = []
    set_of_actions = []
    set_of_states.append(current_position[:])
    set_of_actions.append(action_according_to_policy)
    while current_position != [10, 10]:
        total_time_steps += 1
        current_position = probability_getter(current_position, action_according_to_policy, p1, p2)
        current_position_index = state_get_index(current_position)
        set_of_states.append(current_position[:])
        randomize_action = random.random()
        if randomize_action <= (1 - EPSILON + (EPSILON / 4)):
            # Keep the policy's action with the epsilon-soft greedy probability...
            next_action = POL[current_position_index]
        else:
            # ...otherwise explore with an action different from the policy's choice.
            next_action = random.randint(0, 3)
            while next_action == POL[current_position_index]:
                next_action = random.randint(0, 3)
        POL[current_position_index] = next_action
        action_according_to_policy = POL[current_position_index]
        set_of_actions.append(action_according_to_policy)
    update_values(set_of_states, set_of_actions)
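# update_values() is defined elsewhere and not shown here. A plausible every-visit
# Monte Carlo sketch, assuming the same reward scheme as the other variants
# (-1 per step, 500 at the goal) and reusing this code's globals Q, ALPHA, GAMMA
# and state_get_index(); this is an assumption, not the code actually used.
def update_values_sketch(set_of_states, set_of_actions):
    G = 0
    # Walk the transitions backwards, accumulating the discounted return.
    for t in range(len(set_of_states) - 2, -1, -1):
        reward = 500 if set_of_states[t + 1] == [10, 10] else -1
        G = reward + GAMMA * G
        idx = state_get_index(set_of_states[t])
        Q[idx][set_of_actions[t]] += ALPHA * (G - Q[idx][set_of_actions[t]])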