Example #1
def play():
    # One Q-learning episode: start in a random state and follow the greedy
    # policy until the goal state [10, 10] is reached.
    global Q
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    while current_position != [10, 10]:
        total_time_steps += 1
        current_position_index = state_get_index(current_position)
        optimal_action = action_get_index(current_position_index, Q)
        next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action,
                                           p1, p2)
        next_position_index = state_get_index(next_position)
        # Reward: 500 for reaching the goal, -1 for every other step.
        if next_position == [10, 10]:
            reward = 500
        else:
            reward = -1
        # Q-learning update: Q(s, a) += alpha * (r + gamma * max Q(s', .) - Q(s, a)).
        best_next_action = action_get_index(next_position_index, Q)
        Q[current_position_index][next_action] += ALPHA * (
            reward + GAMMA * Q[next_position_index][best_next_action]
            - Q[current_position_index][next_action])
        current_position = next_position
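All four examples depend on module-level state and helpers that are not shown: the state list s, the action list a, the Q table, the policy table POL, the constants ALPHA, GAMMA, EPSILON, p1 and p2, and the functions state_get_index, action_get_index, probability_getter and update_values. A minimal sketch of plausible definitions follows so the snippets can be tried in isolation; every name, constant and transition rule in it is an assumption, not the original module.

import random

# Assumed hyperparameters and transition-noise levels.
ALPHA, GAMMA, EPSILON = 0.1, 0.9, 0.1
p1, p2 = 0.8, 0.1

s = [[x, y] for x in range(11) for y in range(11)]  # 11x11 grid, goal at [10, 10]
a = [0, 1, 2, 3]                                    # up, down, left, right
Q = [[0.0] * len(a) for _ in s]                     # action values per state
POL = [0] * len(s)                                  # per-state policy for Example #4
total_time_steps = 0

def state_get_index(position):
    # Map an [x, y] state back to its row in Q.
    return s.index(position)

def action_get_index(state_index, q_table):
    # Index of the greedy action in the given state.
    row = q_table[state_index]
    return row.index(max(row))

def probability_getter(position, action, p1, p2):
    # Guessed dynamics: the intended move with probability p1, a random slip
    # with probability p2, otherwise stay put; moves are clipped to the grid.
    moves = [[0, 1], [0, -1], [-1, 0], [1, 0]]
    r = random.random()
    if r <= p1:
        move = moves[action]
    elif r <= p1 + p2:
        move = random.choice(moves)
    else:
        move = [0, 0]
    return [min(max(position[0] + move[0], 0), 10),
            min(max(position[1] + move[1], 0), 10)]

With this scaffolding, Examples #2 and #3 can be called directly; Example #1 follows a purely greedy policy, so on an untrained Q it only reaches the goal through random slips and may take a very long time.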
Example #2
def play():
    # One Q-learning episode with epsilon-greedy behavior: with probability
    # EPSILON a non-greedy action is forced, otherwise the greedy one is taken.
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    optimal_action = action_get_index(current_position_index, Q)
    while current_position != [10, 10]:
        total_time_steps += 1
        if random.random() <= EPSILON:
            # Explore: draw uniformly among the non-greedy actions.
            next_action = random.randint(0, 3)
            while next_action == optimal_action:
                next_action = random.randint(0, 3)
        else:
            next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action, p1, p2)
        next_position_index = state_get_index(next_position)
        # Reward: 500 for reaching the goal, -1 for every other step.
        if next_position == [10, 10]:
            reward = 500
        else:
            reward = -1
        # Off-policy update: the target uses the greedy action in the next state.
        best_next_action = action_get_index(next_position_index, Q)
        Q[current_position_index][next_action] += ALPHA * (
            reward + GAMMA * Q[next_position_index][best_next_action]
            - Q[current_position_index][next_action])
        current_position = next_position
        current_position_index = next_position_index
        optimal_action = action_get_index(current_position_index, Q)
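The exploration step that Examples #2 and #3 repeat inline can be lifted into a small helper. This is not part of the original module, just a sketch of the same behavior under the scaffolding above:

def epsilon_greedy(state_index, q_table, epsilon):
    # Matches the examples' scheme: with probability epsilon a non-greedy
    # action is forced, otherwise the greedy action is returned.
    greedy = action_get_index(state_index, q_table)
    if random.random() <= epsilon:
        action = random.randint(0, 3)
        while action == greedy:
            action = random.randint(0, 3)
        return action
    return greedy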
Example #3
def play():
    # One expected-SARSA episode: the update target is the expectation of
    # Q(s', .) under the epsilon-greedy policy rather than a sampled action.
    global total_time_steps
    current_position_index = random.randint(0, len(s) - 1)
    current_position = s[current_position_index][:]
    optimal_action = action_get_index(current_position_index, Q)
    while current_position != [10, 10]:
        total_time_steps += 1
        if random.random() <= EPSILON:
            # Explore: draw uniformly among the non-greedy actions.
            next_action = random.randint(0, 3)
            while next_action == optimal_action:
                next_action = random.randint(0, 3)
        else:
            next_action = optimal_action
        next_position = probability_getter(current_position[:], next_action, p1, p2)
        next_position_index = state_get_index(next_position)
        # Reward: 500 for reaching the goal, -1 for every other step.
        if next_position == [10, 10]:
            reward = 500
        else:
            reward = -1
        # Expectation of Q(s', .) under the epsilon-greedy policy: the greedy
        # action keeps probability 1 - EPSILON, and EPSILON is split evenly
        # over the remaining len(a) - 1 actions.
        greedy_action = action_get_index(next_position_index, Q)
        V = 0
        for x in range(len(a)):
            if x != greedy_action:
                V += (EPSILON / (len(a) - 1)) * Q[next_position_index][x]
            else:
                V += (1 - EPSILON) * Q[next_position_index][x]
        Q[current_position_index][next_action] += ALPHA * (
            reward + GAMMA * V - Q[current_position_index][next_action])
        current_position = next_position
        current_position_index = next_position_index
        optimal_action = action_get_index(current_position_index, Q)
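The inner loop of Example #3 computes the expected-SARSA target r + gamma * E[Q(s', A')]. Written as a standalone function (a hypothetical helper, not in the original), the expectation is:

def expected_q(state_index, q_table, epsilon):
    # E[Q(s', A')] under the epsilon-greedy policy: weight 1 - epsilon on the
    # greedy action, epsilon split evenly over the others.
    row = q_table[state_index]
    greedy = row.index(max(row))
    explore_p = epsilon / (len(row) - 1)
    return sum((1 - epsilon) * q if x == greedy else explore_p * q
               for x, q in enumerate(row))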
Example #4
def play(current_position, action_according_to_policy, set_of_states,
         set_of_actions):
    # One on-policy episode: roll out the current policy POL from the given
    # state, record the visited states and chosen actions, then hand the
    # trajectory to update_values().
    global total_time_steps
    # The lists passed in are discarded; the trajectory is rebuilt from scratch.
    set_of_states = []
    set_of_actions = []
    set_of_states.append(current_position[:])
    set_of_actions.append(action_according_to_policy)

    while current_position != [10, 10]:
        total_time_steps += 1

        current_position = probability_getter(current_position,
                                              action_according_to_policy, p1,
                                              p2)
        current_position_index = state_get_index(current_position)
        set_of_states.append(current_position[:])

        # Epsilon-greedy over the 4 actions: keep the policy action with
        # probability 1 - EPSILON + EPSILON / 4, otherwise draw a different one.
        if random.random() <= (1 - EPSILON + (EPSILON / 4)):
            next_action = POL[current_position_index]
        else:
            next_action = random.randint(0, 3)
            while next_action == POL[current_position_index]:
                next_action = random.randint(0, 3)

        POL[current_position_index] = next_action
        action_according_to_policy = POL[current_position_index]
        set_of_actions.append(action_according_to_policy)

    update_values(set_of_states, set_of_actions)
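update_values() is defined nowhere in these examples, so the following is only a guess at its contract: an every-visit Monte Carlo backup over the recorded trajectory, reusing the reward scheme of the TD examples above.

def update_values(set_of_states, set_of_actions):
    # Hypothetical sketch. Both lists have equal length; the final action,
    # chosen after the goal was already reached, is unused.
    G = 0.0
    # Walk the trajectory backwards, accumulating discounted returns.
    for t in range(len(set_of_states) - 2, -1, -1):
        reward = 500 if set_of_states[t + 1] == [10, 10] else -1
        G = reward + GAMMA * G          # return from step t onward
        i = state_get_index(set_of_states[t])
        action = set_of_actions[t]
        Q[i][action] += ALPHA * (G - Q[i][action])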