Example #1
            epsilon = settings.initial_epsilon - (2 * episode /
                                                  float(settings.episodes))
        else:
            # Final epsilon reached, stop annealing.
            epsilon = settings.final_epsilon

        # Select action
        if np.random.random() < epsilon:
            # Choose random action
            action = np.random.randint(0, 4)
        else:
            # Choose the action with the highest Q-value
            action = np.argmax(q_values)

        # Take the action, then observe the reward and check whether the state is terminal
        _, reward, terminal = env.perform_action(action)

        # Save values for stats
        epsilon_arr.append(epsilon)
        reward_arr.append(reward)
        q_max_arr.append(q_max)

        # Get the new state's Q-values
        new_state_row = env.actor_state_row()
        q_values_new = q_table.get_state_q(new_state_row)
        # Get max(Q(s',a')) to update Q(s,a)
        q_max_new = np.max(q_values_new)

        if not terminal:
            # Non-terminal state, update with reward + gamma * max(Q(s', a'))
            update = reward + (settings.gamma * q_max_new)
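
Taken together, the steps above (annealed epsilon, epsilon-greedy action choice, and the TD target) make up one step of tabular Q-learning: Q(s, a) is moved toward r + gamma * max_a' Q(s', a'). Below is a minimal self-contained sketch of that step; the learning rate alpha, integer state indices, and the env.perform_action interface are illustrative assumptions, not guarantees about the original code:

import numpy as np

def q_learning_step(q_table, env, state, episode, settings, alpha=0.1):
    # Linear annealing as in the fragment above; clamping with max() stands
    # in for the "stop annealing" condition the fragment omits (an assumption)
    epsilon = settings.initial_epsilon - (2 * episode / float(settings.episodes))
    epsilon = max(epsilon, settings.final_epsilon)

    # Epsilon-greedy selection over the four actions
    if np.random.random() < epsilon:
        action = np.random.randint(0, 4)
    else:
        action = np.argmax(q_table[state])

    # env.perform_action is assumed to return (new_state, reward, terminal)
    new_state, reward, terminal = env.perform_action(action)

    # TD target: reward alone if terminal, else reward + gamma * max(Q(s', a'))
    if terminal:
        update = reward
    else:
        update = reward + settings.gamma * np.max(q_table[new_state])

    # Move Q(s, a) toward the target by the learning rate alpha
    q_table[state, action] += alpha * (update - q_table[state, action])
    return new_state, terminal

The max() clamp reproduces the "Final epsilon reached, stop annealing" branch without needing the opening condition that the fragment cuts off.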
Example #2
            epsilon = settings.initial_epsilon - (2 * episode /
                                                  float(settings.episodes))
        else:
            # Final epsilon reached, stop annealing.
            epsilon = settings.final_epsilon

        # Select action
        if np.random.random() < epsilon:
            # Choose random action
            action = np.random.randint(0, 4)
        else:
            # Choose the action with the highest Q-value
            action = np.argmax(q_values)

        # Take the action, then observe the new state and reward and check whether the state is terminal
        new_state, reward, terminal = env.perform_action(action)

        # Pop the oldest state and unpack the remaining states into a tuple
        new_states_tuple = tuple(np.delete(states, 0, 0))

        # Add the new state to the tuple and stack into the new state window
        new_states_tuple += (new_state.reshape(1, input_size), )
        new_states = np.stack(new_states_tuple)

        # Get the new state's Q-values
        q_values_new = rnn_network.predict(new_states)
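
The delete-and-stack sequence above maintains a fixed-length window of recent states as input to the recurrent network. Assuming states has shape (window, 1, input_size), which the fragment implies but never states, a sketch of the same window update as a single concatenation might look like this:

import numpy as np

def slide_window(states, new_state, input_size):
    # Drop the oldest entry and append the newest along axis 0,
    # preserving the assumed (window, 1, input_size) shape
    newest = new_state.reshape(1, input_size)[np.newaxis]
    return np.concatenate((states[1:], newest), axis=0)

# Equivalent to the tuple-and-stack version above:
# new_states = slide_window(states, new_state, input_size)
# q_values_new = rnn_network.predict(new_states)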