Example #1
    def pick_optimal_action(self, state, printing=False):
        """
          Compute the best action to take in a state.  Note that if there
          are no legal actions, which is the case at the terminal state,
          you should return None.
        """

        if state not in self.q_table:
            self.q_table[state] = {
                key: 0.0
                for key in Action.get_all_actions()
            }

        max_value = max(self.q_table[state].values())
        actions = [
            key for key in self.q_table[state]
            if self.q_table[state][key] == max_value
        ]

        if printing:
            print(state)
            print(self.q_table[state])
            print(hash(state))

        return random.choice(actions)
Example #2
    def train(self, level='level-0', num_episodes=10):
        game = Game(level)
        discount = 0.8
        alpha = 0.2

        for i in range(num_episodes):
            current_game_state = deepcopy(game.initial_game_state)

            episode_done = False
            while not episode_done:
                if i % 50 == 0:
                    print("Iteration number", i)
                action = self.pick_action(current_game_state)
                new_game_state, action_event = get_next_game_state_from_action(current_game_state, action.name)

                if action_event == ActionEvent.WON or action_event == ActionEvent.LOST:
                    episode_done = True
                    if action_event == ActionEvent.WON:
                        print("Won!!")

                reward = calculate_reward_for_move(action_event)

                if current_game_state not in self.q_table:
                    self.q_table[current_game_state] = {key: 0.0 for key in Action.get_all_actions()}

                # Tabular Q-learning update:
                # Q(s, a) <- Q(s, a) + alpha * (r + discount * max_a' Q(s', a') - Q(s, a))
                self.q_table[current_game_state][action] += alpha * (
                    reward
                    + discount * self.compute_max_q_value(new_game_state)
                    - self.q_table[current_game_state][action]
                )

                current_game_state = new_game_state

        save_pickle('./q_table', self.q_table, True)
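For reference, the update applied inside the loop above is the standard tabular Q-learning rule,

    Q(s, a) ← Q(s, a) + α · (r + γ · max_a' Q(s', a') − Q(s, a))

with learning rate α = alpha = 0.2 and discount factor γ = discount = 0.8 in this snippet; compute_max_q_value supplies the max_a' Q(s', a') term.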
Example #3
 def pick_action(self, game_state):
     exploration_prob = 0.20
     if exploration_prob > np.random.rand():
         # Explore
         return np.random.choice(Action.get_all_actions())
     else:
         # Exploit
         return self.pick_optimal_action(game_state)
Example #4
 def pick_action(self, game_state, i):
     exploration_prob = 4 / ((i + 1)**(1 / 2.2))
     if exploration_prob > np.random.rand():
         # Explore
         return np.random.choice(Action.get_all_actions())
     else:
         # Exploit
         return self.pick_optimal_action(game_state)
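This variant decays the exploration probability with the episode index i as ε(i) = 4 / (i + 1)^(1/2.2): roughly the first 20 episodes are purely exploratory (ε ≥ 1), ε drops below 0.5 around episode 100, and it is still above 0.15 after 1000 episodes. A minimal standalone sketch of the schedule (no game dependencies; the episode counts are just illustrative):

    # Epsilon decay schedule used above: eps(i) = 4 / (i + 1) ** (1 / 2.2)
    for i in [0, 20, 100, 500, 1000]:
        eps = min(4 / ((i + 1) ** (1 / 2.2)), 1.0)  # cap at 1.0; any value >= 1 means "always explore"
        print(f"episode {i}: exploration probability {eps:.3f}")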
Example #5
def pick_action(game_state):
    # TODO: Epsilon greedy
    exploration_prob = 1.0  # always explore until exploitation is implemented
    if exploration_prob > np.random.rand():
        # Explore: pick a uniformly random action
        return np.random.choice(Action.get_all_actions())
    else:
        # Exploit: not implemented yet. With exploration_prob == 1.0 this branch is
        # never reached; as written it would fall through and return None.
        print("Exploit")
Example #6
    def compute_max_q_value(self, state):
        """
          Returns max_action Q(state,action)
          where the max is over legal actions.  Note that if
          there are no legal actions, which is the case at the
          terminal state, you should return a value of 0.0.
        """

        if state not in self.q_table:
            self.q_table[state] = {key: 0.0 for key in Action.get_all_actions()}

        return max(self.q_table[state].values())
Example #7
    def __init__(self):
        self.rewards = []  # Fix reward structure
        self.states = []  # Fix state structure

        self.model = Sequential()

        self.model.add(
            Conv2D(filters=32,
                   kernel_size=[4, 4],
                   # Conv2D needs a channels dimension; a single channel is assumed here.
                   input_shape=(7, 20, 1),
                   activation='relu'))  # TODO: Get shape dynamically

        # TODO: Batch Normalization?
        self.model.add(Flatten())
        self.model.add(Dense(512))

        output_layer_length = len(Action.get_all_actions())
        self.model.add(Dense(output_layer_length))  # Output layer

        self.model.compile(
            optimizer='adam',
            loss='mse',
        )
Example #8
    def train(self, level, num_training_episodes, batch_size, gamma=0.9):

        initial_game_state = initialize_gamestate_from_file(level)
        tot_loss = {}
        memory = Memory(max_size=5000)

        for i in range(1, num_training_episodes):

            loss = 0.
            num_episode_steps = 0

            done = False
            current_game_state = deepcopy(initial_game_state)

            while not done:
                if num_episode_steps > 1000:
                    break

                action = self.pick_action(current_game_state, i)
                next_game_state, action_event = get_next_game_state_from_action(
                    current_game_state, action.name)

                if action_event == ActionEvent.WON or action_event == ActionEvent.LOST:
                    done = True
                    if action_event == ActionEvent.WON:
                        print("Won!!")
                    else:
                        print("Lost")

                reward = calculate_reward_for_move(action_event)

                experience = Experience(
                    current_state=self.convert_state_to_input(
                        current_game_state),
                    action=action,
                    reward=reward,
                    next_state=self.convert_state_to_input(next_game_state),
                    done=done)
                memory.add(experience)

                batch = memory.get_mini_batch(batch_size=batch_size)

                # Dimensions of our observed states, i.e., the input to our model.
                input_dim = batch[0].current_state.shape[1]
                x_train = np.zeros((min(memory.get_size(),
                                        batch_size), input_dim))
                y_train = np.zeros(
                    (x_train.shape[0],
                     len(Action.get_all_actions())))  # Target Q-value

                sample: Experience
                for j, sample in enumerate(batch):
                    y_target = self.model.predict(sample.current_state)[0]

                    x_train[j:j + 1] = sample.current_state
                    if sample.done:
                        y_target[sample.action.value] = sample.reward
                    else:
                        y_target[sample.action.value] = sample.reward + gamma * np.max(
                            self.model.predict(sample.next_state))
                    y_train[j] = y_target

                batch_loss = self.model.train_on_batch(x_train, y_train)

                loss += batch_loss

                num_episode_steps += 1

                current_game_state = deepcopy(next_game_state)

            print(i)
            print(loss / num_episode_steps)

            tot_loss[i] = (loss / num_episode_steps)

            if i % 500 == 0:
                self.model.save('./nn_model' + str(i) + '.h5')

        print(tot_loss)

        # plot_training_history(tot_loss)

        self.model.save('./nn_model.h5')
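The per-sample target assembled in the replay loop above is the usual one-step DQN target: y = r for a terminal transition, and y = r + γ · max_a' Q(s', a') otherwise, written into the slot of the action actually taken while the remaining action slots keep the network's own predictions. Note that the bootstrap value comes from the same network being trained; this snippet does not use a separate target network.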
Example #9
 def pick_optimal_action(self, state):
     q = self.model.predict(self.convert_state_to_input(state))
     return Action.get_all_actions()[np.argmax(q[0])]
Example #10
    def __init__(self, level):
        initial_game_state = initialize_gamestate_from_file(level)

        self.input_size = self.convert_state_to_input(initial_game_state).size
        self.num_actions = len(Action.get_all_actions())
        self.model = self.init_model()