def pick_optimal_action(self, state, printing=False):
    """
    Compute the best action to take in a state. Note that if there are no
    legal actions, which is the case at the terminal state, you should
    return None.
    """
    if state not in self.q_table:
        self.q_table[state] = {key: 0.0 for key in Action.get_all_actions()}

    max_value = max(self.q_table[state].values())
    # Break ties randomly between all actions that share the maximum Q-value.
    actions = [
        key for key in self.q_table[state]
        if self.q_table[state][key] == max_value
    ]

    if printing:
        print(state)
        print(self.q_table[state])
        print(state.__hash__())

    return random.choice(actions)
def train(self, level='level-0', num_episodes=10):
    game = Game(level)
    discount = 0.8
    alpha = 0.2

    for i in range(num_episodes):
        if i % 50 == 0:
            print("Iteration number", i)

        current_game_state = deepcopy(game.initial_game_state)
        episode_done = False

        while not episode_done:
            action = self.pick_action(current_game_state)
            new_game_state, action_event = get_next_game_state_from_action(
                current_game_state, action.name)

            if action_event == ActionEvent.WON or action_event == ActionEvent.LOST:
                episode_done = True
                if action_event == ActionEvent.WON:
                    print("Won!!")

            reward = calculate_reward_for_move(action_event)

            if current_game_state not in self.q_table:
                self.q_table[current_game_state] = {
                    key: 0.0 for key in Action.get_all_actions()
                }

            # Standard Q-learning update:
            # Q(s, a) <- Q(s, a) + alpha * (r + discount * max_a' Q(s', a') - Q(s, a))
            self.q_table[current_game_state][action] = (
                self.q_table[current_game_state][action]
                + alpha * (reward
                           + discount * self.compute_max_q_value(new_game_state)
                           - self.q_table[current_game_state][action]))

            current_game_state = new_game_state

    save_pickle('./q_table', self.q_table, True)
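# --- Illustrative only: not part of the agent above. ---
# A self-contained sketch of the same tabular update rule used in train(),
# Q(s, a) <- Q(s, a) + alpha * (r + discount * max_a' Q(s', a') - Q(s, a)),
# applied to a toy two-state table. All names here (toy_q_table, states
# 's0'/'s1', actions 'left'/'right') are made up for illustration.

def toy_q_update(q_table, state, action, reward, next_state, alpha=0.2, discount=0.8):
    max_next = max(q_table[next_state].values())
    q_table[state][action] += alpha * (reward + discount * max_next - q_table[state][action])


toy_q_table = {
    's0': {'left': 0.0, 'right': 0.0},
    's1': {'left': 0.0, 'right': 0.0},
}
toy_q_update(toy_q_table, 's0', 'right', reward=10, next_state='s1')
print(toy_q_table['s0']['right'])  # 2.0 = 0.2 * (10 + 0.8 * 0.0 - 0.0)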
def pick_action(self, game_state):
    exploration_prob = 0.20
    if exploration_prob > np.random.rand():
        # Explore
        return np.random.choice(Action.get_all_actions())
    else:
        # Exploit
        return self.pick_optimal_action(game_state)
def pick_action(self, game_state, i):
    # Decaying exploration rate: starts above 1.0 (always explore) and
    # falls off slowly with the episode index i.
    exploration_prob = 4 / ((i + 1) ** (1 / 2.2))
    if exploration_prob > np.random.rand():
        # Explore
        return np.random.choice(Action.get_all_actions())
    else:
        # Exploit
        return self.pick_optimal_action(game_state)
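# --- Illustrative only. ---
# How the decay schedule above behaves: exploration_prob = 4 / (i + 1) ** (1 / 2.2)
# stays above 1.0 (so the agent always explores) for roughly the first 20
# episodes, then decays slowly.
for i in [0, 10, 100, 1000, 10000]:
    print(i, round(4 / ((i + 1) ** (1 / 2.2)), 3))
# roughly: 4.0, 1.35, 0.49, 0.17, 0.06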
def pick_action(game_state):
    # TODO: Epsilon greedy
    # With exploration_prob fixed at 1.0 the agent always explores;
    # the exploit branch below is never reached.
    exploration_prob = 1.0
    if exploration_prob > np.random.rand():
        # Explore
        return np.random.choice(Action.get_all_actions())
    else:
        # Exploit
        print("Exploit")
def compute_max_q_value(self, state):
    """
    Returns max_action Q(state, action) where the max is over legal actions.
    Note that if there are no legal actions, which is the case at the
    terminal state, you should return a value of 0.0.
    """
    if state not in self.q_table:
        self.q_table[state] = {key: 0.0 for key in Action.get_all_actions()}
    return max(self.q_table[state].values())
def __init__(self):
    self.rewards = []  # Fix reward structure
    self.states = []  # Fix state structure

    self.model = Sequential()
    # NOTE: Conv2D expects (height, width, channels); a single channel is
    # assumed here for the 7x20 grid.
    self.model.add(
        Conv2D(filters=32,
               kernel_size=[4, 4],
               input_shape=(7, 20, 1),
               activation='relu'))  # TODO: Get shape dynamically
    # TODO: Batch Normalization?
    self.model.add(Flatten())
    self.model.add(Dense(512))

    output_layer_length = len(Action.get_all_actions())
    self.model.add(Dense(output_layer_length))  # Output layer

    self.model.compile(
        optimizer='adam',
        loss='mse',
    )
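# --- Illustrative only: an assumption about the input pipeline, not code from
# the project. If the 7x20 board is fed to the Conv2D layer above, it has to be
# reshaped to (batch, height, width, channels) first; a hypothetical helper
# for this model could look like:
import numpy as np

def convert_board_to_conv_input(board_array):
    # board_array: a 7x20 numpy array encoding the level grid.
    return np.asarray(board_array, dtype=np.float32).reshape(1, 7, 20, 1)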
def train(self, level, num_training_episodes, batch_size, gamma=0.9):
    initial_game_state = initialize_gamestate_from_file(level)
    tot_loss = {}
    memory = Memory(max_size=5000)

    for i in range(1, num_training_episodes):
        loss = 0.
        num_episode_steps = 0
        done = False
        current_game_state = deepcopy(initial_game_state)

        while not done:
            if num_episode_steps > 1000:
                break

            action = self.pick_action(current_game_state, i)
            next_game_state, action_event = get_next_game_state_from_action(
                current_game_state, action.name)

            if action_event == ActionEvent.WON or action_event == ActionEvent.LOST:
                done = True
                if action_event == ActionEvent.WON:
                    print("Won!!")
                else:
                    print('lost')

            reward = calculate_reward_for_move(action_event)

            experience = Experience(
                current_state=self.convert_state_to_input(current_game_state),
                action=action,
                reward=reward,
                next_state=self.convert_state_to_input(next_game_state),
                done=done)
            memory.add(experience)

            batch = memory.get_mini_batch(batch_size=batch_size)

            # Dimensions of our observed states, i.e. the input to our model.
            input_dim = batch[0].current_state.shape[1]

            x_train = np.zeros((min(memory.get_size(), batch_size), input_dim))
            y_train = np.zeros(
                (x_train.shape[0], len(Action.get_all_actions())))  # Target Q-values

            sample: Experience
            for j, sample in enumerate(batch):
                y_target = self.model.predict(sample.current_state)[0]
                x_train[j:j + 1] = sample.current_state

                if sample.done:
                    y_target[sample.action.value] = sample.reward
                else:
                    y_target[sample.action.value] = sample.reward + gamma * np.max(
                        self.model.predict(sample.next_state))

                y_train[j] = y_target

            batch_loss = self.model.train_on_batch(x_train, np.asarray(y_train))
            loss += batch_loss

            num_episode_steps += 1
            current_game_state = deepcopy(next_game_state)

        print(i)
        print(loss / num_episode_steps)
        tot_loss[i] = (loss / num_episode_steps)

        if i % 500 == 0:
            self.model.save('./nn_model' + str(i) + '.h5')

    print(tot_loss)
    # plot_training_history(tot_loss)
    self.model.save('./nn_model.h5')
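# --- Illustrative only: the real Experience/Memory implementations are not
# shown in this file. A minimal sketch that matches the interface used in
# train() above (Memory(max_size=...), add(), get_mini_batch(batch_size=...),
# get_size()) could look like this:
import random
from collections import deque, namedtuple

Experience = namedtuple(
    'Experience', ['current_state', 'action', 'reward', 'next_state', 'done'])


class Memory:

    def __init__(self, max_size):
        # deque drops the oldest experience once max_size is reached.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def get_size(self):
        return len(self.buffer)

    def get_mini_batch(self, batch_size):
        # Sample without replacement; return everything if the buffer is
        # still smaller than batch_size.
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))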
def pick_optimal_action(self, state):
    # Greedy action: pick the action with the highest predicted Q-value.
    q = self.model.predict(self.convert_state_to_input(state))
    return Action.get_all_actions()[np.argmax(q[0])]
def __init__(self, level):
    initial_game_state = initialize_gamestate_from_file(level)
    self.input_size = self.convert_state_to_input(initial_game_state).size
    self.num_actions = len(Action.get_all_actions())
    self.model = self.init_model()
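# --- Illustrative only: init_model() and convert_state_to_input() are
# referenced above but not shown here. Given that train() builds a 2-D x_train
# of shape (batch, input_dim), one consistent (hypothetical) setup is a
# flattened board fed to a small dense network:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

def example_init_model(input_size, num_actions):
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(input_size,)))
    model.add(Dense(num_actions))  # One Q-value per action
    model.compile(optimizer='adam', loss='mse')
    return model

def example_convert_state_to_input(game_state_array):
    # Flatten whatever grid encoding the game state uses into a (1, input_size) row.
    return np.asarray(game_state_array, dtype=np.float32).reshape(1, -1)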