import numpy as np
import numpy.random as npr

# BaseLearner (assumed to provide last_state, last_action, last_reward,
# epoch, and explorer) and QNetwork are defined elsewhere in the project.


class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        # No previous transition to learn from yet, so act randomly.
        if self.last_state is None:
            return npr.choice(possible_moves)

        # Compute Q-scores with forward-propagation.
        list_Qscore = []
        for move in possible_moves:
            list_Qscore.append(self.network.use_model(new_state, move))

        # Get the best Q-score from the current state; it is the bootstrap
        # term of the TD target r + gamma * max_a' Q(s', a').
        best_Qscore = max(list_Qscore)
        # best_move = possible_moves[list_Qscore.index(best_Qscore)]

        # Update weights with back-propagation.
        self.network.update_model(self.last_state, self.last_action,
                                  float(self.last_reward +
                                        self.discount_rate * best_Qscore))

        return possible_moves[self.explorer.decide_action(
            self.epoch, np.asarray(list_Qscore))]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)
        # Save the network weights every 1000 epochs.
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())
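The QNetwork class referenced above is not part of this listing. The sketch below is a minimal stand-in, assuming only the interface the learner actually calls (use_model, update_model, get_all_weights); the linear per-move model and the learning rate are illustrative assumptions, not the project's real network.

import numpy as np

# Minimal stand-in for QNetwork: one linear model per move (an assumption;
# the real project presumably uses a multi-layer network). Only the method
# names and signatures are taken from the listing above.
class QNetwork(object):
    def __init__(self, n_inputs=16, n_moves=4, learning_rate=0.001):
        self.learning_rate = learning_rate
        self.w = np.zeros((n_moves, n_inputs))  # one weight row per move

    def use_model(self, state, move):
        # Forward pass: Q(s, a) as a dot product.
        return float(np.dot(self.w[move], np.asarray(state, dtype=float)))

    def update_model(self, state, move, target):
        # One gradient step on the squared TD error (Q(s, a) - target)^2.
        x = np.asarray(state, dtype=float)
        error = np.dot(self.w[move], x) - target
        self.w[move] -= self.learning_rate * error * x

    def get_all_weights(self):
        return self.w.copy()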
# Variant with experience replay: transitions are stored in a bounded
# database and the network trains on uniformly sampled past transitions.
class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.database = ExperienceDatabase(100000)
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        # Compute Q-scores with forward-propagation.
        scores = np.zeros(len(possible_moves))
        for i, move in enumerate(possible_moves):
            scores[i] = self.network.use_model(new_state, move)

        # Train the neural network on a transition sampled from the replay
        # database.
        if self.last_state is not None:
            self.database.add(self.last_state, self.last_action,
                              self.last_reward, new_state, possible_moves)
            (past_last_state, past_last_action, past_last_reward,
             past_new_state, past_possible_moves) = self.database.sample(1)[0]

            # Start from -inf: max(None, x) raises TypeError in Python 3.
            best_score = float("-inf")
            for move in past_possible_moves:
                best_score = max(best_score,
                                 self.network.use_model(past_new_state, move))

            # Update weights with back-propagation.
            self.network.update_model(past_last_state, past_last_action,
                                      float(past_last_reward +
                                            self.discount_rate * best_score))

        return possible_moves[self.explorer.decide_action(self.epoch, scores)]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)
        # Save the network weights every 1000 epochs.
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())
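ExperienceDatabase is likewise external to the listing. Below is a minimal sketch consistent with how it is called above: the constructor argument caps the number of stored transitions, add() appends one transition, and sample(n) draws n transitions uniformly at random. The deque-backed implementation is an assumption.

import random
from collections import deque

# Minimal replay buffer matching the calls in the listing above.
class ExperienceDatabase(object):
    def __init__(self, capacity):
        # Bounded buffer: once full, the oldest transitions are evicted.
        self.buffer = deque(maxlen=capacity)

    def add(self, last_state, last_action, last_reward,
            new_state, possible_moves):
        self.buffer.append((last_state, last_action, last_reward,
                            new_state, possible_moves))

    def sample(self, n):
        # Uniform sampling without replacement.
        return random.sample(self.buffer, n)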
# Variant that encodes the move as a board rotation: the network scores a
# single board, so each candidate move is presented as a canonically
# oriented copy of the board.
class DeepQLearner(BaseLearner):
    def __init__(self):
        super(DeepQLearner, self).__init__()
        self.discount_rate = 0.95
        self.network = QNetwork()
        self.weights = []

    def decide_action(self, new_state, possible_moves):
        # No previous transition to learn from yet, so act randomly.
        if self.last_state is None:
            return npr.choice(possible_moves)

        valid_rotations = self.get_rotated_boards(new_state, possible_moves)

        # Compute Q-scores with forward-propagation, one rotated board per
        # candidate move.
        list_Qscore = []
        for some_state in valid_rotations:
            list_Qscore.append(self.network.use_model(some_state))

        # Get the best Q-score from the current state.
        best_Qscore = max(list_Qscore)
        # best_move = possible_moves[list_Qscore.index(best_Qscore)]

        # Update weights with back-propagation on the rotated previous board.
        self.network.update_model(
            self.get_rotated_boards(self.last_state, [self.last_action])[0],
            float(self.last_reward + self.discount_rate * best_Qscore))

        return possible_moves[self.explorer.decide_action(
            self.epoch, np.asarray(list_Qscore))]

    def end_epoch(self, score):
        super(DeepQLearner, self).end_epoch(score)
        # Save the network weights every 1000 epochs.
        if self.epoch % 1000 == 0:
            self.weights.append(self.network.get_all_weights())

    def get_rotated_boards(self, this_state, which_rotations):
        # Rotation codes in which_rotations:
        #   0: identity, 1: 180 degrees, 2: 90 degrees clockwise,
        #   3: 90 degrees counter-clockwise.
        # Move indices double as rotation codes, so each possible move is
        # shown to the network as the "left" move on a rotated copy of the
        # original board (the same convention is used for self.last_action).
        # Returns a list of rotated boards as tuples.
        index_maps = {
            0: list(range(16)),
            1: [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
            2: [12, 8, 4, 0, 13, 9, 5, 1, 14, 10, 6, 2, 15, 11, 7, 3],
            3: [3, 7, 11, 15, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12],
        }
        rotated_boards = []
        for move in which_rotations:
            rotated_boards.append(
                tuple(this_state[i] for i in index_maps[move]))
        return rotated_boards
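As a standalone sanity check (not part of the learner), the index maps above can be verified by composition: rotating 90 degrees clockwise and then 90 degrees counter-clockwise must return the original board, and two clockwise rotations must equal the 180-degree map.

# Standalone sanity check for the rotation index maps used above.
CW = [12, 8, 4, 0, 13, 9, 5, 1, 14, 10, 6, 2, 15, 11, 7, 3]
CCW = [3, 7, 11, 15, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12]
FLIP = [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

def rotate(board, index_map):
    return tuple(board[i] for i in index_map)

board = tuple(range(16))
assert rotate(rotate(board, CW), CCW) == board                # CW then CCW is the identity
assert rotate(rotate(board, CW), CW) == rotate(board, FLIP)   # two CW turns = 180 degrees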